def euclidean_distance(sites_a, sites_b):
    """Return the summed column-wise Euclidean distance of two site sets.

    A motif is built from each collection of sites with
    ``bioutils.build_motif``.  The columns yielded by the two motifs'
    ``pwm()`` are paired in order (``zip`` stops at the shorter motif) and
    the Euclidean distance over the bases A, C, G, T of every pair is
    added up.
    """
    motif_a = bioutils.build_motif(sites_a)
    motif_b = bioutils.build_motif(sites_b)

    total = 0
    for col_a, col_b in zip(motif_a.pwm(), motif_b.pwm()):
        squared_diffs = [(col_a[base] - col_b[base]) ** 2 for base in "ACGT"]
        total += math.sqrt(sum(squared_diffs))
    return total
def euclidean_distance(sites_a, sites_b):
    """Sum, over aligned columns, the Euclidean distance of two motifs.

    Both arguments are collections of sites handed to
    ``bioutils.build_motif``.  Columns of the resulting motifs (as returned
    by ``pwm()``) are compared pairwise in order; columns beyond the
    length of the shorter motif are ignored because ``zip`` truncates.
    """
    def column_distance(pair):
        # Distance between two columns indexed by base letter.
        left, right = pair
        return math.sqrt(
            sum((left[base] - right[base]) ** 2 for base in "ACGT"))

    first = bioutils.build_motif(sites_a)
    second = bioutils.build_motif(sites_b)
    column_pairs = zip(first.pwm(), second.pwm())
    return sum(column_distance(pair) for pair in column_pairs)
def pearson_correlation_coefficient(sites_a, sites_b):
    """Return the summed column-wise Pearson correlation of two site sets.

    A motif is built from each collection of sites with
    ``bioutils.build_motif``.  Columns yielded by the motifs' ``pwm()`` are
    paired in order (``zip`` stops at the shorter motif) and the Pearson
    correlation coefficient over the four bases of each pair is summed.

    The coefficient is undefined when either column has zero variance
    across the four bases (for instance a perfectly uniform column).  Such
    a pair contributes 0.0 to the total instead of raising
    ``ZeroDivisionError``.
    """
    bases = "ACTG"

    def pcc(cola, colb):
        cola_avg = sum(cola[l] for l in bases) / 4.0
        colb_avg = sum(colb[l] for l in bases) / 4.0
        dev_a = [cola[l] - cola_avg for l in bases]
        dev_b = [colb[l] - colb_avg for l in bases]
        denominator = math.sqrt(sum(d ** 2 for d in dev_a) *
                                sum(d ** 2 for d in dev_b))
        if denominator == 0:
            # Zero variance in at least one column: correlation undefined,
            # treat the pair as uncorrelated rather than crashing.
            return 0.0
        return sum(x * y for (x, y) in zip(dev_a, dev_b)) / denominator

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    return sum(pcc(cola, colb) for (cola, colb) in zip(ma.pwm(), mb.pwm()))
def kullback_leibler_divergence(sites_a, sites_b):
    """Return the summed, symmetrised KL divergence of two site sets.

    A motif is built from each collection of sites with
    ``bioutils.build_motif``.  For every pair of aligned columns (from
    ``pwm()``; ``zip`` stops at the shorter motif) the divergence is
    computed in both directions in base 2 and averaged; the per-column
    averages are then added up.

    A ratio equal to zero contributes a log term of 0.0.
    NOTE(review): a zero value in the *divisor* column is not guarded and
    would raise on division -- confirm that ``pwm()`` never yields zeros.
    """
    def log2_or_zero(value):
        if value == 0:
            return 0.0
        return math.log(value, 2)

    def column_divergence(col_a, col_b):
        a_vs_b = sum(col_a[base] * log2_or_zero(col_a[base] / col_b[base])
                     for base in "ACTG")
        b_vs_a = sum(col_b[base] * log2_or_zero(col_b[base] / col_a[base])
                     for base in "ACTG")
        return (a_vs_b + b_vs_a) / 2.0

    motif_a = bioutils.build_motif(sites_a)
    motif_b = bioutils.build_motif(sites_b)

    total = 0
    for col_a, col_b in zip(motif_a.pwm(), motif_b.pwm()):
        total += column_divergence(col_a, col_b)
    return total
def pearson_correlation_coefficient(sites_a, sites_b):
    """Sum, over aligned columns, the Pearson correlation of two motifs.

    Both arguments are collections of sites passed to
    ``bioutils.build_motif``.  The motifs' ``pwm()`` columns are paired in
    order (``zip`` truncates to the shorter motif); for each pair the
    Pearson correlation coefficient across the four bases is computed and
    the coefficients are added together.

    When either column of a pair has no variance across the bases (e.g.
    all four values equal) the coefficient is mathematically undefined;
    that pair adds 0.0 to the result rather than raising
    ``ZeroDivisionError``.
    """
    def pcc(cola, colb):
        cola_avg = sum(cola[l] for l in "ACTG") / 4.0
        colb_avg = sum(colb[l] for l in "ACTG") / 4.0
        covariance = sum((cola[l] - cola_avg) * (colb[l] - colb_avg)
                         for l in "ACTG")
        spread_a = sum((cola[l] - cola_avg) ** 2 for l in "ACTG")
        spread_b = sum((colb[l] - colb_avg) ** 2 for l in "ACTG")
        scale = math.sqrt(spread_a * spread_b)
        # Guard the undefined case (a constant column) explicitly.
        if scale == 0:
            return 0.0
        return covariance / scale

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    return sum(pcc(cola, colb) for (cola, colb) in zip(ma.pwm(), mb.pwm()))
def kullback_leibler_divergence(sites_a, sites_b):
    """Sum, over aligned columns, the averaged two-way KL divergence.

    Motifs are built from both site collections with
    ``bioutils.build_motif`` and their ``pwm()`` columns are paired in
    order (``zip`` truncates to the shorter motif).  Each pair contributes
    the mean of the base-2 divergence taken in both directions.

    A ratio of exactly zero yields a log term of 0.0.
    NOTE(review): division by a zero entry of the other column is not
    guarded here -- verify that ``pwm()`` cannot produce zeros.
    """
    def safe_log2(ratio):
        return 0.0 if ratio == 0 else math.log(ratio, 2)

    def directed(source, target):
        # One-directional divergence of ``source`` relative to ``target``.
        return sum(source[base] * safe_log2(source[base] / target[base])
                   for base in "ACTG")

    def symmetric(col_a, col_b):
        forward = directed(col_a, col_b)
        backward = directed(col_b, col_a)
        return (forward + backward) / 2.0

    first = bioutils.build_motif(sites_a)
    second = bioutils.build_motif(sites_b)
    column_pairs = zip(first.pwm(), second.pwm())
    return sum(symmetric(col_a, col_b) for (col_a, col_b) in column_pairs)
def average_log_likelihood_ratio(sites_a, sites_b):
    """Return the summed column-wise average log-likelihood ratio (ALLR).

    A motif is built from each collection of sites with
    ``bioutils.build_motif``.  For every pair of aligned columns the counts
    of one motif are weighted by the base-2 log of the other motif's
    frequency divided by 0.25 (presumably a uniform background -- confirm),
    the two weighted sums are added, and the result is divided by the total
    number of counts in both columns.  The per-column values are summed;
    ``zip`` stops at the shorter motif.

    A frequency of exactly zero contributes a log term of 0.0.
    """
    bases = "ACTG"

    def safe_log2(x):
        return math.log(x, 2) if x != 0 else 0.0

    def allr(cola, colb, cnta, cntb):
        weighted = sum((cnta[l] * safe_log2(colb[l] / 0.25) +
                        cntb[l] * safe_log2(cola[l] / 0.25))
                       for l in bases)
        return weighted / sum(cnta[l] + cntb[l] for l in bases)

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    # Reformat the count matrices into one {base: count} dict per column.
    # ``range`` (not ``xrange``) keeps this working on Python 2 and 3.
    counts_a = [dict((l, ma.counts[l][i]) for l in bases)
                for i in range(ma.length)]
    counts_b = [dict((l, mb.counts[l][i]) for l in bases)
                for i in range(mb.length)]
    return sum(allr(cola, colb, cnta, cntb)
               for (cola, colb, cnta, cntb)
               in zip(ma.pwm(), mb.pwm(), counts_a, counts_b))
def average_log_likelihood_ratio(sites_a, sites_b):
    """Sum, over aligned columns, the average log-likelihood ratio (ALLR).

    Motifs are built from both site collections with
    ``bioutils.build_motif``.  Each aligned column pair contributes::

        sum_b(cnt_a[b] * log2(f_b[b] / 0.25) + cnt_b[b] * log2(f_a[b] / 0.25))
        ----------------------------------------------------------------------
                           sum_b(cnt_a[b] + cnt_b[b])

    where ``cnt`` comes from the motif's ``counts`` and ``f`` from its
    ``pwm()``.  The constant 0.25 is presumably a uniform background
    frequency (confirm).  A frequency of exactly zero yields a log term of
    0.0.  ``zip`` truncates to the shorter motif.
    """
    bases = "ACTG"

    def safe_log2(x):
        return math.log(x, 2) if x != 0 else 0.0

    def column_counts(motif):
        # One {base: count} dict per motif position.  Uses ``range`` rather
        # than the Python-2-only ``xrange`` so it runs on Python 2 and 3.
        return [dict((l, motif.counts[l][i]) for l in bases)
                for i in range(motif.length)]

    def allr(cola, colb, cnta, cntb):
        numerator = sum((cnta[l] * safe_log2(colb[l] / 0.25) +
                         cntb[l] * safe_log2(cola[l] / 0.25))
                        for l in bases)
        denominator = sum(cnta[l] + cntb[l] for l in bases)
        return numerator / denominator

    ma = bioutils.build_motif(sites_a)
    mb = bioutils.build_motif(sites_b)
    counts_a = column_counts(ma)
    counts_b = column_counts(mb)
    return sum(allr(cola, colb, cnta, cntb)
               for (cola, colb, cnta, cntb)
               in zip(ma.pwm(), mb.pwm(), counts_a, counts_b))
def export_PSFM(meta_sites, **kwargs):
    """Render the motif of ``meta_sites`` as a position-specific frequency
    matrix in the requested text format.

    ``kwargs['format']`` is required (a missing key raises ``KeyError``)
    and selects the layout: ``'JASPAR'``, ``'TRANSFAC'`` or
    ``'raw_fasta'``.  Any other value produces an empty string.

    The site instance of the first element of every meta-site is aligned
    with ``bioutils.run_lasagna`` and turned into a motif with
    ``bioutils.build_motif``.  The header uses the comma-joined set of TF
    names and of organism names (whitespace replaced by underscores) taken
    from the rows returned by ``export_base``.

    Returns the output lines joined with newlines.
    """
    fmt = kwargs['format']
    rows = export_base(meta_sites)
    instances = [meta_site[0].site_instance for meta_site in meta_sites]
    aligned = bioutils.run_lasagna(instances)
    motif = bioutils.build_motif(aligned)
    consensus = bioutils.degenerate_consensus(motif)

    tf_names = set(row['curation__TF__name'] for row in rows)
    TF_name = ','.join(tf_names)
    organisms = set('_'.join(row['site_instance__genome__organism'].split())
                    for row in rows)
    sp = ','.join(organisms)

    def counts_at(po):
        # Counts of A, C, G, T (in that order) at position ``po``.
        return (motif.counts['A'][po], motif.counts['C'][po],
                motif.counts['G'][po], motif.counts['T'][po])

    def jaspar_lines():
        out = ['> CollecTF_%s_%s' % (TF_name, sp)]
        templates = (('A', 'A [ %s ]'), ('C', 'C [ %s ]'),
                     ('G', 'G [ %s ]'), ('T', 'T [ %s ]'))
        for base, template in templates:
            out.append(template % ' '.join(map(str, motif.counts[base])))
        return out

    def transfac_lines():
        out = ['ID %s' % TF_name, 'BF %s' % sp, 'PO\tA\tC\tG\tT']
        for po in range(motif.length):
            fields = (po + 1,) + counts_at(po) + (consensus[po],)
            out.append('%02d\t%d\t%d\t%d\t%d\t%s' % fields)
        out.append('XX')
        return out

    def raw_fasta_lines():
        out = ['>CollecTF_%s_%s' % (TF_name, sp)]
        for po in range(motif.length):
            out.append('%d\t%d\t%d\t%d' % counts_at(po))
        return out

    if fmt == 'JASPAR':
        lines = jaspar_lines()
    elif fmt == 'TRANSFAC':
        lines = transfac_lines()
    elif fmt == 'raw_fasta':
        lines = raw_fasta_lines()
    else:
        # Unrecognised format: nothing is emitted.
        lines = []
    return '\n'.join(lines)
def export_PSFM(meta_sites, **kwargs):
    """Export a position-specific frequency matrix for ``meta_sites``.

    The keyword argument ``format`` is mandatory (``KeyError`` if absent)
    and must be ``'JASPAR'``, ``'TRANSFAC'`` or ``'raw_fasta'``; for any
    other value the returned string is empty.

    Site instances (first element of each meta-site) are aligned through
    ``bioutils.run_lasagna`` and a motif is built from the alignment with
    ``bioutils.build_motif``.  TF names and organism names for the header
    are collected from the rows returned by ``export_base``; duplicates are
    removed and organism names have whitespace replaced by underscores.

    Returns the generated lines joined by ``'\\n'``.
    """
    output_format = kwargs['format']
    rows = export_base(meta_sites)
    aligned = bioutils.run_lasagna(
        [entry[0].site_instance for entry in meta_sites])
    motif = bioutils.build_motif(aligned)
    consensus = bioutils.degenerate_consensus(motif)
    TF_name = ','.join(set(row['curation__TF__name'] for row in rows))
    sp = ','.join(
        set('_'.join(row['site_instance__genome__organism'].split())
            for row in rows))

    lines = []
    if output_format == 'JASPAR':
        # One row per base holding the counts of every position.
        lines.append('> CollecTF_%s_%s' % (TF_name, sp))
        lines.append('A [ %s ]' % ' '.join(str(c) for c in motif.counts['A']))
        lines.append('C [ %s ]' % ' '.join(str(c) for c in motif.counts['C']))
        lines.append('G [ %s ]' % ' '.join(str(c) for c in motif.counts['G']))
        lines.append('T [ %s ]' % ' '.join(str(c) for c in motif.counts['T']))
    elif output_format == 'TRANSFAC':
        # One row per position: 1-based index, four counts, consensus.
        lines.append('ID %s' % TF_name)
        lines.append('BF %s' % sp)
        lines.append('PO\tA\tC\tG\tT')
        for po in range(motif.length):
            record = (po + 1,
                      motif.counts['A'][po],
                      motif.counts['C'][po],
                      motif.counts['G'][po],
                      motif.counts['T'][po],
                      consensus[po])
            lines.append('%02d\t%d\t%d\t%d\t%d\t%s' % record)
        lines.append('XX')
    elif output_format == 'raw_fasta':
        # Header followed by one tab-separated count row per position.
        lines.append('>CollecTF_%s_%s' % (TF_name, sp))
        for po in range(motif.length):
            record = (motif.counts['A'][po],
                      motif.counts['C'][po],
                      motif.counts['G'][po],
                      motif.counts['T'][po])
            lines.append('%d\t%d\t%d\t%d' % record)
    return '\n'.join(lines)