def test_transfac_pssms(): transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) for p in transfac_pssms: print p, biopsy.get_transfac_pssm_name( p ) print 'Have', len( transfac_pssms ), 'transfac pssms' for acc in [ 'R19099', 'M00418' ]: print acc, biopsy.get_transfac_pssm_name( acc ) biopsy.get_pssm( acc ) print 'Under pssm' for under_pssm in biopsy.get_pssm( acc ).get_dist( True, False ): print under_pssm print 'Under background' for under_background in biopsy.get_pssm( acc ).get_dist( False, False ): print under_background
def test_transfac_pssms(): transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) for p in transfac_pssms: print p, biopsy.get_transfac_pssm_name(p) print 'Have', len(transfac_pssms), 'transfac pssms' for acc in ['R19099', 'M00418']: print acc, biopsy.get_transfac_pssm_name(acc) biopsy.get_pssm(acc) print 'Under pssm' for under_pssm in biopsy.get_pssm(acc).get_dist(True, False): print under_pssm print 'Under background' for under_background in biopsy.get_pssm(acc).get_dist(False, False): print under_background
def write_minimal_meme_matrix(out, acc):
    """
    Write the PSSM with accession ``acc`` to ``out`` as a minimal MEME motif.

    The minimal MEME format for a motif looks something like::

        MOTIF crp
        letter-probability matrix: alength= 4 w= 19 nsites= 17 E= 4.1e-009
        0.000000 0.176471 0.000000 0.823529
        0.000000 0.058824 0.647059 0.294118
        0.000000 0.058824 0.000000 0.941176
        0.176471 0.000000 0.764706 0.058824
        0.823529 0.058824 0.000000 0.117647
        0.294118 0.176471 0.176471 0.352941
        0.294118 0.352941 0.235294 0.117647
        0.117647 0.235294 0.352941 0.294118
        0.529412 0.000000 0.176471 0.294118
        0.058824 0.235294 0.588235 0.117647
        0.176471 0.235294 0.294118 0.294118
        0.000000 0.058824 0.117647 0.823529
        0.058824 0.882353 0.000000 0.058824
        0.764706 0.000000 0.176471 0.058824
        0.058824 0.882353 0.000000 0.058824
        0.823529 0.058824 0.058824 0.058824
        0.176471 0.411765 0.058824 0.352941
        0.411765 0.000000 0.000000 0.588235
        0.352941 0.058824 0.000000 0.588235

    The MOTIF line carries the PSSM's name followed by ``acc``; ``w`` is the
    number of entries in the PSSM's ``dists``, ``nsites`` is its ``sites``
    attribute and the E-value is always written as 0. Each matrix row holds
    ``get_freq(b)`` for ``b`` in 0..3 of one entry of ``dists``.
    """
    pssm_info = biopsy.get_pssm(acc)
    motif_name = biopsy.get_pssm_name(acc)
    width = len(pssm_info.dists)
    num_sites = pssm_info.sites

    rows = []
    for dist in pssm_info.dists:
        freqs = ["%.6f" % dist.get_freq(b) for b in xrange(4)]
        rows.append(' '.join(freqs))

    template = (
        "MOTIF %s %s\n"
        "letter-probability matrix: alength= 4 w= %d nsites= %d E= %e\n"
        "%s\n"
    )
    # The template already ends in a newline and print adds another, so the
    # motif is followed by an empty line.
    print >> out, template % (
        motif_name, acc, width, num_sites, 0., "\n".join(rows))
def look_for_matrices(names): for name in names: print name for matrix, factor in find_matrices(name): print matrix.acc, matrix.name, factor.acc, factor.name logo(dist_for_pssm(biopsy.get_pssm(str(matrix.acc))), '%s-%s' % (name, matrix.acc), 'logos')
def write_minimal_meme_matrix(out, acc):
    """
    Emit one motif in minimal MEME format for the PSSM with accession ``acc``.

    The minimal MEME format for a motif looks something like::

        MOTIF crp
        letter-probability matrix: alength= 4 w= 19 nsites= 17 E= 4.1e-009
        0.000000 0.176471 0.000000 0.823529
        0.000000 0.058824 0.647059 0.294118
        0.000000 0.058824 0.000000 0.941176
        0.176471 0.000000 0.764706 0.058824
        0.823529 0.058824 0.000000 0.117647
        0.294118 0.176471 0.176471 0.352941
        0.294118 0.352941 0.235294 0.117647
        0.117647 0.235294 0.352941 0.294118
        0.529412 0.000000 0.176471 0.294118
        0.058824 0.235294 0.588235 0.117647
        0.176471 0.235294 0.294118 0.294118
        0.000000 0.058824 0.117647 0.823529
        0.058824 0.882353 0.000000 0.058824
        0.764706 0.000000 0.176471 0.058824
        0.058824 0.882353 0.000000 0.058824
        0.823529 0.058824 0.058824 0.058824
        0.176471 0.411765 0.058824 0.352941
        0.411765 0.000000 0.000000 0.588235
        0.352941 0.058824 0.000000 0.588235

    ``out`` is the target of a ``print >>`` statement. The motif is labelled
    with the PSSM's name and ``acc``, its width is ``len(dists)``, its site
    count is the PSSM's ``sites`` attribute and its E-value is fixed at 0.
    """
    pssm_info = biopsy.get_pssm(acc)

    # Header fields, gathered in the order they appear in the output.
    name = biopsy.get_pssm_name(acc)
    num_columns = len(pssm_info.dists)
    sites = pssm_info.sites
    e_value = 0.0

    # One text line per entry of dists: its four frequencies to 6 places.
    matrix_lines = [
        " ".join(["%.6f" % column.get_freq(base) for base in xrange(4)])
        for column in pssm_info.dists
    ]

    motif_format = (
        "MOTIF %s %s\n"
        "letter-probability matrix: alength= 4 w= %d nsites= %d E= %e\n"
        "%s\n"
    )
    motif_text = motif_format % (
        name, acc, num_columns, sites, e_value, "\n".join(matrix_lines))
    print >> out, motif_text
def test_pssm_pseudo_counts(): sascha_pssms = biopsy.SequenceVec() sascha_acc = 'M00975' # sascha_seq = 'gtaaaccaggctgcctGAgaacttgttgcgaatcc' sascha_seq = 'ttgttgcga' sascha_seq = 'ttgttgcaa' # plot_likelihoods( biopsy.get_pssm( 'M00975' ), 'M00975' ) # plot_likelihoods( biopsy.get_pssm( 'R02146' ), 'R02146' ) print 'Binding,Background,odds,p(binding),cumulative p(binding),Sequence' biopsy.PssmParameters.singleton().use_p_value = True; # biopsy.PssmParameters.singleton().binding_background_odds_prior = 1; for pc in [ 0.0, 0.25, 0.5, 1.0, 2.0 ]: # force cache load biopsy.get_pssm( sascha_acc ) biopsy.clear_pssm_cache() biopsy.PssmParameters.singleton().pseudo_counts = pc p = biopsy.get_pssm( sascha_acc ) score = biopsy.score_pssm( p.pssm, sascha_seq ) ( bind, back, cum_bind, cum_back, odds_ratio, cum_odds_ratio, p_bind, cum_p_bind, p_value_p_bind ) = biopsy.get_pssm_likelihoods_for_score( p, score ) print pc, print \ '%f,%f,%f,%f,%f,%f,%f' \ % \ ( bind, back, cum_bind, cum_back, p_bind, cum_p_bind, p_value_p_bind ) biopsy.plot_likelihoods( p, sascha_acc + ': ' + str( pc ), score ) # print 'Trying with standard distributions' # biopsy.PssmParameters.singleton().use_cumulative_dists = False; # hits = biopsy.HitVec() # biopsy.score_pssm_on_sequence( sascha_acc, sascha_seq, 0.001, hits ) # print hits print 'Trying with cumulative distributions' biopsy.PssmParameters.singleton().use_cumulative_dists = True; hits = biopsy.HitVec() biopsy.score_pssm_on_sequence( sascha_acc, sascha_seq, 0.001, hits ) print hits print
def test_likelihoods_indices(): p = biopsy.get_pssm( 'M00975' ) dist = p.get_dist( True, False ) for s in range( len(dist) ): score = float(s)/float(len(dist) - 1) idx = biopsy.get_likelihood_index( len(dist), score ) print idx, score for score in [ 0.98, 0.99, 1.0 ]: print score, biopsy.get_likelihood_index( len(dist), score )
def test_likelihoods_indices(): p = biopsy.get_pssm('M00975') dist = p.get_dist(True, False) for s in range(len(dist)): score = float(s) / float(len(dist) - 1) idx = biopsy.get_likelihood_index(len(dist), score) print idx, score for score in [0.98, 0.99, 1.0]: print score, biopsy.get_likelihood_index(len(dist), score)
def test_pssm_pseudo_counts(): sascha_pssms = biopsy.SequenceVec() sascha_acc = 'M00975' # sascha_seq = 'gtaaaccaggctgcctGAgaacttgttgcgaatcc' sascha_seq = 'ttgttgcga' sascha_seq = 'ttgttgcaa' # plot_likelihoods( biopsy.get_pssm( 'M00975' ), 'M00975' ) # plot_likelihoods( biopsy.get_pssm( 'R02146' ), 'R02146' ) print 'Binding,Background,odds,p(binding),cumulative p(binding),Sequence' biopsy.PssmParameters.singleton().use_p_value = True # biopsy.PssmParameters.singleton().binding_background_odds_prior = 1; for pc in [0.0, 0.25, 0.5, 1.0, 2.0]: # force cache load biopsy.get_pssm(sascha_acc) biopsy.clear_pssm_cache() biopsy.PssmParameters.singleton().pseudo_counts = pc p = biopsy.get_pssm(sascha_acc) score = biopsy.score_pssm(p.pssm, sascha_seq) (bind, back, cum_bind, cum_back, odds_ratio, cum_odds_ratio, p_bind, cum_p_bind, p_value_p_bind) = biopsy.get_pssm_likelihoods_for_score(p, score) print pc, print \ '%f,%f,%f,%f,%f,%f,%f' \ % \ ( bind, back, cum_bind, cum_back, p_bind, cum_p_bind, p_value_p_bind ) biopsy.plot_likelihoods(p, sascha_acc + ': ' + str(pc), score) # print 'Trying with standard distributions' # biopsy.PssmParameters.singleton().use_cumulative_dists = False; # hits = biopsy.HitVec() # biopsy.score_pssm_on_sequence( sascha_acc, sascha_seq, 0.001, hits ) # print hits print 'Trying with cumulative distributions' biopsy.PssmParameters.singleton().use_cumulative_dists = True hits = biopsy.HitVec() biopsy.score_pssm_on_sequence(sascha_acc, sascha_seq, 0.001, hits) print hits print
def test_pssm_score(): # 'V$AP1_Q2' pssm_acc = biopsy.get_transfac_pssm_accession('V$DEAF1_01') pssm_info = biopsy.get_pssm(pssm_acc) # print pssm_info.pssm seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' for i in range(len(seq) - len(pssm_info.pssm) + 1): s = biopsy.score_pssm(pssm_info.pssm, seq[i:]) p_binding = biopsy.get_p_binding( biopsy.get_odds_ratio(s, pssm_info.get_dist(True, False), pssm_info.get_dist(False, False))) if p_binding > 0.05: print i, s, p_binding result = biopsy.HitVec() p_binding = biopsy.score_pssm_on_sequence(pssm_acc, seq, 0.05, result) print 'Got', len(result), 'hits from', len(seq), 'bases' print p_binding
def test_pssm_score(): # 'V$AP1_Q2' pssm_acc = biopsy.get_transfac_pssm_accession( 'V$DEAF1_01' ); pssm_info = biopsy.get_pssm( pssm_acc ) # print pssm_info.pssm seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' for i in range( len( seq ) - len( pssm_info.pssm ) + 1 ): s = biopsy.score_pssm( pssm_info.pssm, seq[i:] ) p_binding = biopsy.get_p_binding( biopsy.get_odds_ratio( s, pssm_info.get_dist( True, False ), pssm_info.get_dist( False, False ) ) ) if p_binding > 0.05: print i, s, p_binding result = biopsy.HitVec() p_binding = biopsy.score_pssm_on_sequence( pssm_acc, seq, 0.05, result ) print 'Got', len( result ), 'hits from', len( seq ), 'bases' print p_binding
def look_for_matrices(names): for name in names: print name for matrix, factor in find_matrices(name): print matrix.acc, matrix.name, factor.acc, factor.name logo(dist_for_pssm(biopsy.get_pssm(str(matrix.acc))), "%s-%s" % (name, matrix.acc), "logos")
def logo_for_pssm_name(pssm_name):
    """
    Fetch the PSSM for ``pssm_name`` and hand it to ``logo_for_pssm``.

    ``pssm_name`` is passed to ``biopsy.get_pssm`` for the lookup and again
    as the second argument of ``logo_for_pssm``. Nothing is returned.
    """
    # Imported inside the function rather than at module load.
    import biopsy

    pssm_info = biopsy.get_pssm(pssm_name)
    logo_for_pssm(pssm_info, pssm_name)