def score(self, test_case):
    """Return the hits found when scoring a test case's PSSMs on its sequence.

    ``test_case`` is unpacked as a 3-tuple ``(record, seq, pssms)``; only the
    last two items are used. The PSSMs are wrapped with ``make_sequence_vec``
    and scored against ``str(seq.seq)`` using ``self.min_threshold`` as the
    threshold.
    """
    _record, seq_record, pssm_names = test_case
    pssm_vec = make_sequence_vec(pssm_names)
    sequence_text = str(seq_record.seq)
    return B.score_pssms_on_sequence(
        pssm_vec,
        sequence_text,
        threshold=self.min_threshold)
def test_pssm_distributions(): pssm_acc = 'M00750' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) print 'Got', len(transfac_pssms), 'pssms' # initialise the distributions score_dists = {} for tp in transfac_pssms: score_dists[tp] = {} for k in range(1, 100): score_dists[tp][k] = 0 score_dists['all'] = {} for k in range(1, 100): score_dists['all'][k] = 0 # parse the mouse chromosome, score the pssms and fill in the distributions bases = 0 seq = '' start = time.clock() for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r'): if line.startswith('>'): continue seq += line.strip('\r\n').replace('N', '') # Take 1kb at a time if len(seq) >= 1000: hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.0) for h in hits: bin = int(100.0 * h.p_binding) if 0 != bin: # print h.binder, bin score_dists[h.binder][bin] += 1 score_dists['all'][bin] += 1 bases += len(seq) seq = '' print 'Bases:', bases if bases >= 2800000: break elapsed = time.clock() - start print 'Scored', len( transfac_pssms), 'pssms on', bases, 'bases in', elapsed, 'seconds' print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24 print 'Estimate for # bases/hour:', bases * 3600 / elapsed # remember scores for later f = open('pssm_p_binding_dists.txt', 'w') print 'Writing pssm p(binding) distributions to:', f pickle.dump(score_dists, f) f.close()
def test_score_pssms(): # 'V$AP1_Q2' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) print 'Got', len(transfac_pssms), 'pssms' seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' # seq = 'acatcat' # seq = 'gat' # hits = biopsy.HitVec() hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.05) print hits print 'score_pssm_on_sequence: Got', len(hits), 'hits from', len( seq), 'bases' hits = biopsy.analyse(seq, 0.05) # print hits print 'analyse: Got', len(hits), 'hits from', len(seq), 'bases'
def test_pssm_distributions(): pssm_acc = 'M00750' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) print 'Got', len( transfac_pssms ), 'pssms' # initialise the distributions score_dists = { } for tp in transfac_pssms: score_dists[ tp ] = { } for k in range(1,100): score_dists[ tp ][ k ] = 0 score_dists[ 'all' ] = { } for k in range(1,100): score_dists[ 'all' ][ k ] = 0 # parse the mouse chromosome, score the pssms and fill in the distributions bases = 0 seq = '' start = time.clock() for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r' ): if line.startswith( '>' ): continue seq += line.strip( '\r\n' ).replace( 'N', '' ) # Take 1kb at a time if len( seq ) >= 1000: hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.0 ) for h in hits: bin = int( 100.0 * h.p_binding ) if 0 != bin: # print h.binder, bin score_dists[ h.binder ][ bin ] += 1 score_dists[ 'all' ][ bin ] += 1 bases += len( seq ) seq = '' print 'Bases:', bases if bases >= 2800000: break elapsed = time.clock() - start print 'Scored', len( transfac_pssms ), 'pssms on', bases, 'bases in', elapsed, 'seconds' print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24 print 'Estimate for # bases/hour:', bases * 3600 / elapsed # remember scores for later f = open( 'pssm_p_binding_dists.txt', 'w' ) print 'Writing pssm p(binding) distributions to:', f pickle.dump(score_dists, f) f.close()
def test_score_pssms(): # 'V$AP1_Q2' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) print 'Got', len( transfac_pssms ), 'pssms' seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' # seq = 'acatcat' # seq = 'gat' # hits = biopsy.HitVec() hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.05 ) print hits print 'score_pssm_on_sequence: Got', len( hits ), 'hits from', len( seq ), 'bases' hits = biopsy.analyse( seq, 0.05) # print hits print 'analyse: Got', len( hits ), 'hits from', len( seq ), 'bases'
def __call__(self, sequence):
    """Score ``self.pssms`` on ``sequence`` and return the maximal chain hits.

    Leading and trailing 'N'/'n' characters are stripped from the sequence
    before it is scored with ``self.threshold``. The resulting hits are
    passed through ``consolidate_hits.maximal_chain_hits`` and that result
    is returned.
    """
    # imported at call time, as in the rest of this callable's usage
    import biopsy
    import biopsy.analyse_remos.consolidate_hits as CH

    trimmed = sequence.strip('Nn')
    raw_hits = biopsy.score_pssms_on_sequence(
        self.pssms, trimmed, self.threshold)
    chained = CH.maximal_chain_hits(raw_hits)
    return chained
def score_pssms_on_record(pssms, record):
    """Score the pssms on a record's sequence and return its p(binding).

    The pssms are wrapped with ``make_sequence_vec`` and scored against the
    text of ``record.seq``; the hits are then reduced with
    ``calculate_p_binding_from_hits`` and that value is returned.

    NOTE(review): ``record`` is presumably a Biopython ``SeqRecord`` -
    confirm against the callers.
    """
    import biopsy

    # str() replaces the deprecated Seq.tostring() and matches how the
    # sequence text is obtained elsewhere in this code (str(seq.seq)).
    sequence_text = str(record.seq)
    promoter_hits = biopsy.score_pssms_on_sequence(
        make_sequence_vec(pssms), sequence_text)
    return calculate_p_binding_from_hits(promoter_hits)