Beispiel #1
0
 def score(self, test_case):
     """Score the PSSMs of a test case on its sequence and return the hits.

     ``test_case`` is unpacked as a ``(record, seq, pssms)`` triple; the
     record itself is not used here. Hits are computed with
     ``self.min_threshold`` as the threshold.
     """
     _record, sequence, pssm_names = test_case
     pssm_vec = make_sequence_vec(pssm_names)
     bases = str(sequence.seq)
     return B.score_pssms_on_sequence(
         pssm_vec, bases, threshold=self.min_threshold)
Beispiel #2
0
def test_pssm_distributions():
    pssm_acc = 'M00750'
    transfac_pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    print 'Got', len(transfac_pssms), 'pssms'

    # initialise the distributions
    score_dists = {}
    for tp in transfac_pssms:
        score_dists[tp] = {}
        for k in range(1, 100):
            score_dists[tp][k] = 0
    score_dists['all'] = {}
    for k in range(1, 100):
        score_dists['all'][k] = 0

    # parse the mouse chromosome, score the pssms and fill in the distributions
    bases = 0
    seq = ''
    start = time.clock()
    for line in open(
            'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa',
            'r'):
        if line.startswith('>'): continue
        seq += line.strip('\r\n').replace('N', '')
        # Take 1kb at a time
        if len(seq) >= 1000:
            hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.0)
            for h in hits:
                bin = int(100.0 * h.p_binding)
                if 0 != bin:
                    # print h.binder, bin
                    score_dists[h.binder][bin] += 1
                    score_dists['all'][bin] += 1
            bases += len(seq)
            seq = ''
            print 'Bases:', bases
        if bases >= 2800000: break
    elapsed = time.clock() - start
    print 'Scored', len(
        transfac_pssms), 'pssms on', bases, 'bases in', elapsed, 'seconds'
    print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases
    print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24
    print 'Estimate for # bases/hour:', bases * 3600 / elapsed

    # remember scores for later
    f = open('pssm_p_binding_dists.txt', 'w')
    print 'Writing pssm p(binding) distributions to:', f
    pickle.dump(score_dists, f)
    f.close()
Beispiel #3
0
def test_score_pssms():
    # 'V$AP1_Q2'
    transfac_pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    print 'Got', len(transfac_pssms), 'pssms'
    seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga'
    # seq = 'acatcat'
    # seq = 'gat'
    # hits = biopsy.HitVec()
    hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.05)
    print hits
    print 'score_pssm_on_sequence: Got', len(hits), 'hits from', len(
        seq), 'bases'
    hits = biopsy.analyse(seq, 0.05)
    # print hits
    print 'analyse: Got', len(hits), 'hits from', len(seq), 'bases'
Beispiel #4
0
def test_pssm_distributions():
    pssm_acc = 'M00750'
    transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    print 'Got', len( transfac_pssms ), 'pssms'

    # initialise the distributions
    score_dists = { }
    for tp in transfac_pssms:
        score_dists[ tp ] = { }
        for k in range(1,100):
            score_dists[ tp ][ k ] = 0
    score_dists[ 'all' ] = { }
    for k in range(1,100):
        score_dists[ 'all' ][ k ] = 0

    # parse the mouse chromosome, score the pssms and fill in the distributions
    bases = 0
    seq = ''
    start = time.clock()
    for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r' ):
        if line.startswith( '>' ): continue
        seq += line.strip( '\r\n' ).replace( 'N', '' )
        # Take 1kb at a time
        if len( seq ) >= 1000:
            hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.0 )
            for h in hits:
                bin = int( 100.0 * h.p_binding )
                if 0 != bin:
                    # print h.binder, bin
                    score_dists[ h.binder ][ bin ] += 1
                    score_dists[ 'all' ][ bin ] += 1
            bases += len( seq )
            seq = ''
            print 'Bases:', bases
        if bases >= 2800000: break
    elapsed = time.clock() - start
    print 'Scored', len( transfac_pssms ), 'pssms on', bases, 'bases in', elapsed, 'seconds'
    print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases
    print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24
    print 'Estimate for # bases/hour:', bases * 3600 / elapsed

    # remember scores for later
    f = open( 'pssm_p_binding_dists.txt', 'w' )
    print 'Writing pssm p(binding) distributions to:', f
    pickle.dump(score_dists, f)
    f.close()
Beispiel #5
0
def test_score_pssms():
    # 'V$AP1_Q2'
    transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    print 'Got', len( transfac_pssms ), 'pssms'
    seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga'
    # seq = 'acatcat'
    # seq = 'gat'
    # hits = biopsy.HitVec()
    hits = biopsy.score_pssms_on_sequence(
            transfac_pssms,
            seq,
            0.05 )
    print hits
    print 'score_pssm_on_sequence: Got', len( hits ), 'hits from', len( seq ), 'bases'
    hits = biopsy.analyse(
            seq,
            0.05)
    # print hits
    print 'analyse: Got', len( hits ), 'hits from', len( seq ), 'bases'
Beispiel #6
0
 def __call__(self, sequence):
     """Score ``self.pssms`` on ``sequence`` and return its maximal chain hits.

     Leading and trailing 'N'/'n' characters are stripped from the sequence
     before it is scored with ``self.threshold``; the resulting hits are
     passed through ``maximal_chain_hits`` from
     ``biopsy.analyse_remos.consolidate_hits``.
     """
     import biopsy
     import biopsy.analyse_remos.consolidate_hits as consolidate
     trimmed = sequence.strip('Nn')
     raw_hits = biopsy.score_pssms_on_sequence(
         self.pssms, trimmed, self.threshold)
     return consolidate.maximal_chain_hits(raw_hits)
Beispiel #7
0
 def score(self, test_case):
     """Return the hits from scoring a test case's PSSMs on its sequence.

     The test case is a ``(record, seq, pssms)`` triple. The PSSMs are
     converted with ``make_sequence_vec`` and scored against ``str(seq.seq)``
     using ``self.min_threshold``; the record is ignored.
     """
     _unused_record, case_seq, case_pssms = test_case
     return B.score_pssms_on_sequence(
         make_sequence_vec(case_pssms),
         str(case_seq.seq),
         threshold=self.min_threshold,
     )
def score_pssms_on_record(pssms, record):
    """Score ``pssms`` on a record's sequence and return the p(binding) value.

    The PSSMs are converted with ``make_sequence_vec``, scored against the
    text of ``record.seq`` with ``biopsy.score_pssms_on_sequence``, and the
    hits are reduced by ``calculate_p_binding_from_hits``.
    """
    import biopsy
    # Use str() like the sibling ``score`` method does. Seq.tostring() is
    # deprecated in Biopython (assumes record is a Biopython SeqRecord --
    # TODO confirm), whereas str(seq) yields the same sequence text.
    promoter_hits = biopsy.score_pssms_on_sequence(
        make_sequence_vec(pssms), str(record.seq))
    return calculate_p_binding_from_hits(promoter_hits)