Example #1
0
 def __init__(self, bridge_sets, ppi_network, transfac_2_network, tag='', pssm_names=None, directory='.'):
     """
     Store the inputs, set the default thresholds and build one Remo for
     every name that appears in any of the bridge sets.

     bridge_sets: mapping whose values are iterables of remo names (the keys
         are not used here).
     ppi_network, transfac_2_network, tag, directory: stored unchanged as
         attributes of the same name.
     pssm_names: optional iterable of pssm names. When None, the accessions
         selected by biopsy's default transfac pssm filter are used instead.
     """
     self.bridge_sets = bridge_sets
     self.ppi_network = ppi_network
     self.transfac_2_network = transfac_2_network
     self.directory = directory
     self.tag = tag

     # Identity test: '==' would dispatch to whatever __eq__ the supplied
     # container defines, 'is None' cannot be fooled.
     if pssm_names is None:
         self.pssm_names = biopsy.get_transfac_pssm_accessions(
             biopsy.get_default_transfac_pssm_filter()
         )
     else:
         # copy the caller's names into a biopsy.SequenceVec
         self.pssm_names = biopsy.SequenceVec()
         for name in pssm_names:
             self.pssm_names.append(name)

     self.sequence_centre_key_regex = re.compile("[Mm]ouse")

     # These are assigned before the Remo objects are built because
     # self.threshold() / self.phylo_threshold() are called below
     # (presumably they read these attributes - defined outside this block).
     self.default_threshold = 0.01
     self.override_thresholds = {}
     self.default_phylo_threshold = 0.001
     self.override_phylo = {}
     self.use_max_chain = False

     # union of all the names in all the bridge sets
     remo_universe = set(chain(*bridge_sets.itervalues()))
     self.remos = dict(
         (
             name,
             Remo(
                 name,
                 self.sequence_centre_key_regex,
                 self.threshold(name),
                 self.phylo_threshold(name),
                 directory=self.directory
             )
         )
         for name in remo_universe
     )
Example #2
0
 def __init__(self, factor_synonyms):
     """
     Fill self[acc] with the synonyms of the factors of every transfac pssm.

     factor_synonyms: object providing get_synonym(name); it is kept as
         self.factor_synonyms.
     """
     self.factor_synonyms = factor_synonyms
     every_pssm = biopsy.transfac.PssmFilter.all_pssms()
     for acc in biopsy.get_transfac_pssm_accessions(every_pssm):
         pssm_entry = biopsy.transfac.TableLink(acc).entry
         for factor in pssm_entry.factors:
             # self[acc] is looked up once per factor, so an accession
             # without factors is never indexed.
             synonyms_for_acc = self[acc]
             synonyms_for_acc.add(
                 self.factor_synonyms.get_synonym(factor.link.entry.name))
Example #3
0
 def __init__(self,
              bridge_sets,
              ppi_network,
              transfac_2_network,
              tag='',
              pssm_names=None,
              directory='.'):
     """
     Remember the inputs, set the default thresholds and create a Remo for
     each name occurring in any bridge set.

     bridge_sets: mapping whose values are iterables of remo names (keys are
         ignored here).
     ppi_network, transfac_2_network, tag, directory: stored as attributes
         of the same name.
     pssm_names: optional iterable of pssm names; None selects the
         accessions passing biopsy's default transfac pssm filter.
     """
     self.bridge_sets = bridge_sets
     self.ppi_network = ppi_network
     self.transfac_2_network = transfac_2_network
     self.directory = directory
     self.tag = tag

     # 'is None' rather than '==': equality would be delegated to the
     # argument's own __eq__, identity is unambiguous.
     if pssm_names is None:
         self.pssm_names = biopsy.get_transfac_pssm_accessions(
             biopsy.get_default_transfac_pssm_filter())
     else:
         # copy the supplied names into a biopsy.SequenceVec
         self.pssm_names = biopsy.SequenceVec()
         for name in pssm_names:
             self.pssm_names.append(name)

     self.sequence_centre_key_regex = re.compile("[Mm]ouse")

     # Set before any Remo is built: self.threshold() and
     # self.phylo_threshold() are called below (presumably they consult
     # these attributes - they are defined outside this block).
     self.default_threshold = 0.01
     self.override_thresholds = {}
     self.default_phylo_threshold = 0.001
     self.override_phylo = {}
     self.use_max_chain = False

     # every distinct name mentioned by any bridge set
     remo_universe = set(chain(*bridge_sets.itervalues()))
     self.remos = dict((name,
                        Remo(name,
                             self.sequence_centre_key_regex,
                             self.threshold(name),
                             self.phylo_threshold(name),
                             directory=self.directory))
                       for name in remo_universe)
Example #4
0
def test_pssm_distributions():
    pssm_acc = 'M00750'
    transfac_pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    print 'Got', len(transfac_pssms), 'pssms'

    # initialise the distributions
    score_dists = {}
    for tp in transfac_pssms:
        score_dists[tp] = {}
        for k in range(1, 100):
            score_dists[tp][k] = 0
    score_dists['all'] = {}
    for k in range(1, 100):
        score_dists['all'][k] = 0

    # parse the mouse chromosome, score the pssms and fill in the distributions
    bases = 0
    seq = ''
    start = time.clock()
    for line in open(
            'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa',
            'r'):
        if line.startswith('>'): continue
        seq += line.strip('\r\n').replace('N', '')
        # Take 1kb at a time
        if len(seq) >= 1000:
            hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.0)
            for h in hits:
                bin = int(100.0 * h.p_binding)
                if 0 != bin:
                    # print h.binder, bin
                    score_dists[h.binder][bin] += 1
                    score_dists['all'][bin] += 1
            bases += len(seq)
            seq = ''
            print 'Bases:', bases
        if bases >= 2800000: break
    elapsed = time.clock() - start
    print 'Scored', len(
        transfac_pssms), 'pssms on', bases, 'bases in', elapsed, 'seconds'
    print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases
    print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24
    print 'Estimate for # bases/hour:', bases * 3600 / elapsed

    # remember scores for later
    f = open('pssm_p_binding_dists.txt', 'w')
    print 'Writing pssm p(binding) distributions to:', f
    pickle.dump(score_dists, f)
    f.close()
Example #5
0
def test_transfac_pssms():
    transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    for p in transfac_pssms:
        print p, biopsy.get_transfac_pssm_name( p )
    print 'Have', len( transfac_pssms ), 'transfac pssms'
    for acc in [ 'R19099', 'M00418' ]:
        print acc, biopsy.get_transfac_pssm_name( acc )
        biopsy.get_pssm( acc )
        print 'Under pssm'
        for under_pssm in biopsy.get_pssm( acc ).get_dist( True, False ):
            print under_pssm
        print 'Under background'
        for under_background in biopsy.get_pssm( acc ).get_dist( False, False ):
            print under_background
Example #6
0
def test_score_fasta():
    print '******** test_score_fasta()'
    sequences = biopsy.SequenceVec()
    for name, seq in biopsy.parse_fasta(
            'c:/analysis/keiths/msx1/enhancerD.fa').iteritems():
        print name, ':', len(seq), 'bases'
        sequences.append(seq)
        if len(sequences) >= 3: break
    phylo_result = biopsy.score_pssms_on_phylo_sequences(
        biopsy.get_transfac_pssm_accessions(
            biopsy.get_default_transfac_pssm_filter()), sequences, 0.05)
    print 'Max Chain:'
    print phylo_result[1]
    # print biopsy.sort_hits_by_position( phylo_result[ 0 ] )
    print 'Got', len(phylo_result[0]), 'hits from', len(sequences[0]), 'bases'
Example #7
0
def test_transfac_pssms():
    transfac_pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    for p in transfac_pssms:
        print p, biopsy.get_transfac_pssm_name(p)
    print 'Have', len(transfac_pssms), 'transfac pssms'
    for acc in ['R19099', 'M00418']:
        print acc, biopsy.get_transfac_pssm_name(acc)
        biopsy.get_pssm(acc)
        print 'Under pssm'
        for under_pssm in biopsy.get_pssm(acc).get_dist(True, False):
            print under_pssm
        print 'Under background'
        for under_background in biopsy.get_pssm(acc).get_dist(False, False):
            print under_background
Example #8
0
def test_score_fasta():
    print '******** test_score_fasta()'
    sequences = biopsy.SequenceVec()
    for name, seq in biopsy.parse_fasta( 'c:/analysis/keiths/msx1/enhancerD.fa' ).iteritems():
        print name, ':', len( seq ), 'bases'
        sequences.append( seq )
        if len( sequences) >= 3: break
    phylo_result = biopsy.score_pssms_on_phylo_sequences(
            biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ),
            sequences,
            0.05 )
    print 'Max Chain:'
    print phylo_result[ 1 ]
    # print biopsy.sort_hits_by_position( phylo_result[ 0 ] )
    print 'Got', len( phylo_result[ 0 ] ), 'hits from', len( sequences[ 0 ] ), 'bases'
Example #9
0
def test_score_pssms():
    # 'V$AP1_Q2'
    transfac_pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    print 'Got', len(transfac_pssms), 'pssms'
    seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga'
    # seq = 'acatcat'
    # seq = 'gat'
    # hits = biopsy.HitVec()
    hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.05)
    print hits
    print 'score_pssm_on_sequence: Got', len(hits), 'hits from', len(
        seq), 'bases'
    hits = biopsy.analyse(seq, 0.05)
    # print hits
    print 'analyse: Got', len(hits), 'hits from', len(seq), 'bases'
Example #10
0
def test_remome_analysis():
    print '**************** test_remome_analysis ****************'
    analysis = biopsy.RemomeAnalysis(
        biopsy.Remome.load('C:/Data/ReMos/remo_space.bin'))
    analysis.analysis.serialise('analysis.bin')
    analysis_copy = biopsy.Analysis.deserialise('analysis.bin')
    pssms = biopsy.get_transfac_pssm_accessions(
        biopsy.get_default_transfac_pssm_filter())
    threshold = 0.05
    phylo_threshold = 0.02
    try:
        analysis.analyse(pssms, threshold, phylo_threshold)
        analysis.analysis.serialise('analysis.bin')
    except:
        analysis.analysis.serialise('analysis.bin')
        raise
Example #11
0
def test_pssm_distributions():
    pssm_acc = 'M00750'
    transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    print 'Got', len( transfac_pssms ), 'pssms'

    # initialise the distributions
    score_dists = { }
    for tp in transfac_pssms:
        score_dists[ tp ] = { }
        for k in range(1,100):
            score_dists[ tp ][ k ] = 0
    score_dists[ 'all' ] = { }
    for k in range(1,100):
        score_dists[ 'all' ][ k ] = 0

    # parse the mouse chromosome, score the pssms and fill in the distributions
    bases = 0
    seq = ''
    start = time.clock()
    for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r' ):
        if line.startswith( '>' ): continue
        seq += line.strip( '\r\n' ).replace( 'N', '' )
        # Take 1kb at a time
        if len( seq ) >= 1000:
            hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.0 )
            for h in hits:
                bin = int( 100.0 * h.p_binding )
                if 0 != bin:
                    # print h.binder, bin
                    score_dists[ h.binder ][ bin ] += 1
                    score_dists[ 'all' ][ bin ] += 1
            bases += len( seq )
            seq = ''
            print 'Bases:', bases
        if bases >= 2800000: break
    elapsed = time.clock() - start
    print 'Scored', len( transfac_pssms ), 'pssms on', bases, 'bases in', elapsed, 'seconds'
    print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases
    print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24
    print 'Estimate for # bases/hour:', bases * 3600 / elapsed

    # remember scores for later
    f = open( 'pssm_p_binding_dists.txt', 'w' )
    print 'Writing pssm p(binding) distributions to:', f
    pickle.dump(score_dists, f)
    f.close()
Example #12
0
def test_remome_analysis():
    print '**************** test_remome_analysis ****************'
    analysis = biopsy.RemomeAnalysis( biopsy.Remome.load('C:/Data/ReMos/remo_space.bin') )
    analysis.analysis.serialise( 'analysis.bin' )
    analysis_copy = biopsy.Analysis.deserialise( 'analysis.bin' )
    pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    threshold = 0.05
    phylo_threshold = 0.02
    try:
        analysis.analyse(
                pssms,
                threshold,
                phylo_threshold
        )
        analysis.analysis.serialise( 'analysis.bin' )
    except:
        analysis.analysis.serialise( 'analysis.bin' )
        raise
Example #13
0
def test_score_pssms():
    # 'V$AP1_Q2'
    transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() )
    print 'Got', len( transfac_pssms ), 'pssms'
    seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga'
    # seq = 'acatcat'
    # seq = 'gat'
    # hits = biopsy.HitVec()
    hits = biopsy.score_pssms_on_sequence(
            transfac_pssms,
            seq,
            0.05 )
    print hits
    print 'score_pssm_on_sequence: Got', len( hits ), 'hits from', len( seq ), 'bases'
    hits = biopsy.analyse(
            seq,
            0.05)
    # print hits
    print 'analyse: Got', len( hits ), 'hits from', len( seq ), 'bases'
def pssm_accs():
    """
    Return the accessions of the pssms we will test.

    The filter is constructed with no arguments. The earlier version carried
    a commented-out name_regex_pattern = 'NFAT_Q4_01' argument, presumably
    a way to restrict the selection to a single matrix.
    """
    return biopsy.get_transfac_pssm_accessions(biopsy.transfac.PssmFilter())
Example #15
0
        print 'Getting database references and names for all matrices'
        map = build_map.build_map()
        matrix_links = dict((m.acc.as_db_ref(), map.links(m.acc.as_db_ref())) for m in T.Matrix.all())
        matrix_names = dict((m.acc.as_db_ref(), names_for_matrix(m)) for m in T.Matrix.all())

    # look at one matrix in particular
    mat = T.Matrix(1151)
    mat_links = map.links(mat.acc.as_db_ref())
    myod_gene = T.DbRef.parse_as('17927', T.db.entrez_gene)
    bind = psimi.networks['BIND']
    myod_in_bind = bind.nodes_for_ref(myod_gene)
    myod_node = bind.nodes_for_name('MYOD')[0]
    myod_interactor = bind.interactor_for_node(myod_node)

    # those pssms we are interested in
    pssm_accs = [
            T.TableLink(acc)
            for acc
            in biopsy.get_transfac_pssm_accessions(
                    biopsy.get_default_transfac_pssm_filter()
            )
    ]

    for db_name in psimi.dbs():
        print
        print db_name
        network = psimi.networks[db_name]
        print network.summary()
        transfac_2_network = psimi.Transfac2Network(map, network)
        print transfac_2_network.coverage_summary(pssm_accs)
Example #16
0
 def __init__(self, factor_synonyms):
     """
     Populate self[acc] with a synonym for each factor of every transfac pssm.

     factor_synonyms: object offering get_synonym(name); stored as
         self.factor_synonyms.
     """
     self.factor_synonyms = factor_synonyms
     accessions = biopsy.get_transfac_pssm_accessions(
         biopsy.transfac.PssmFilter.all_pssms())
     for acc in accessions:
         factors = biopsy.transfac.TableLink(acc).entry.factors
         for factor in factors:
             # index self per factor so that accessions without any
             # factors are never looked up
             target = self[acc]
             target.add(self.factor_synonyms.get_synonym(factor.link.entry.name))
Example #17
0
def pssm_accs():
    """
    Return the accessions of the pssms we will test.

    Uses a PssmFilter built without arguments. A name_regex_pattern of
    'NFAT_Q4_01' was left commented out in the earlier version, presumably
    as a way to narrow the test to one matrix.
    """
    unrestricted_filter = biopsy.transfac.PssmFilter()
    return biopsy.get_transfac_pssm_accessions(unrestricted_filter)