def __init__(self, bridge_sets, ppi_network, transfac_2_network, tag='', pssm_names=None, directory='.'):
    """Store the analysis inputs and build one Remo per name found in the bridge sets.

    bridge_sets -- mapping whose values are iterables of remo names; the
        union of all those names decides which Remo objects are created.
    ppi_network -- stored unchanged on the instance.
    transfac_2_network -- stored unchanged on the instance.
    tag -- stored unchanged on the instance (defaults to '').
    pssm_names -- optional iterable of PSSM names; they are copied into a
        biopsy.SequenceVec.  When None, the accessions selected by biopsy's
        default TRANSFAC PSSM filter are used instead.
    directory -- stored on the instance and handed to every Remo.
    """
    self.bridge_sets = bridge_sets
    self.ppi_network = ppi_network
    self.transfac_2_network = transfac_2_network
    self.directory = directory
    self.tag = tag

    # Identity test: "None == x" would dispatch to x.__eq__, which container
    # types are free to overload.
    if pssm_names is None:
        self.pssm_names = biopsy.get_transfac_pssm_accessions(
            biopsy.get_default_transfac_pssm_filter())
    else:
        self.pssm_names = biopsy.SequenceVec()
        for name in pssm_names:
            self.pssm_names.append(name)

    self.sequence_centre_key_regex = re.compile("[Mm]ouse")

    # These settings are assigned before the Remo objects are built because
    # self.threshold() and self.phylo_threshold() are called below.
    # NOTE(review): presumably those methods read these attributes - they are
    # defined outside this block, confirm there.
    self.default_threshold = 0.01
    self.override_thresholds = {}
    self.default_phylo_threshold = 0.001
    self.override_phylo = {}
    self.use_max_chain = False

    # Union of every remo name mentioned in any bridge set.
    remo_universe = set(chain(*bridge_sets.values()))
    self.remos = dict(
        (
            name,
            Remo(
                name,
                self.sequence_centre_key_regex,
                self.threshold(name),
                self.phylo_threshold(name),
                directory=self.directory
            )
        )
        for name in remo_universe
    )
def __init__(self, factor_synonyms):
    """Fill this mapping with the factor synonyms of every TRANSFAC PSSM.

    factor_synonyms -- object providing get_synonym(name); it is kept on the
        instance and used to translate each factor's entry name.

    For each accession returned for the all-PSSMs filter, the synonym of
    every linked factor is added to self[accession].
    NOTE(review): this relies on self[accession] yielding an object with an
    add() method for unseen keys - the class definition is not visible here.
    """
    self.factor_synonyms = factor_synonyms
    all_pssms_filter = biopsy.transfac.PssmFilter.all_pssms()
    for accession in biopsy.get_transfac_pssm_accessions(all_pssms_filter):
        pssm_entry = biopsy.transfac.TableLink(accession).entry
        for linked_factor in pssm_entry.factors:
            # self[accession] is looked up per factor, so accessions that
            # have no factors are never touched.
            synonyms_for_pssm = self[accession]
            synonyms_for_pssm.add(
                self.factor_synonyms.get_synonym(linked_factor.link.entry.name))
def __init__(self, bridge_sets, ppi_network, transfac_2_network, tag='', pssm_names=None, directory='.'):
    """Record the analysis inputs and create a Remo for each name in the bridge sets.

    bridge_sets -- mapping whose values are iterables of remo names; a Remo
        is created for each distinct name across all values.
    ppi_network -- kept on the instance as given.
    transfac_2_network -- kept on the instance as given.
    tag -- kept on the instance as given (defaults to '').
    pssm_names -- optional iterable of PSSM names, copied into a
        biopsy.SequenceVec.  None selects the accessions of biopsy's default
        TRANSFAC PSSM filter.
    directory -- kept on the instance and passed to every Remo.
    """
    self.bridge_sets = bridge_sets
    self.ppi_network = ppi_network
    self.transfac_2_network = transfac_2_network
    self.directory = directory
    self.tag = tag

    # Use an identity check; "None == pssm_names" calls the argument's own
    # __eq__, which need not return a plain boolean.
    if pssm_names is None:
        self.pssm_names = biopsy.get_transfac_pssm_accessions(
            biopsy.get_default_transfac_pssm_filter())
    else:
        self.pssm_names = biopsy.SequenceVec()
        for name in pssm_names:
            self.pssm_names.append(name)

    self.sequence_centre_key_regex = re.compile("[Mm]ouse")

    # Set before the Remo objects are constructed, because self.threshold()
    # and self.phylo_threshold() are invoked during construction.
    # NOTE(review): presumably they consult these defaults/overrides - the
    # methods live outside this block, confirm there.
    self.default_threshold = 0.01
    self.override_thresholds = {}
    self.default_phylo_threshold = 0.001
    self.override_phylo = {}
    self.use_max_chain = False

    # Every distinct remo name that appears in any bridge set.
    remo_universe = set(chain(*bridge_sets.values()))
    self.remos = dict(
        (name,
         Remo(name,
              self.sequence_centre_key_regex,
              self.threshold(name),
              self.phylo_threshold(name),
              directory=self.directory))
        for name in remo_universe)
def test_pssm_distributions(): pssm_acc = 'M00750' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) print 'Got', len(transfac_pssms), 'pssms' # initialise the distributions score_dists = {} for tp in transfac_pssms: score_dists[tp] = {} for k in range(1, 100): score_dists[tp][k] = 0 score_dists['all'] = {} for k in range(1, 100): score_dists['all'][k] = 0 # parse the mouse chromosome, score the pssms and fill in the distributions bases = 0 seq = '' start = time.clock() for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r'): if line.startswith('>'): continue seq += line.strip('\r\n').replace('N', '') # Take 1kb at a time if len(seq) >= 1000: hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.0) for h in hits: bin = int(100.0 * h.p_binding) if 0 != bin: # print h.binder, bin score_dists[h.binder][bin] += 1 score_dists['all'][bin] += 1 bases += len(seq) seq = '' print 'Bases:', bases if bases >= 2800000: break elapsed = time.clock() - start print 'Scored', len( transfac_pssms), 'pssms on', bases, 'bases in', elapsed, 'seconds' print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24 print 'Estimate for # bases/hour:', bases * 3600 / elapsed # remember scores for later f = open('pssm_p_binding_dists.txt', 'w') print 'Writing pssm p(binding) distributions to:', f pickle.dump(score_dists, f) f.close()
def test_transfac_pssms(): transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) for p in transfac_pssms: print p, biopsy.get_transfac_pssm_name( p ) print 'Have', len( transfac_pssms ), 'transfac pssms' for acc in [ 'R19099', 'M00418' ]: print acc, biopsy.get_transfac_pssm_name( acc ) biopsy.get_pssm( acc ) print 'Under pssm' for under_pssm in biopsy.get_pssm( acc ).get_dist( True, False ): print under_pssm print 'Under background' for under_background in biopsy.get_pssm( acc ).get_dist( False, False ): print under_background
def test_score_fasta(): print '******** test_score_fasta()' sequences = biopsy.SequenceVec() for name, seq in biopsy.parse_fasta( 'c:/analysis/keiths/msx1/enhancerD.fa').iteritems(): print name, ':', len(seq), 'bases' sequences.append(seq) if len(sequences) >= 3: break phylo_result = biopsy.score_pssms_on_phylo_sequences( biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()), sequences, 0.05) print 'Max Chain:' print phylo_result[1] # print biopsy.sort_hits_by_position( phylo_result[ 0 ] ) print 'Got', len(phylo_result[0]), 'hits from', len(sequences[0]), 'bases'
def test_transfac_pssms(): transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) for p in transfac_pssms: print p, biopsy.get_transfac_pssm_name(p) print 'Have', len(transfac_pssms), 'transfac pssms' for acc in ['R19099', 'M00418']: print acc, biopsy.get_transfac_pssm_name(acc) biopsy.get_pssm(acc) print 'Under pssm' for under_pssm in biopsy.get_pssm(acc).get_dist(True, False): print under_pssm print 'Under background' for under_background in biopsy.get_pssm(acc).get_dist(False, False): print under_background
def test_score_fasta(): print '******** test_score_fasta()' sequences = biopsy.SequenceVec() for name, seq in biopsy.parse_fasta( 'c:/analysis/keiths/msx1/enhancerD.fa' ).iteritems(): print name, ':', len( seq ), 'bases' sequences.append( seq ) if len( sequences) >= 3: break phylo_result = biopsy.score_pssms_on_phylo_sequences( biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ), sequences, 0.05 ) print 'Max Chain:' print phylo_result[ 1 ] # print biopsy.sort_hits_by_position( phylo_result[ 0 ] ) print 'Got', len( phylo_result[ 0 ] ), 'hits from', len( sequences[ 0 ] ), 'bases'
def test_score_pssms(): # 'V$AP1_Q2' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) print 'Got', len(transfac_pssms), 'pssms' seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' # seq = 'acatcat' # seq = 'gat' # hits = biopsy.HitVec() hits = biopsy.score_pssms_on_sequence(transfac_pssms, seq, 0.05) print hits print 'score_pssm_on_sequence: Got', len(hits), 'hits from', len( seq), 'bases' hits = biopsy.analyse(seq, 0.05) # print hits print 'analyse: Got', len(hits), 'hits from', len(seq), 'bases'
def test_remome_analysis(): print '**************** test_remome_analysis ****************' analysis = biopsy.RemomeAnalysis( biopsy.Remome.load('C:/Data/ReMos/remo_space.bin')) analysis.analysis.serialise('analysis.bin') analysis_copy = biopsy.Analysis.deserialise('analysis.bin') pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter()) threshold = 0.05 phylo_threshold = 0.02 try: analysis.analyse(pssms, threshold, phylo_threshold) analysis.analysis.serialise('analysis.bin') except: analysis.analysis.serialise('analysis.bin') raise
def test_pssm_distributions(): pssm_acc = 'M00750' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) print 'Got', len( transfac_pssms ), 'pssms' # initialise the distributions score_dists = { } for tp in transfac_pssms: score_dists[ tp ] = { } for k in range(1,100): score_dists[ tp ][ k ] = 0 score_dists[ 'all' ] = { } for k in range(1,100): score_dists[ 'all' ][ k ] = 0 # parse the mouse chromosome, score the pssms and fill in the distributions bases = 0 seq = '' start = time.clock() for line in open( 'C:/Data/ensembl/chromosomes/Mus_musculus.NCBIM34.dec.dna.chromosome.1.fa', 'r' ): if line.startswith( '>' ): continue seq += line.strip( '\r\n' ).replace( 'N', '' ) # Take 1kb at a time if len( seq ) >= 1000: hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.0 ) for h in hits: bin = int( 100.0 * h.p_binding ) if 0 != bin: # print h.binder, bin score_dists[ h.binder ][ bin ] += 1 score_dists[ 'all' ][ bin ] += 1 bases += len( seq ) seq = '' print 'Bases:', bases if bases >= 2800000: break elapsed = time.clock() - start print 'Scored', len( transfac_pssms ), 'pssms on', bases, 'bases in', elapsed, 'seconds' print 'Estimate for mouse chromosome 1 (secs):', elapsed * 190000000 / bases print 'Estimate for mouse chromosome 1 (days):', elapsed * 190000000 / bases / 60 / 60 / 24 print 'Estimate for # bases/hour:', bases * 3600 / elapsed # remember scores for later f = open( 'pssm_p_binding_dists.txt', 'w' ) print 'Writing pssm p(binding) distributions to:', f pickle.dump(score_dists, f) f.close()
def test_remome_analysis(): print '**************** test_remome_analysis ****************' analysis = biopsy.RemomeAnalysis( biopsy.Remome.load('C:/Data/ReMos/remo_space.bin') ) analysis.analysis.serialise( 'analysis.bin' ) analysis_copy = biopsy.Analysis.deserialise( 'analysis.bin' ) pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) threshold = 0.05 phylo_threshold = 0.02 try: analysis.analyse( pssms, threshold, phylo_threshold ) analysis.analysis.serialise( 'analysis.bin' ) except: analysis.analysis.serialise( 'analysis.bin' ) raise
def test_score_pssms(): # 'V$AP1_Q2' transfac_pssms = biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) print 'Got', len( transfac_pssms ), 'pssms' seq = 'tacatcatctgtctgcagtagtctaaccgaccccccccagttttagaagcagactgcatgcggacgggaccgcggatcgcgcggtgcgcctcagtgtacttccgaacgaatgagtcattaatagagcgctatatcgtaactgtctttgacgaagtataccgaaaccgtgcagccagacgtgatccgggcgttgtaaaggcgatcagcgccctaggagtaccatttttgccgtaggcttgcgtctcaaagaccagctggggcgtggtatcactcgtcagtacgatttctgccagatagatagcatagactgaaccttaggcccaatagggacacaattacccgagtgactgactggtctaaggggagtccccccttaaaacgttttacgtaatagcgggctccagaagcaaagcatcggtttgagccccagtactaaacgtttgagtgtttgctctcgtctgataggtaaaccgacaagagaaccaagctcaaggcgcggtaggtgcgccttgcgaactgttgatgccgtgagcgccaccatcccgtgcatcataggcagggagagaagaccacatggccttgcgaccgtatgagctgtttcagattaaatgccaacgggcatggtcggtgtccagcattttttgcagtcagctggtggtacacagtggggacaagaacgcctctggtagatgtcttctgaaggagtaactcatttcgttgaatcgaccttcccttgcgcttgaacgcggacctctagtctctctcgcagactggggtcgaaaatcaaggtagatatggaatgttccgcatgagggtagcgaccggatcgggcgtcaagtatatcctccctgctacgtccccctactagcctcagtccgcctcgaacctaggaagattggccacatcagcttggtggatgcctggtccatacttcagacccgagaatgttagacaggaccccatttggctcctttacgtacgatctatgtagacgcagtga' # seq = 'acatcat' # seq = 'gat' # hits = biopsy.HitVec() hits = biopsy.score_pssms_on_sequence( transfac_pssms, seq, 0.05 ) print hits print 'score_pssm_on_sequence: Got', len( hits ), 'hits from', len( seq ), 'bases' hits = biopsy.analyse( seq, 0.05) # print hits print 'analyse: Got', len( hits ), 'hits from', len( seq ), 'bases'
def pssm_accs():
    """Return the accessions of the TRANSFAC PSSMs we will test.

    The filter is constructed without arguments.  To narrow the selection,
    pass a keyword such as name_regex_pattern='NFAT_Q4_01' to PssmFilter.
    """
    unrestricted_filter = biopsy.transfac.PssmFilter()
    accessions = biopsy.get_transfac_pssm_accessions(unrestricted_filter)
    return accessions
# Script section: collect database links and names for every TRANSFAC matrix,
# inspect one matrix and the MYOD gene in the BIND network, then print how
# well each PSI-MI network covers the default set of PSSMs.
print 'Getting database references and names for all matrices'
# NOTE: the name "map" shadows the builtin map() for the rest of this module.
map = build_map.build_map()
# Both dictionaries are keyed by the matrix accession converted with as_db_ref().
matrix_links = dict((m.acc.as_db_ref(), map.links(m.acc.as_db_ref())) for m in T.Matrix.all())
matrix_names = dict((m.acc.as_db_ref(), names_for_matrix(m)) for m in T.Matrix.all())

# look at one matrix in particular
mat = T.Matrix(1151)
mat_links = map.links(mat.acc.as_db_ref())
# Entrez gene reference '17927' and the BIND nodes found for it / for the name 'MYOD'.
myod_gene = T.DbRef.parse_as('17927', T.db.entrez_gene)
bind = psimi.networks['BIND']
myod_in_bind = bind.nodes_for_ref(myod_gene)
# Takes the first node returned; raises IndexError if nodes_for_name finds none.
myod_node = bind.nodes_for_name('MYOD')[0]
myod_interactor = bind.interactor_for_node(myod_node)

# those pssms we are interested in
# (accessions from the default filter, each wrapped in a T.TableLink)
pssm_accs = [ T.TableLink(acc) for acc in biopsy.get_transfac_pssm_accessions( biopsy.get_default_transfac_pssm_filter() ) ]
# For every PSI-MI database: print its name, the network summary and the
# coverage summary of the chosen PSSMs.
for db_name in psimi.dbs():
    print
    print db_name
    network = psimi.networks[db_name]
    print network.summary()
    transfac_2_network = psimi.Transfac2Network(map, network)
    print transfac_2_network.coverage_summary(pssm_accs)
def __init__(self, factor_synonyms):
    """Map each TRANSFAC PSSM accession to the synonyms of its factors.

    factor_synonyms -- object with a get_synonym(name) method; stored on the
        instance and used for every factor name.

    Iterates the accessions returned for PssmFilter.all_pssms() and, for each
    factor linked from the accession's table entry, adds the factor's synonym
    to self[accession].
    NOTE(review): assumes indexing self with a new accession returns a
    set-like object - the enclosing class is outside this view.
    """
    self.factor_synonyms = factor_synonyms
    pssm_filter = biopsy.transfac.PssmFilter.all_pssms()
    for acc in biopsy.get_transfac_pssm_accessions(pssm_filter):
        factors = biopsy.transfac.TableLink(acc).entry.factors
        for factor in factors:
            # Index self inside the loop so that accessions without any
            # factors are never looked up.
            target = self[acc]
            target.add(
                self.factor_synonyms.get_synonym(factor.link.entry.name))
def pssm_accs():
    """The pssms we will test, as a sequence of TRANSFAC accessions.

    Uses a PssmFilter built with no arguments; supplying
    name_regex_pattern='NFAT_Q4_01' to the filter is one way to restrict it.
    """
    test_filter = biopsy.transfac.PssmFilter()
    selected = biopsy.get_transfac_pssm_accessions(test_filter)
    return selected