def main():
    """Group domains by parent sequence and run `_predict` over each group.

    Reads the module-level `opts` mapping:

    * ``opts[SEQUENCES]`` -- if set, a path to a text file whose first
      whitespace-separated token on each non-blank, non-``#`` line is a
      sequence key.  The option is overwritten in place with the parsed
      list of keys.
    * ``opts[EXPERIMENT]`` -- passed through to `sequences()`.
    * ``opts[PROCESSORS]`` -- worker count handed to `MultiProcessor`.

    Side effects: rebinds the module global `_bfp`, prints progress, and
    closes the `lr` / `mi` metric objects (when they expose ``close``) once
    processing finishes or fails.
    """
    from hpf.processing import MultiProcessor
    global _bfp, _db

    if opts[SEQUENCES]:
        # The option arrives as a file path; swap it for the keys it lists.
        # `with` guarantees the handle is released (it used to be leaked).
        path = opts[SEQUENCES]
        with open(path) as handle:
            opts[SEQUENCES] = [
                line.strip().split()[0]
                for line in handle
                if line.strip() != "" and not line.startswith("#")
            ]

    # parent_sequence_key -> [domain_sequence_key, ...]
    seqs = defaultdict(list)
    for parent_key, domain_key in sequences(sequence_keys=opts[SEQUENCES],
                                            experiment_keys=opts[EXPERIMENT]):
        seqs[parent_key].append(domain_key)

    # The tasks are grouped by parent_sequence_key
    tasks = list(seqs.items())
    _print(len(tasks), " proteins for processing")

    lr, mi, fterms = tuple(metrics())
    print("%d  functions known" % len(fterms))

    # Be careful to close and serialize the persistent dictionaries
    try:
        _bfp = BayesFunctionPredictionDB(lr, mi, fterms)
        # Open a multi-processor for performing predictions on multiple
        # tasks; raise_errors=True lets worker failures propagate.
        pool = MultiProcessor(processors=opts[PROCESSORS], raise_errors=True)
        # Consume the pool generator, ignoring the results
        for _ in pool.run(_predict, tasks, result=_upload, batches=100000):
            pass
    finally:
        for metric in (lr, mi):
            if hasattr(metric, 'close'):
                metric.close()
def psiblast_fold():
    """Query PSI-Blast / fold-recognition domains and run `pdb_domain` on them.

    Uses (and lazily opens via `_connection()`) the module-global `db`
    connection, which is always closed and reset to ``None`` before the
    processing stage starts.  When ``opts[EXPERIMENT]`` is set it is
    interpolated into the query as an ``e.id in (...)`` restriction.

    Returns:
        list: the results yielded by the pool for `pdb_domain`, excluding
        any result that is an ``Exception`` instance (those are printed).
    """
    global db
    if db is None:
        db = _connection()

    try:
        cursor = db.cursor()
        try:
            print("PSI-Blast and Fold Recognition Query")
            query = """
                select distinct d.domain_sequence_key, d.parent_sequence_key,
                    d.domain_type, i.pdbId, r.chain,
                    n.parent_start, n.parent_stop, length(s.sequence)
                from hpf.experiment e
                join hpf.protein p on e.id=p.experiment_key
                join hpf.domain d on p.sequence_key=d.parent_sequence_key
                join hpf.sequence s on d.domain_sequence_key=s.id
                join hpf.pdbSeqRes r on substring(d.parent_id FROM 4)=r.sequence_key
                join hpf.domainRegion n on d.id=n.domain_key
                join hpf.pdbIndex i on r.pdb_key=i.id
                where d.domain_type in ('fold_recognition','psiblast')
                """
            if opts[EXPERIMENT]:
                # NOTE(review): the experiment filter is string-interpolated
                # into the SQL; it must only ever come from a trusted source.
                query = query + " and e.id in (%s)" % opts[EXPERIMENT]
            print(query)
            cursor.execute(query)

            tasks = [
                (domain_key, parent_key, domain_type, pdb_id, chain,
                 int(p_start), int(p_stop), seq_len)
                for (domain_key, parent_key, domain_type, pdb_id, chain,
                     p_start, p_stop, seq_len) in cursor.fetchall()
            ]

            # Cartesian product returns many chains/pdbs for the same sequence
            # Filter these to one pdb/chain per domain by hashing on
            # (domain_key, parent_key); the last row seen for a key wins.
            unique = dict(((task[0], task[1]), task) for task in tasks)
            tasks = list(unique.values())
        finally:
            cursor.close()
    finally:
        # Always release the connection and clear the global so the next
        # call reconnects, even if creating or closing the cursor failed.
        db.close()
        db = None

    # NOTE: a multiprocessing.Manager-backed shared cache (pdb / chain
    # lengths) was sketched here in commented-out code and is not active.
    pool = MultiProcessor(raise_errors=False, modulus=100, processors=6)
    print("PSI-Blast and Fold Recognition Process")
    results = []
    for result in pool.run(pdb_domain, tasks):
        # With raise_errors=False, failures can come back as Exception
        # objects; report them instead of returning them.
        if isinstance(result, Exception):
            print(result)
        else:
            results.append(result)
    return results
def main():
    """Fetch positive-LLR rows from BAYES_TABLE and process them in a pool.

    Opens a connection with `_func()`, selects every row of ``BAYES_TABLE``
    whose ``pls_llr`` is greater than zero (as dict rows), closes the
    connection, then hands the rows to a `MultiProcessor` that applies
    `format` to each task with `upload` as the third `run` argument.
    """
    func_db = _func()
    try:
        func_cursor = func_db.cursor(MySQLdb.cursors.DictCursor)
        try:
            query = """
                select b.parent_sequence_key,b.domain_sequence_key,
                    b.mf_acc as acc, b.name,
                    b.pls_llr,b.base_llr,b.type,b.timestamp
                from %s b
                where pls_llr > 0
                """ % BAYES_TABLE
            print(query)
            # The query must be passed explicitly; execute() with no
            # arguments raises a TypeError before anything is fetched.
            func_cursor.execute(query)
            tasks = func_cursor.fetchall()
        finally:
            func_cursor.close()
    finally:
        func_db.close()

    pool = MultiProcessor(processors=8, modulus=100, raise_errors=False)
    # Drain the iterable returned by run(); other callers of MultiProcessor
    # treat it as a generator, in which case nothing executes until it is
    # consumed.  Results themselves are not needed here.
    for _ in pool.run(format, tasks, upload):
        pass
def main():
    """Run `__calc_corr` over every unordered pair of superfamilies.

    Steps:
      1. Load the distinct ``acc`` values containing a ``.`` from
         ``functionTables.probability_goLite_062009`` (the superfamilies).
      2. Load the molecular-function accessions into the module global
         `_mf_acc`.
      3. Build all unordered superfamily pairs as tasks.
      4. Open the shelve at ``opts[SHELVE]``, expose it through the module
         global `_prob` (a `Metric` with default 0), and process the tasks
         with a `MultiProcessor`, passing `__upload` as the third `run`
         argument.

    The database connection, cursor and shelve are closed before returning,
    including when an error propagates.
    """
    from itertools import combinations
    global _mf_acc, _prob

    # Get all superfamilies and molecular functions
    # NOTE(review): database credentials are hard-coded here; consider
    # moving them to configuration.
    conn = MySQLdb.connect(db="functionTables", passwd="patrick_nyu")
    try:
        cursor = conn.cursor()
        try:
            # Only use superfamilies with something in the probability table
            query = """
                select distinct acc
                from functionTables.probability_goLite_062009
                where acc like '%.%'
                """
            # Alternative kept for reference:
            # query = """select distinct substring_index(sccs,'.',3)
            #     from pdb.astral95_1_75 a
            #     join functionTables.probability_golite_062009 p
            #     on substring_index(a.sccs,'.',3)=p.acc"""
            print(query)
            cursor.execute(query)
            sccs = [row[0] for row in cursor.fetchall()]
            print("%d  superfamilies" % len(sccs))

            # Only use molecular functions
            query = """
                select distinct p.acc
                from functionTables.probability_goLite_062009 p
                join mygoLite_062009.term t
                    on p.acc=t.acc
                    and t.term_type='molecular_function'
                    and t.acc!='GO:0003674'
                where p.acc2 is NULL
                order by p.metric asc
                """
            print(query)
            cursor.execute(query)
            _mf_acc = [row[0] for row in cursor.fetchall()]
            print("%d  molecular functions" % len(_mf_acc))
        finally:
            cursor.close()
    finally:
        conn.close()

    # Every unordered pair (sf1, sf2) with sf1 before sf2 in `sccs`; same
    # order as a nested i < j loop, without copying a slice per element.
    tasks = list(combinations(sccs, 2))
    print("%d  pairwise superfamilies" % len(tasks))

    print("Opening shelve")
    store = shelve.open(opts[SHELVE])
    try:
        _prob = Metric(dict=store, default=0)
        pool = MultiProcessor(8, modulus=100, raise_errors=True)
        # Consume the pool's results; only the side effects matter.
        for _ in pool.run(__calc_corr, tasks, __upload):
            pass
    finally:
        # Release the shelve once processing is finished or has failed.
        store.close()