Example #1
def main():
    from hpf.processing import MultiProcessor
    global _bfp,_db
    
    if opts[SEQUENCES]:
        f = open(opts[SEQUENCES])
        opts[SEQUENCES] = [line.strip().split()[0] for line in f if line.strip() != "" and not line.startswith("#")]
    
    seqs = defaultdict(lambda: [])
    for parent_key,domain_key in sequences(sequence_keys=opts[SEQUENCES], experiment_keys=opts[EXPERIMENT]):
        seqs[parent_key].append(domain_key)
    # The tasks are grouped by parent_sequence_key
    tasks = [(pkey,seqs[pkey]) for pkey in seqs]
    _print(len(tasks)," proteins for processing")

    
    lr,mi,fterms = tuple(metrics())
    print len(fterms), " functions known"
    
    # Be careful to close and serialize the persistent dictionaries
    try:
        _bfp = BayesFunctionPredictionDB(lr,mi, fterms)
        # Open a multi-processor for performing predictions across multiple processes
        pool = MultiProcessor(processors=opts[PROCESSORS], raise_errors=True)
        # Consume the pool generator, ignoring the results
        for r in pool.run(_predict, tasks, result=_upload, batches=100000):
            pass
    finally:
        for metric in (lr,mi):
            if hasattr(metric, 'close'):
                metric.close()
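
The worker `_predict` and the result callback `_upload` are defined elsewhere in this module, as is `hpf.processing.MultiProcessor`. As a rough sketch only, the `run(worker, tasks, result=...)` call shape used throughout these examples can be approximated with the standard multiprocessing module (the helper name `run_tasks` below is hypothetical, not part of hpf):

import multiprocessing

def run_tasks(worker, tasks, result=None, processors=4):
    # Map the worker over the tasks in a process pool; each result comes back
    # to the parent process, where the optional result callback (e.g. an
    # upload function holding a DB connection) is applied to it.
    pool = multiprocessing.Pool(processes=processors)
    try:
        for r in pool.imap_unordered(worker, tasks):
            if result is not None:
                result(r)
            yield r
    finally:
        pool.close()
        pool.join()

The real MultiProcessor also accepts modulus, batches, and raise_errors arguments (seen in the calls above), which this sketch does not model.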
Example #2
def psiblast_fold():
    global db
    if db is None:
        db = _connection()
    cursor = db.cursor()
    try:
        print "PSI-Blast and Fold Recognition Query"
        query = """
            select distinct d.domain_sequence_key, d.parent_sequence_key,
                d.domain_type, i.pdbId, r.chain, n.parent_start, n.parent_stop,
                length(s.sequence)
            from hpf.experiment e
            join hpf.protein p
            on e.id=p.experiment_key
            join hpf.domain d
            on p.sequence_key=d.parent_sequence_key
            join hpf.sequence s
            on d.domain_sequence_key=s.id
            join hpf.pdbSeqRes r
            on substring(d.parent_id FROM 4)=r.sequence_key
            join hpf.domainRegion n
            on d.id=n.domain_key
            join hpf.pdbIndex i
            on r.pdb_key=i.id
            where d.domain_type in ('fold_recognition','psiblast') 
        """

        if opts[EXPERIMENT]:
            query = query+" and e.id in (%s)" % opts[EXPERIMENT]
        print query
        cursor.execute(query)
        tasks = [(domain_key, parent_key, domain_type, pdb_id, chain,
                  int(p_start), int(p_stop), seq_len)
                 for domain_key, parent_key, domain_type, pdb_id, chain,
                     p_start, p_stop, seq_len in cursor.fetchall()]
        keys = {}
        # Cartesian product returns many chains/pdbs for the same sequence
        # Filter these to one pdb/chain per domain by hashing
        for task in tasks:
            domain_key, parent_key, domain_type, pdb_id, chain, p_start, p_stop, seq_len = task
            keys[(domain_key,parent_key)] = task    
        tasks = keys.values()
    finally:
        cursor.close()
        db.close()
        db = None
        cursor = None
    
    global pdb, manager, chain_lengths
    #manager = multiprocessing.Manager()
    #pdb = manager.dict()
    #chain_len = manager.dict()
    pool = MultiProcessor(raise_errors=False, modulus=100,processors=6)
    print "PSI-Blast and Fold Recognition Process"
    results = []
    for result in pool.run(pdb_domain, tasks):
        if isinstance(result, Exception):
            print result
        else:
            results.append(result)
    return results
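
The join against pdbSeqRes returns one row per matching chain, so the same domain can appear several times; the dictionary keyed on (domain_key, parent_key) keeps whichever row is seen last for each pair. A tiny illustration of that dedup idiom with placeholder rows:

rows = [
    ('d1', 'p1', 'psiblast', 'pdbA', 'A', 5, 120, 130),
    ('d1', 'p1', 'psiblast', 'pdbB', 'B', 5, 120, 130),  # same domain, different chain
    ('d2', 'p2', 'fold_recognition', 'pdbC', 'A', 1, 80, 90),
]
unique = {}
for row in rows:
    domain_key, parent_key = row[0], row[1]
    unique[(domain_key, parent_key)] = row
print len(unique), "unique domains"  # -> 2 unique domains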
Example #3
def main():
    func_db = _func()
    func_cursor = func_db.cursor(MySQLdb.cursors.DictCursor)
    query = """
        select b.parent_sequence_key,b.domain_sequence_key,
        b.mf_acc as acc, b.name, b.pls_llr,b.base_llr,b.type,b.timestamp 
        from %s b 
        where pls_llr > 0
        """ % BAYES_TABLE
    print query
    func_cursor.execute(query)
    tasks = func_cursor.fetchall()
    func_cursor.close()
    func_db.close()
    pool = MultiProcessor(processors=8, modulus=100, raise_errors=False)
    pool.run(format, tasks, upload)
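
`format` and `upload` are module-level helpers not shown here. Judging from the DictCursor rows the query produces and the pool.run(format, tasks, upload) call, they plausibly follow the worker/result-callback split sketched below; the field handling and the target table name are assumptions, not the actual hpf code:

def format(row):
    # Worker (child process): turn one DictCursor row into an upload-ready tuple.
    return (row['parent_sequence_key'], row['domain_sequence_key'], row['acc'],
            row['name'], float(row['pls_llr']), float(row['base_llr']), row['type'])

def upload(record):
    # Result callback (parent process): write with its own connection.
    db = _func()
    cursor = db.cursor()
    try:
        cursor.execute(
            "insert into hypothetical_target_table values (%s,%s,%s,%s,%s,%s,%s)",
            record)
        db.commit()
    finally:
        cursor.close()
        db.close()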
Example #4
def main():
    # Get all superfamilies and molecular functions
    global _mf_acc
    with MySQLdb.connect(db="functionTables",passwd="patrick_nyu") as cursor:
        # Only use superfamilies with something in the probability table
        query = """
            select distinct acc 
            from functionTables.probability_goLite_062009 
            where acc like '%.%'
            """
#        query = """select distinct substring_index(sccs,'.',3) 
#            from pdb.astral95_1_75 a 
#            join functionTables.probability_golite_062009 p
#            on substring_index(a.sccs,'.',3)=p.acc"""
        print query
        cursor.execute(query)
        sccs = [t[0] for t in cursor.fetchall()]
        print len(sccs)," superfamilies"
        # Only use molecular functions
        query = """
            select distinct p.acc 
            from functionTables.probability_goLite_062009 p 
            join mygoLite_062009.term t 
            on p.acc=t.acc and t.term_type='molecular_function' and t.acc!='GO:0003674'
            where p.acc2 is NULL
            order by p.metric asc
            """
        print query
        cursor.execute(query)
        _mf_acc = [t[0] for t in cursor.fetchall()]
        print len(_mf_acc)," molecular functions"

    tasks=[]
    for i,sf1 in enumerate(sccs):
        for sf2 in sccs[i+1:]:
            tasks.append((sf1,sf2))
    print len(tasks)," pairwise superfamilies"
            
    global _prob
    print "Opening shelve"
    shelf = shelve.open(opts[SHELVE])
    _prob = Metric(dict=shelf, default=0)
    pool = MultiProcessor(8, modulus=100, raise_errors=True)
    for r in pool.run(__calc_corr, tasks, __upload):
        pass
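
The nested loop over sccs enumerates each unordered superfamily pair exactly once; the same task list can be built more compactly with the standard library:

from itertools import combinations

tasks = list(combinations(sccs, 2))
print len(tasks), " pairwise superfamilies"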