def main(qbed, sbed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """Main runner for finding CNSs; writes one CSV row per pair to stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  Each pair is
    turned into a shell command by ``get_cmd``, the commands of one batch are
    run in parallel through the pool, and every non-empty output is handed to
    ``parse_blast``.

    qbed, sbed  -- query / subject bed objects (``.filename`` is compared to
                   decide whether the masked fastas can be shared).
    pairs_file  -- pairs file, read through ``get_pair``.
    qpad, spad  -- padding values forwarded to ``get_cmd`` and ``parse_blast``.
    pair_fmt    -- pair format name forwarded to ``get_pair``.
    blast_path  -- forwarded to ``determine_blast_type`` together with mask.
    mask        -- mask flag for the blast command template.
    ncpu        -- pool size and number of pairs fetched per batch.

    Always returns None.  Progress ("qaccn saccn (count)") goes to stderr.
    """
    pool = Pool(ncpu)
    bl2seq = determine_blast_type(blast_path, mask)

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_iter = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    pairs = [True]
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        # Call get_cmd once per real pair.  The previous multi-list map()
        # depended on Python 2 padding the shorter (filtered) pair list with
        # None to reach the same result.
        candidates = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                      for pair in pairs if pair]
        cmds = [c for c in candidates if c]

        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the count printed below ends this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,%s,%s" % (
                qfeat['seqid'], qname, sfeat['seqid'], sname,
                ",".join(",".join(map(str, l)) for l in cnss))

    return None
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt,blast_path, mask='F', ncpu=8):
    """Main runner for finding CNSs; writes one CSV row per pair to stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  Each pair is
    turned into a bl2seq shell command by ``get_cmd``, the commands of one
    batch are run in parallel through the pool, and every non-empty output is
    handed to ``parse_blast``.

    qbed, sbed      -- query / subject bed objects (``.filename`` is compared
                       to decide whether the masked fastas can be shared).
    pairs_file      -- pairs file, read through ``get_pair``.
    qpad, spad      -- padding values forwarded to ``get_cmd``/``parse_blast``.
    unmasked_fasta  -- forwarded unchanged to ``parse_blast``.
    pair_fmt        -- pair format name forwarded to ``get_pair``.
    blast_path      -- executable placed at the front of the command template.
    mask            -- value substituted for the ``-F`` option.
    ncpu            -- pool size and number of pairs fetched per batch.

    Always returns None.  Progress ("qaccn saccn (count)") goes to stderr.
    """
    pool = Pool(ncpu)

    # Only blast_path and mask are filled in here; the %(name)s fields are
    # substituted later, per pair.
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,bitscore...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pair_iter = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def next_pair():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return pair_iter.next()
        except StopIteration:
            return None

    pairs = [True]
    while any(pairs):
        pairs = [next_pair() for _ in range(ncpu)]

        # Call get_cmd once per real pair.  The previous multi-list map()
        # depended on Python 2 padding the shorter (filtered) pair list with
        # None to reach the same result.
        candidates = [get_cmd(pair, bl2seq, qfastas, sfastas, qpad, spad)
                      for pair in pairs if pair]
        cmds = [c for c in candidates if c]

        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the count printed below ends this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,%s,%s" % (
                qfeat['seqid'], qname, sfeat['seqid'], sname,
                ",".join(",".join(map(str, l)) for l in cnss))

    return None
def main(cns_bed, ortho_bed, pairs_file, qpad, spad, blast_path, ncpu, mask):
    """Blast each (qfeat, sfeat) pair and print the intervals found.

    Input is a CNS bed, an ortholog bed and a pairs file.  Pairs are read in
    'pck' format through ``get_pair`` in batches of ``ncpu``.  For every pair
    a bl2seq command is built over the padded feature coordinates, the batch
    is run in parallel, and ``parse_blast`` turns each non-empty output into
    an interval.  A "start,end,interval" row is written to stdout whenever
    the interval is non-empty.

    cns_bed, ortho_bed -- bed objects passed to ``get_fastas`` and ``get_pair``.
    pairs_file         -- pairs file, read through ``get_pair``.
    qpad, spad         -- padding added on both sides of the query / subject
                          feature (the start is clamped at 1).
    blast_path         -- executable placed at the front of the command.
    ncpu               -- pool size and number of pairs fetched per batch.
    mask               -- value substituted for the ``-F`` option.
    """
    pool = Pool(ncpu)
    fcns = sys.stdout
    print >> fcns, "#cns_start, cns_stop, interval"

    # Only blast_path and mask are filled in here; the %(name)s fields are
    # substituted per pair in get_cmd.
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = get_fastas(cns_bed, True)
    sfastas = get_fastas(ortho_bed, False)

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file ,'pck', ortho_bed, cns_bed)

    def get_pair_gen():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        """Return (command, qfeat) for one pair, or None for a missing pair."""
        # The None check has to come before unpacking, otherwise it is dead.
        if pair is None:
            return None
        qfeat, sfeat = pair
        sfasta = sfastas[sfeat['seqid']]
        qfasta = qfastas[qfeat['seqid']]

        qstart, qstop = max(qfeat['start'] - qpad, 1), qfeat['end'] + qpad
        sstart, sstop = max(sfeat['start'] - spad, 1), sfeat['end'] + spad

        m = qstop - qstart
        n = sstop - sstart

        e_value = m*n*(2**(-28.51974))  # bit score above 15/15 noise
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta,
                            qstart=qstart, sstart=sstart,
                            qstop=qstop, sstop=sstop,
                            e_value=e_value)
        return cmd, qfeat

    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]

        cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat) in zip(results, cmds):
            if not res.strip():
                continue
            interval = parse_blast(res, qfeat)
            if len(interval) >= 1:
                print >> fcns, '{0},{1},{2}'.format(qfeat['start'], qfeat['end'], interval)
def run(): import random from processing import Pool p=Pool(2) print "ahora vemos si escala" numero = 5000000 a = [random.randint(0, 100) for a in xrange(0, numero)] print "Ya tenemos los numeros" lista = [a[:numero/2] , a[numero/2:]] print "Lista bisectada" result=p.mapAsync(my_sort, lista) print "threads lanzados" lista1, lista2 = result.get() print "Uniendo listas" b = my_merge(lista1, lista2) # b = my_sort(a) print "largo", len(b), "llamadas a my_sort"
def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt, mask='F', ncpu=8):
    """Main runner for finding CNSs; writes one CSV row per pair to stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``.  For every pair
    a bl2seq command is built over the padded feature coordinates, the batch
    is run in parallel through the pool, and every non-empty output is handed
    to ``parse_blast``.

    qbed, sbed      -- query / subject bed objects (``.filename`` is compared
                       to decide whether the masked fastas can be shared).
    pairs_file      -- pairs file, read through ``get_pair``.
    qpad, spad      -- padding added on both sides of the query / subject
                       feature (the start is clamped at 1).
    unmasked_fasta  -- forwarded unchanged to ``parse_blast``.
    pair_fmt        -- pair format name forwarded to ``get_pair``.
    mask            -- value substituted for the ``-F`` option.
    ncpu            -- pool size and number of pairs fetched per batch.

    Always returns None.  Progress ("qaccn saccn (count)") goes to stderr.
    """
    # Size the pool from the argument rather than the module-level
    # ``options`` object, so it matches the batch size used below and the
    # function does not depend on a global.
    pool = Pool(ncpu)

    # Only mask is filled in here; the %(name)s fields are substituted per
    # pair in get_cmd.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send...]"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def get_pair_gen():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        """Return (command, qfeat, sfeat) for one pair, or None."""
        if pair is None:
            return None
        qfeat, sfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        qstart, qstop = max(qfeat['start'] - qpad, 1), qfeat['end'] + qpad
        sstart, sstop = max(sfeat['start'] - spad, 1), sfeat['end'] + spad

        assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

        m = qstop - qstart
        n = sstop - sstart

        e_value = m*n*(2**(-28.51974))  # bit score above 15/15 noise
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta,
                            qstart=qstart, sstart=sstart,
                            qstop=qstop, sstop=sstop,
                            e_value=e_value)
        return cmd, qfeat, sfeat

    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]

        cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the count printed below ends this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad, spad, unmasked_fasta)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,%s,%s" % (
                qfeat['seqid'], qname, sfeat['seqid'], sname,
                ",".join(",".join(map(str, l)) for l in cnss))

    return None
def test():
    """Exercise a process Pool end to end and print the results.

    Runs, in order: ordered/unordered mapping calls, two timing comparisons
    of the builtin map() against pool.map()/pool.imap(), error propagation,
    timeouts, callbacks, and finally close(), terminate() and garbage
    collection of a pool.  Failures surface as AssertionError.

    Relies on helpers that are not defined in this function (mul, plus,
    calculate, calculatestar, pow3, noop, f, cpuCount, TimeoutError).
    """
    print 'cpuCount() = %d\n' % cpuCount()

    #
    # Create pool
    #

    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    # Each task is a (function, argument-tuple) pair.
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    # All three submissions are started before any result is consumed.
    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print

    #
    # Simple benchmarks
    #

    N = 100000
    print 'def pow3(x): return x**3'

    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N // 8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t)

    # The serial and both pooled runs must produce the same list.
    assert A == B == C, (len(A), len(B), len(C))
    print

    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'

    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L) // 8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print

    # Drop the large benchmark lists before the remaining checks.
    del A, B, C, L

    #
    # Test error handling
    #

    # NOTE(review): f is defined elsewhere; the checks below expect it to
    # raise ZeroDivisionError for the argument 5 -- confirm.
    print 'Testing error handling:'

    try:
        print pool.apply(f, (5, ))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            # Item 5 must not come back as a normal result.
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'

    # The loop must have consumed all ten items without an early break.
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print

    #
    # Testing timeouts
    #

    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            # One dot per timed-out poll.
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    #
    # Testing callback
    #

    print 'Testing callback:'

    # A is filled by the two callbacks below and compared against B.
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # A task submitted before close() must still deliver its result.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #

    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Ten 10-second sleeps are queued; terminate() is called without
    # waiting for any of them.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #

    print 'Testing garbage collection:'

    pool = Pool(2)
    # Keep a reference to the worker list so it can be checked after the
    # pool itself has been deleted.
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)

    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
import os, shutil, subprocess, zipfile, random from processing import Pool from zipfile import ZipFile from config import * def unzip(params): base, ext = params zf = zipfile.ZipFile(rawDir + '/' + base + '.' + ext, 'r') for name in zf.namelist(): if name != 'PIC/': print name zf.extract(name, extractDir) files = [] for f in os.listdir(rawDir): if (f.endswith('exe')): base, ext = f.split('.') if (len(base) == 3): files.append((base, ext)) p = Pool() p.map(unzip, files)
def main(qbed, sbed, pairs_file, pair_fmt, mask='F', ncpu=8):
    """Main runner for finding CNSs; writes one CSV row per pair to stdout.

    Pairs are pulled from ``get_pair`` in batches of ``ncpu``; each pair is
    unpacked as (sfeat, qfeat).  The subject feature is blasted unpadded
    against the query window returned by ``grab_flanking_region``, the batch
    is run in parallel through the pool, and every non-empty output is handed
    to ``parse_blast``.

    qbed, sbed  -- query / subject bed objects (``.filename`` is compared to
                   decide whether the masked fastas can be shared).
    pairs_file  -- pairs file, read through ``get_pair``.
    pair_fmt    -- pair format name forwarded to ``get_pair``.
    mask        -- value substituted for the ``-F`` option.
    ncpu        -- pool size and number of pairs fetched per batch.

    Always returns None.  Progress ("qaccn saccn (count)") goes to stderr.
    """
    # Size the pool from the argument rather than the module-level
    # ``options`` object, so it matches the batch size used below and the
    # function does not depend on a global.
    pool = Pool(ncpu)

    # Only mask is filled in here; the %(name)s fields are substituted per
    # pair in get_cmd.
    bl2seq = "~/src/blast-2.2.25/bin/bl2seq " \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    fcnss = sys.stdout
    print >> fcnss, "#qaccn,qseqid,saccn,[sleft_gene,sright_gene],sseqid,res"

    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file , pair_fmt, sbed, qbed)

    def get_pair_gen():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        """Return (command, qfeat, sfeat) for one pair, or None."""
        if pair is None:
            return None
        sfeat, qfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        sstart, sstop = sfeat['start'], sfeat['end']  # region gets no padding
        # sfeat here is the final table with sfeat info from qfeat dict
        qstart, qstop = grab_flanking_region(qfeat, sfeat)

        m = sstop - sstart
        n = qstop - qstart

        e_value = m*n*(2**(-28.51974))  # bit score above 15/15 noise
        assert e_value > 0

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta,
                            qstart=qstart, sstart=sstart,
                            qstop=qstop, sstop=sstop,
                            e_value=e_value)
        return cmd, qfeat, sfeat

    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]

        cmds = [c for c in map(get_cmd, [l for l in pairs if l]) if c]
        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the count printed below ends this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat["accn"]),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,[%s,%s],%s,%s" % (
                qname, qfeat['seqid'], sname,
                sfeat['qleft_gene'], sfeat['qright_gene'], sfeat['seqid'],
                ",".join(",".join(map(str, l)) for l in cnss))

    return None
def main(qbed, sbed,cns_bed, pairs_file, qpad, spad, pair_fmt, blast_path, mask='F', ncpu=8):
    """Blast every (cns, subject) pair and write the hits to stdout as CSV.

    Pairs come from ``get_pair(pairs_file, pair_fmt, cns_bed, sbed)`` and are
    handled ``ncpu`` at a time: one bl2seq command per pair (fixed e-value of
    30, coordinates padded by qpad / spad), run in parallel through the pool,
    then parsed by ``parse_blast``.  Progress is reported on stderr.
    Always returns None.
    """
    workers = Pool(ncpu)

    # blast_path and mask are substituted now, the named fields per pair.
    template = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    out = sys.stdout
    print >> out, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send,evalue...]"

    qfastas = get_masked_fastas(qbed)
    if qbed.filename != sbed.filename:
        sfastas = get_masked_fastas(sbed)
    else:
        sfastas = qfastas

    pair_source = get_pair(pairs_file, pair_fmt, cns_bed, sbed)

    def fetch_pair():
        # An exhausted source is reported as None instead of raising.
        try:
            return pair_source.next()
        except StopIteration:
            return None

    def build_job(pair):
        # Returns (command, qfeat, sfeat), or None when there is no pair.
        if pair is None:
            return None
        qfeat, sfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        qstart = max(qfeat['start'] - qpad, 1)
        qstop = qfeat['end'] + qpad
        sstart = max(sfeat['start'] - spad, 1)
        sstop = sfeat['end'] + spad

        assert qstop - qstart > 2 * qpad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * spad or sstart == 1, (sstop, sstart)

        fields = dict(qfasta=qfasta, sfasta=sfasta,
                      qstart=qstart, sstart=sstart,
                      qstop=qstop, sstop=sstop,
                      e_value=30)
        return template % fields, qfeat, sfeat

    batch = [True]
    while any(batch):
        batch = [fetch_pair() for _ in range(ncpu)]

        jobs = [job for job in (build_job(p) for p in batch if p) if job]
        outputs = workers.map(commands.getoutput, [job[0] for job in jobs])

        for text, (_cmd, qfeat, sfeat) in zip(outputs, jobs):
            if not text.strip():
                continue

            # Trailing comma: the count printed below ends this stderr line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1

            cnss = parse_blast(text, orient, qfeat, sfeat, cns_bed, sbed, qpad, spad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            hit_fields = ",".join(",".join(map(str, hit)) for hit in cnss)
            print >> out, "%s,%s,%s,%s,%s" % (
                qfeat['seqid'], qfeat['accn'],
                sfeat['seqid'], sfeat['accn'], hit_fields)

    return None
def main(cns_file,qdups_path,sdups_path,pair_file,fmt,qbed,sbed,qpad,spad,blast_path,mask='F',ncpu=8):
    """Blast the local duplicates of each parent pair and rewrite the dup files.

    For every (qparent, sparent) pair returned by get_pairs that skip_pair
    does not reject, all duplicate combinations from get_dups are blasted in
    batches of ncpu and summarised in cnss_size.  Pairs whose parent is in
    rdups are collected in rdups_dic and resolved afterwards through
    best_repeats; all other pairs are written immediately with
    write_new_dups.  Finally the nolocaldups bed files and the filtered
    pairs file are written.

    NOTE(review): the nesting below was reconstructed from a source whose
    line breaks were lost -- confirm against the original file.
    """
    pool = Pool(ncpu)
    # blast_path and mask are substituted now; the %(name)s fields are left
    # in the template for later substitution.
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
        -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "
    qfastas = get_masked_fastas(qbed)
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    # qnolocaldups_path / snolocaldups_path are not used again in this function.
    qnolocaldups_path = qbed.path.split(".")[0] + ".all.nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".all.nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".all.localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".all.localdups"
    # The four inputs are passed through make_copy_of_file; the results are
    # what write_new_dups receives below.
    npair_file,nqlocaldups,nslocaldups, ncns_file = map(make_copy_of_file,[pair_file,qlocaldups_path,slocaldups_path,cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups,rdups = get_pairs(pair_file,fmt,qdups,sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups,qdups,sdups)

    # rdups_dic maps a parent found in rdups to {(qparent, sparent): cnss_size}.
    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent,sparent) for qparent,sparent in dups if qparent in rdups and sparent in rdups]
    for (qparent,sparent) in dups:
        if skip_pair(qparent,sparent,rdups,rdups_both,ldups):continue
        # One tuple per blasted duplicate combination of this parent pair.
        cnss_size = []
        qfeat_dups = get_all_dups(qdups,qparent)
        sfeat_dups = get_all_dups(sdups,sparent)
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups,sfeat_dups,qbed,sbed)
        def get_dups_gen():
            # None marks an exhausted generator so a batch always has ncpu slots.
            try: return _get_dups_gen.next()
            except StopIteration: return None
        while any(pairs):
            # cnss_dups is not used again in this function.
            cnss_dups = []
            pairs = [get_dups_gen() for i in range(ncpu)]
            ###this is for parellization#########
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            # NOTE(review): the filtered pair list can be shorter than the
            # *_map lists; Python 2 map() pads it with None, so get_cmd is
            # presumably expected to return None for a None pair -- confirm.
            cmds = [c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,qfastas_map,sfastas_map,qpad_map,spad_map) if c]
            # Run the shell commands of this batch in parallel; order is kept.
            results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                # Unlike an empty-output skip, an empty result is recorded
                # here as zero cnss.
                if not res.strip(): cnss = []
                else: cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad,spad)
                print >>sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str,l)),cnss))
                # The count is negated so that an ascending sort puts the
                # combination with the most cnss first.
                cnss_size.append((len(cnss)*-1,qfeat["start"],sfeat["start"],qfeat["accn"],sfeat["accn"],cnss_fmt))
            # Continue only while the last slot of the batch was filled.
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            # A pair seen twice for the same parent is logged, then overwritten.
            if (qparent,sparent) in rdups_dic[qparent].keys(): logging.info((qparent,sparent))
            rdups_dic[qparent].update({(qparent,sparent):cnss_size})
        elif sparent in rdups:
            if (qparent,sparent) in rdups_dic[sparent].keys(): logging.info((qparent,sparent))
            rdups_dic[sparent].update({(qparent,sparent):cnss_size})
        else:
            cnss_size.sort()
            # NOTE(review): cnss_size[0] raises IndexError when no command
            # was produced for this parent pair -- confirm this cannot happen.
            cns_number,qfeat_start,sfeat_start,qaccn,saccn,largest_cnss = cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >>sys.stderr, "FINAL: {0},{1},{2}".format(qaccn,saccn,cns_number)
            write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,cnss_size,qparent,sparent,qfeat,sfeat,qdups,sdups)

    # Resolve the parent pairs that were deferred into rdups_dic above.
    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        #print dparents
        qparent,sparent = dparents
        #print parents,best_reps[parents]
        ### one or list? cnss[0]?
        cns_number,qfeat_start, sfeat_start,qaccn,saccn,largest_cnss = best_reps[dparents]
        qfeat= qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,[best_reps[dparents]],qparent,sparent,qfeat,sfeat,qdups,sdups)

    write_nolocaldups(qbed.path,nqlocaldups,"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(sbed.path,nslocaldups,"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    # NOTE(review): this reads the module-level ``options.pairs`` rather than
    # the pair_file argument -- presumably the same path; confirm.
    pairs_to_qa(npair_file,'pair',"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]),"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
import os, shutil, subprocess, zipfile, random from processing import Pool from zipfile import ZipFile from config import * def unzip(params): base, ext = params zf = zipfile.ZipFile(rawDir+'/'+base+'.'+ext, 'r') for name in zf.namelist(): if name != 'PIC/': print name zf.extract(name, extractDir) files = [] for f in os.listdir(rawDir): if(f.endswith('exe')): base, ext = f.split('.') if(len(base) == 3): files.append((base,ext)) p=Pool() p.map(unzip, files)
def test():
    """Exercise a process Pool end to end and print the results.

    Runs, in order: ordered/unordered mapping calls, two timing comparisons
    of the builtin map() against pool.map()/pool.imap(), error propagation,
    timeouts, callbacks, and finally close(), terminate() and garbage
    collection of a pool.  Failures surface as AssertionError.

    Relies on helpers that are not defined in this function (mul, plus,
    calculate, calculatestar, pow3, noop, f, cpuCount, TimeoutError).
    """
    print 'cpuCount() = %d\n' % cpuCount()

    #
    # Create pool
    #

    PROCESSES = 4
    print 'Creating pool with %d processes\n' % PROCESSES
    pool = Pool(PROCESSES)

    #
    # Tests
    #

    # Each task is a (function, argument-tuple) pair.
    TASKS = [(mul, (i, 7)) for i in range(10)] + \
            [(plus, (i, 8)) for i in range(10)]

    # All three submissions are started before any result is consumed.
    results = [pool.applyAsync(calculate, t) for t in TASKS]
    imap_it = pool.imap(calculatestar, TASKS)
    imap_unordered_it = pool.imapUnordered(calculatestar, TASKS)

    print 'Ordered results using pool.applyAsync():'
    for r in results:
        print '\t', r.get()
    print

    print 'Ordered results using pool.imap():'
    for x in imap_it:
        print '\t', x
    print

    print 'Unordered results using pool.imapUnordered():'
    for x in imap_unordered_it:
        print '\t', x
    print

    print 'Ordered results using pool.map() --- will block till complete:'
    for x in pool.map(calculatestar, TASKS):
        print '\t', x
    print

    #
    # Simple benchmarks
    #

    N = 100000
    print 'def pow3(x): return x**3'

    t = time.time()
    A = map(pow3, xrange(N))
    print '\tmap(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    B = pool.map(pow3, xrange(N))
    print '\tpool.map(pow3, xrange(%d)):\n\t\t%s seconds' % \
          (N, time.time() - t)

    t = time.time()
    C = list(pool.imap(pow3, xrange(N), chunksize=N//8))
    print '\tlist(pool.imap(pow3, xrange(%d), chunksize=%d)):\n\t\t%s' \
          ' seconds' % (N, N//8, time.time() - t)

    # The serial and both pooled runs must produce the same list.
    assert A == B == C, (len(A), len(B), len(C))
    print

    L = [None] * 1000000
    print 'def noop(x): pass'
    print 'L = [None] * 1000000'

    t = time.time()
    A = map(noop, L)
    print '\tmap(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    B = pool.map(noop, L)
    print '\tpool.map(noop, L):\n\t\t%s seconds' % \
          (time.time() - t)

    t = time.time()
    C = list(pool.imap(noop, L, chunksize=len(L)//8))
    print '\tlist(pool.imap(noop, L, chunksize=%d)):\n\t\t%s seconds' % \
          (len(L)//8, time.time() - t)

    assert A == B == C, (len(A), len(B), len(C))
    print

    # Drop the large benchmark lists before the remaining checks.
    del A, B, C, L

    #
    # Test error handling
    #

    # NOTE(review): f is defined elsewhere; the checks below expect it to
    # raise ZeroDivisionError for the argument 5 -- confirm.
    print 'Testing error handling:'

    try:
        print pool.apply(f, (5,))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.apply()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print pool.map(f, range(10))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from pool.map()'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    try:
        print list(pool.imap(f, range(10)))
    except ZeroDivisionError:
        print '\tGot ZeroDivisionError as expected from list(pool.imap())'
    else:
        raise AssertionError, 'expected ZeroDivisionError'

    it = pool.imap(f, range(10))
    for i in range(10):
        try:
            x = it.next()
        except ZeroDivisionError:
            if i == 5:
                pass
        except StopIteration:
            break
        else:
            # Item 5 must not come back as a normal result.
            if i == 5:
                raise AssertionError, 'expected ZeroDivisionError'

    # The loop must have consumed all ten items without an early break.
    assert i == 9
    print '\tGot ZeroDivisionError as expected from IMapIterator.next()'
    print

    #
    # Testing timeouts
    #

    print 'Testing ApplyResult.get() with timeout:',
    res = pool.applyAsync(calculate, TASKS[0])
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % res.get(0.02))
            break
        except TimeoutError:
            # One dot per timed-out poll.
            sys.stdout.write('.')
    print
    print

    print 'Testing IMapIterator.next() with timeout:',
    it = pool.imap(calculatestar, TASKS)
    while 1:
        sys.stdout.flush()
        try:
            sys.stdout.write('\n\t%s' % it.next(0.02))
        except StopIteration:
            break
        except TimeoutError:
            sys.stdout.write('.')
    print
    print

    #
    # Testing callback
    #

    print 'Testing callback:'

    # A is filled by the two callbacks below and compared against B.
    A = []
    B = [56, 0, 1, 8, 27, 64, 125, 216, 343, 512, 729]

    r = pool.applyAsync(mul, (7, 8), callback=A.append)
    r.wait()

    r = pool.mapAsync(pow3, range(10), callback=A.extend)
    r.wait()

    if A == B:
        print '\tcallbacks succeeded\n'
    else:
        print '\t*** callbacks failed\n\t\t%s != %s\n' % (A, B)

    #
    # Check there are no outstanding tasks
    #

    assert not pool._cache, 'cache = %r' % pool._cache

    #
    # Check close() methods
    #

    print 'Testing close():'

    for worker in pool._pool:
        assert worker.isAlive()

    # A task submitted before close() must still deliver its result.
    result = pool.applyAsync(time.sleep, [0.5])
    pool.close()
    pool.join()

    assert result.get() is None

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tclose() succeeded\n'

    #
    # Check terminate() method
    #

    print 'Testing terminate():'

    pool = Pool(2)
    ignore = pool.apply(pow3, [2])
    # Ten 10-second sleeps are queued; terminate() is called without
    # waiting for any of them.
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]
    pool.terminate()
    pool.join()

    for worker in pool._pool:
        assert not worker.isAlive()

    print '\tterminate() succeeded\n'

    #
    # Check garbage collection
    #

    print 'Testing garbage collection:'

    pool = Pool(2)
    # Keep a reference to the worker list so it can be checked after the
    # pool itself has been deleted.
    processes = pool._pool

    ignore = pool.apply(pow3, [2])
    results = [pool.applyAsync(time.sleep, [10]) for i in range(10)]

    del results, pool

    time.sleep(0.2)

    for worker in processes:
        assert not worker.isAlive()

    print '\tgarbage collection succeeded\n'
def main(qbed,sbed,missed_pairs, ncpu):
    """Run blastn on missed (hit, gene) pairs and print one CSV row per hit.

    Pairs are read in 'pair' format from the file returned by
    ``get_pairs_file(missed_pairs)`` and handled ``ncpu`` at a time.  Each hit,
    widened by 3000 on both sides, is blasted against the whole gene.  The
    blast output is grouped by ``group_cds``, crossing hits are removed, and
    the resulting hit locations are stored in ``hit['locs']`` before
    ``find_orf`` and ``protein_parse`` compute the remaining columns.

    qbed, sbed    -- bed objects for the hit side / gene side.
    missed_pairs  -- argument for ``get_pairs_file``.
    ncpu          -- pool size and batch size; converted with int().
    """
    ncpu = int(ncpu)
    pool = Pool(ncpu)
    pairs_file = get_pairs_file(missed_pairs)

    print >>sys.stdout, "#hit,ref_gene,blastn_introns,blastx_hits, blastx_gene_hits, blastx_frame, blastn_gaps, blastx_gaps,orf_perdiction,orf_blastx,frame_shift"

    blastn = "/Users/gturco/blast-2.2.25/bin/bl2seq -p blastn -G 5 -E 2 -W 7 -q -2 -e 0.001 -D 1 -i {0} -j {1} -I {2},{3} -J {4},{5} | grep -v '#' | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = split_fastas(qbed)  # MASK CODING
    sfastas = get_mask_non_cds(sbed)  # mask noncoding

    pairs = [True]
    _get_pair_gen = get_pair(pairs_file,"pair", qbed,sbed)

    def get_pair_gen():
        # None marks an exhausted generator so a batch always has ncpu slots.
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    def get_blastn_cmd(pair):
        """creates the dictionary values used to fill in blast cmd"""
        if pair is None:
            return None
        hit, gene = pair
        flank = 3000
        # Clamp the padded start at 1.  abs(3000 - start) produced a start
        # to the right of the hit whenever start < 3000 (so the window could
        # miss the hit entirely) and 0 when start == 3000.
        # double check fasta to make sure i dont need to add or remove one
        hstart, hstop = max(hit['start'] - flank, 1), (flank + hit['end'])
        gstart, gstop = gene['start'], gene['end']  # checks the entire gene...
        query_file = qfastas[hit['seqid']]
        subject_file = sfastas[gene['seqid']]
        blastn_cmd = blastn.format(query_file, subject_file, hstart, hstop, gstart, gstop)
        return blastn_cmd, hit, gene

    while any(pairs):
        pairs = [get_pair_gen() for i in range(ncpu)]

        cmds = [c for c in map(get_blastn_cmd, [l for l in pairs if l]) if c]
        # Run the shell commands of this batch in parallel; order is kept.
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, hit, gene) in zip(results, cmds):
            print >>sys.stderr, "CMD: {0},{1}".format(gene['accn'],hit['accn'])
            d, no_res = group_cds(res, gene)
            gap_list = []
            intron_list = []
            # Reset before the early exit so a skipped hit carries no locs.
            hit['locs'] = []
            if no_res == True:
                continue

            for group_key in d.keys():
                exon_hits = d[group_key]
                non_crossing = remove_crossing_hits(exon_hits, hit, gene)
                if len(non_crossing) > 1:
                    gaps, hstart, hend = bites(non_crossing)
                    gap_list.append(sum(gaps))
                elif len(non_crossing) == 1:
                    [(hstart, hend, sstart, send, evalue)] = non_crossing
                # hstart / hend are only defined when at least one hit survived.
                if len(non_crossing) >= 1:
                    intron_list.append(group_key[0])
                    hit['locs'].append((hstart, hend))
            hit['locs'].sort()

            if len(hit['locs']) < 1:
                continue

            orf_prediction = find_orf(qbed, hit)
            introns = "{0}/{1}".format(len(intron_list), len(gene['locs']))
            gap_totaln = sum(gap_list)
            # new hit locs made from blastn res
            hit_percent, gene_percent, frame_percent, frame_shift, best_frame, gap_total, orf_start = protein_parse(hit, gene, sbed, qbed)
            orf_start = abs(min(hit['locs'][0]) + int(orf_start))
            w = "{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10}".format(
                hit['accn'], gene['accn'], introns, hit_percent, gene_percent,
                frame_percent, gap_totaln, gap_total, orf_prediction,
                orf_start, frame_shift)
            print >>sys.stdout, w
# NOTE(review): this reads as two pieces pasted together -- presumably the
# contents of the ``timings`` module imported further down, followed by an
# IPython session that benchmarks it (the %timeit lines are IPython magics,
# not Python statements) -- confirm.
import numpy as np
import math

def f(x):
    # Pure-Python workload: print the argument, then call math.exp once per
    # element of a ten-million-item list.  The result is discarded.
    print x
    y = [1]*10000000
    [math.exp(i) for i in y]

def g(x):
    # NumPy workload: print the argument, then apply np.exp to a
    # ten-million-element array in a single call.  The result is discarded.
    print x
    y = np.ones(10000000)
    np.exp(y)

from handythread import foreach
from processing import Pool
from timings import f,g

def fornorm(f,l):
    # Serial baseline: call f on every item of l, one after another.
    for i in l:
        f(i)

# Serial loop versus two threads (foreach) versus two processes (Pool.map).
%timeit fornorm(f,range(10))
%timeit foreach(g,range(100),threads=2)
%timeit foreach(f,range(10),threads=2)
p = Pool(2)
%timeit p.map(g,range(100))
%timeit p.map(f,range(100))
def main(qbed, sbed, pairs_file, pad, pair_fmt, mask='F', ncpu=8):
    """Main runner for finding cnss.

    Pulls (query, subject) feature pairs from ``get_pair``, builds one
    bl2seq shell command per pair, runs the commands ``ncpu`` at a time in
    a process pool, hands each non-empty output to ``parse_blast`` and
    writes one CSV line per pair that has at least one cns to stdout.
    Progress ("qaccn saccn (count)") goes to stderr.

    Parameters:
        qbed, sbed: query / subject bed objects; passed to
            ``get_masked_fastas``, ``get_pair`` and ``parse_blast``, and
            their ``filename`` attributes are compared.
        pairs_file, pair_fmt: forwarded unchanged to ``get_pair``.
        pad: number of bases added on each side of a feature to form the
            blast window (the window start is clamped at 1).
        mask: value substituted for bl2seq's ``-F`` option.
        ncpu: pool size and number of pairs processed per batch.

    Returns:
        None.

    Raises:
        AssertionError: if a padded window is not longer than ``2 * pad``
            and does not start at position 1.
    """
    # Size the pool from the ``ncpu`` argument; the batches below are
    # already sized with it, so a module-level ``options`` must not be
    # consulted here.
    pool = Pool(ncpu)

    # ``mask`` is substituted now; the %(name)s fields are filled per pair
    # in get_cmd.  The command is run through a shell, so the exact amount
    # of whitespace between the options is irrelevant.
    bl2seq = ("/usr/bin/bl2seq "
              "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask) + (
              " -Y 812045000 -d 26195 -e 2.11 -i %(qfasta)s -j %(sfasta)s"
              " -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#'"
              " | grep -v 'WARNING' | grep -v 'ERROR' ")

    fcnss = sys.stdout
    print >> fcnss, "#qseqid,qaccn,sseqid,saccn,[qstart,qend,sstart,send...]"

    qfastas = get_masked_fastas(qbed)
    # When both sides come from the same bed file, reuse the query fastas.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    _get_pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

    def get_pair_gen():
        # Return the next pair, or None once the generator is exhausted, so
        # a fixed-size batch can be drawn without special-casing the end.
        try:
            return _get_pair_gen.next()
        except StopIteration:
            return None

    def get_cmd(pair):
        # Build (shell command, qfeat, sfeat) for one pair; None for None.
        if pair is None:
            return None
        qfeat, sfeat = pair

        qfasta = qfastas[qfeat['seqid']]
        sfasta = sfastas[sfeat['seqid']]

        qstart, qstop = max(qfeat['start'] - pad, 1), qfeat['end'] + pad
        sstart, sstop = max(sfeat['start'] - pad, 1), sfeat['end'] + pad

        assert qstop - qstart > 2 * pad or qstart == 1, (qstop, qstart)
        assert sstop - sstart > 2 * pad or sstart == 1, (sstop, sstart)

        cmd = bl2seq % dict(qfasta=qfasta, sfasta=sfasta,
                            qstart=qstart, sstart=sstart,
                            qstop=qstop, sstop=sstop)
        return cmd, qfeat, sfeat

    pairs = [True]  # sentinel so the loop body runs at least once
    while any(pairs):
        # Draw one batch; slots past the end of the generator are None.
        pairs = [get_pair_gen() for i in range(ncpu)]

        cmds = [get_cmd(pair) for pair in pairs if pair]
        results = pool.map(commands.getoutput, [c[0] for c in cmds])

        for res, (cmd, qfeat, sfeat) in zip(results, cmds):
            if not res.strip():
                continue
            # Trailing comma: the count printed below lands on the same line.
            print >>sys.stderr, "%s %s" % (qfeat["accn"], sfeat['accn']),
            orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
            cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, pad)
            print >>sys.stderr, "(%i)" % len(cnss)
            if len(cnss) == 0:
                continue

            qname, sname = qfeat['accn'], sfeat['accn']
            print >> fcnss, "%s,%s,%s,%s,%s" % (
                qfeat['seqid'], qname, sfeat['seqid'], sname,
                ",".join(map(lambda l: ",".join(map(str, l)), cnss)))

    return None
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad,
         spad, blast_path, unmasked_fasta, mask='F', ncpu=8):
    """Blast every member combination of each duplicate pair and record
    the combination that yields the most cnss.

    For each (qparent, sparent) returned by ``get_pairs`` that is not
    rejected by ``skip_pair``, every pair produced by ``get_dups`` is
    blasted (``ncpu`` commands at a time) and parsed with ``parse_blast``.
    Each result is stored as a tuple whose first field is the negated cns
    count, so an ascending sort puts the pair with the most cnss first.
    Parents found in ``rdups`` are collected and resolved afterwards via
    ``best_repeats``; all others are written immediately with
    ``write_new_dups``.  Finally the ``.nolocaldups.bed.local`` files and
    the ``.raw.filtered.local`` file are produced.

    Parameters:
        cns_file, pair_file: input files; working copies are created with
            ``make_copy_of_file`` and passed to ``write_new_dups``.
        qdups_path, sdups_path: inputs for ``parse_dups``.
        fmt: pair file format, forwarded to ``get_pairs``.
        qbed, sbed: query / subject bed objects (``path``, ``filename`` and
            ``accn()`` are used here).
        qpad, spad: padding values forwarded to ``get_cmd`` and
            ``parse_blast``.
        blast_path: command prefix of the bl2seq invocation.
        unmasked_fasta: forwarded to ``parse_blast``.
        mask: value substituted for bl2seq's ``-F`` option.
        ncpu: pool size and number of pairs processed per batch.
    """
    pool = Pool(ncpu)
    # ``blast_path`` and ``mask`` are substituted now; the %(name)s fields
    # are left for ``get_cmd`` to fill per pair.  The command is run through
    # a shell, so the exact whitespace between options is irrelevant.
    bl2seq = ("%s " % blast_path +
              "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask +
              " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s"
              " -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#'"
              " | grep -v 'WARNING' | grep -v 'ERROR' ")

    qfastas = get_masked_fastas(qbed)
    # When both sides come from the same bed file, reuse the query fastas.
    sfastas = get_masked_fastas(
        sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    qbase = qbed.path.split(".")[0]
    sbase = sbed.path.split(".")[0]
    qlocaldups_path = qbase + ".localdups"
    slocaldups_path = sbase + ".localdups"
    qlocal_bed = "{0}.nolocaldups.bed.local".format(qbase)
    slocal_bed = "{0}.nolocaldups.bed.local".format(sbase)
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file,
        [pair_file, qlocaldups_path, slocaldups_path, cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    print >> sys.stdout, "%d %d" % (len(dups), len(rdups))
    ldups = get_large_dups(dups, qdups, sdups)

    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent, sparent) for qparent, sparent in dups
                  if qparent in rdups and sparent in rdups]

    for (qparent, sparent) in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
            continue

        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)

        _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        def get_dups_gen():
            # Next pair, or None once the generator is exhausted, so a
            # fixed-size batch can be drawn without special-casing the end.
            try:
                return _get_dups_gen.next()
            except StopIteration:
                return None

        pairs = [True]  # sentinel so the loop body runs at least once
        while any(pairs):
            pairs = [get_dups_gen() for i in range(ncpu)]

            ###this is for parallelization#########
            # map() needs one argument sequence per get_cmd parameter.
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################

            cmds = [
                c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map)
                if c
            ]
            results = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                if not res.strip():
                    cnss = []
                else:
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                print >> sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str, l)), cnss))
                # Negated count first: an ascending sort ranks the pair
                # with the most cnss first.
                cnss_size.append(
                    (len(cnss) * -1, qfeat["start"], sfeat["start"],
                     qfeat["accn"], sfeat["accn"], cnss_fmt))
            # Keep only the last slot of the batch: it is None exactly when
            # get_dups_gen has run dry, which ends the while loop.
            pairs = [pairs[-1]]
        ######################################################################

        if qparent in rdups or sparent in rdups:
            # File the results under whichever parent is in rdups (query
            # side wins when both are); best_repeats() resolves them later.
            rep_parent = qparent if qparent in rdups else sparent
            if (qparent, sparent) in rdups_dic[rep_parent]:
                logging.info((qparent, sparent))
            rdups_dic[rep_parent][(qparent, sparent)] = cnss_size
        else:
            cnss_size.sort()
            cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = \
                cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >> sys.stderr, "FINAL: {0},{1},{2}".format(
                qaccn, saccn, cns_number)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        ### one or list? cnss[0]?
        cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = \
            best_reps[dparents]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best_reps[dparents]], qparent, sparent, qfeat, sfeat,
                       qdups, sdups)

    write_nolocaldups(qbed.path, nqlocaldups, qlocal_bed)
    write_nolocaldups(sbed.path, nslocaldups, slocal_bed)
    # Name the output after the ``pair_file`` argument rather than a
    # module-level ``options`` object, so main() works when called directly.
    pairs_to_qa(npair_file, 'pair', qlocal_bed, slocal_bed,
                "{0}.raw.filtered.local".format(pair_file.split(".")[0]))
print "Error: cant write the file %s.%s" % (base,ext) else: for line in open(extractedDir+"/"+base+"."+ext, "r"): if len(line) > 5: begin = 0 end = 0 length = 0 output = "" for i in configList: length=int(i) end = begin + length output += line[begin:end]+";" begin = end dest.write(output+"\n") dest.close() print "Done parsing %s.%s" % (base, ext) else: print "File does not exist %s.%s" % (base, ext) files = [] for f in os.listdir(extractedDir): if f != '.DS_Store': base, ext = f.split('.') if ext == 'dat' or int(ext): files.append((base,ext)) random.shuffle(files) p = Pool() p.map(parse, files)