def main(qbed, sbed, pairs_file, qpad, spad, unmasked_fasta, pair_fmt,
         blast_path, mask='F', ncpu=8):
    """main runner for finding cnss

    Pulls gene pairs from ``get_pair`` in batches of ``ncpu``, builds one
    bl2seq shell command per pair with ``get_cmd``, runs the batch on a
    worker pool and hands every non-empty output to ``parse_blast``.  For
    each pair with at least one cns a comma-separated line is written to
    stdout; a progress line ``"<qaccn> <saccn> (<count>)"`` goes to stderr.

    Args:
        qbed, sbed: query / subject annotation objects.  Only ``.filename``
            is read here; both are forwarded to the helpers.
        pairs_file, pair_fmt: forwarded to ``get_pair``.
        qpad, spad: forwarded to ``get_cmd`` and ``parse_blast``.
        unmasked_fasta: forwarded to ``parse_blast``.
        blast_path: path of the bl2seq executable (first word of the
            command line).
        mask: value given to the ``-F`` option of the command.
        ncpu: pool size and number of pairs handled per batch.

    Returns:
        None
    """
    pool = Pool(ncpu)
    try:
        # Command template.  The %(name)s placeholders are left untouched
        # here -- presumably get_cmd fills them in per pair.  Runs of spaces
        # between arguments are irrelevant to the shell.
        bl2seq = (
            "%s " % blast_path
            + "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask
            + " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s "
            "-I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' "
            "| grep -v 'WARNING' | grep -v 'ERROR' "
        )

        fcnss = sys.stdout
        fcnss.write("#qseqid,qaccn,sseqid,saccn,"
                    "[qstart,qend,sstart,send,bitscore...]\n")

        qfastas = get_masked_fastas(qbed)
        # Same file on both sides: reuse the fastas instead of redoing them.
        sfastas = (get_masked_fastas(sbed)
                   if qbed.filename != sbed.filename else qfastas)

        pair_gen = get_pair(pairs_file, pair_fmt, qbed, sbed)

        while True:
            # One pair per worker; None marks an exhausted generator.
            batch = [next(pair_gen, None) for _ in range(ncpu)]
            if not any(batch):
                break

            # Call get_cmd directly for the real pairs only, rather than
            # relying on Python 2 map() padding the shorter list with None.
            cmds = [c for c in (get_cmd(pair, bl2seq, qfastas, sfastas,
                                        qpad, spad)
                                for pair in batch if pair) if c]
            outputs = pool.map(commands.getoutput, [c[0] for c in cmds])

            for res, (cmd, qfeat, sfeat) in zip(outputs, cmds):
                if not res.strip():
                    continue
                sys.stderr.write("%s %s" % (qfeat["accn"], sfeat['accn']))
                orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                   qpad, spad, unmasked_fasta)
                # Completes the progress line started above.
                sys.stderr.write(" (%i)\n" % len(cnss))
                if len(cnss) == 0:
                    continue

                qname, sname = qfeat['accn'], sfeat['accn']
                cnss_fmt = ",".join(",".join(map(str, cns)) for cns in cnss)
                fcnss.write("%s,%s,%s,%s,%s\n" % (
                    qfeat['seqid'], qname, sfeat['seqid'], sname, cnss_fmt))
    finally:
        # Assumes Pool is multiprocessing.Pool (or API compatible): release
        # the worker processes instead of leaking them until exit.
        pool.close()
        pool.join()

    return None
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad,
         spad, blast_path, unmasked_fasta, mask='F', ncpu=8):
    """Blast all local-duplicate combinations of each pair and keep the best.

    For every ``(qparent, sparent)`` returned by ``get_pairs`` that
    ``skip_pair`` does not reject, each combination produced by ``get_dups``
    is blasted (bl2seq, ``ncpu`` commands at a time) and summarised as the
    tuple ``(-number_of_cnss, qstart, sstart, qaccn, saccn, formatted_cnss)``.

    * If one of the parents is in ``rdups`` the summaries are stored in
      ``rdups_dic`` and resolved after the loop through ``best_repeats``.
    * Otherwise the summaries are sorted (most cnss first, because the count
      is negated) and passed to ``write_new_dups`` immediately.

    Finally ``write_nolocaldups`` and ``pairs_to_qa`` are called on the
    working copies created by ``make_copy_of_file``.

    Args:
        cns_file, pair_file: inputs copied with ``make_copy_of_file``;
            ``pair_file`` is also read by ``get_pairs``.
        qdups_path, sdups_path: inputs of ``parse_dups``.
        fmt: pair-file format, forwarded to ``get_pairs``.
        qbed, sbed: annotation objects; ``.filename``, ``.path`` and
            ``.accn(name)`` are used here.
        qpad, spad: forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: path of the bl2seq executable.
        unmasked_fasta: forwarded to ``parse_blast``.
        mask: value given to the ``-F`` option of the command.
        ncpu: pool size and number of commands per batch.
    """
    pool = Pool(ncpu)
    try:
        # Command template.  The %(name)s placeholders are left untouched
        # here -- presumably get_cmd fills them in per pair.  Runs of spaces
        # between arguments are irrelevant to the shell.
        bl2seq = (
            "%s " % blast_path
            + "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask
            + " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s "
            "-I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' "
            "| grep -v 'WARNING' | grep -v 'ERROR' "
        )

        qfastas = get_masked_fastas(qbed)
        # Same file on both sides: reuse the fastas instead of redoing them.
        sfastas = (get_masked_fastas(sbed)
                   if qbed.filename != sbed.filename else qfastas)

        ################# file paths #####################
        qprefix = qbed.path.split(".")[0]
        sprefix = sbed.path.split(".")[0]
        qlocal_bed = "{0}.nolocaldups.bed.local".format(qprefix)
        slocal_bed = "{0}.nolocaldups.bed.local".format(sprefix)
        npair_file, nqlocaldups, nslocaldups, ncns_file = [
            make_copy_of_file(path) for path in (
                pair_file, qprefix + ".localdups", sprefix + ".localdups",
                cns_file)]
        ##########################################

        qdups = parse_dups(qdups_path)
        sdups = parse_dups(sdups_path)
        dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
        sys.stdout.write("%s %s\n" % (len(dups), len(rdups)))
        ldups = get_large_dups(dups, qdups, sdups)

        rdups_dic = defaultdict(dict)
        rdups_both = [(qparent, sparent) for qparent, sparent in dups
                      if qparent in rdups and sparent in rdups]

        def blast_batch(batch):
            """Run bl2seq for the real pairs of *batch* on the pool.

            Returns a list of ``(output, qfeat, sfeat)``.  get_cmd is called
            directly for the non-empty entries only, rather than relying on
            Python 2 map() padding the shorter list with None.
            """
            cmds = [c for c in (get_cmd(pair, bl2seq, qfastas, sfastas,
                                        qpad, spad)
                                for pair in batch if pair) if c]
            outputs = pool.map(commands.getoutput, [c[0] for c in cmds])
            return [(res, qfeat, sfeat)
                    for res, (_cmd, qfeat, sfeat) in zip(outputs, cmds)]

        for qparent, sparent in dups:
            if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
                continue

            cnss_size = []
            dups_gen = get_dups(get_all_dups(qdups, qparent),
                                get_all_dups(sdups, sparent), qbed, sbed)

            while True:
                # One combination per worker; None marks exhaustion.
                batch = [next(dups_gen, None) for _ in range(ncpu)]
                for res, qfeat, sfeat in blast_batch(batch):
                    orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                    if not res.strip():
                        cnss = []
                    else:
                        cnss = parse_blast(res, orient, qfeat, sfeat, qbed,
                                           sbed, qpad, spad, unmasked_fasta)
                    sys.stderr.write("(%i)\n" % len(cnss))
                    cnss_fmt = ",".join(
                        ",".join(map(str, cns)) for cns in cnss)
                    # Negated count so that an ascending sort puts the
                    # combination with the most cnss first.
                    cnss_size.append((-len(cnss), qfeat["start"],
                                      sfeat["start"], qfeat["accn"],
                                      sfeat["accn"], cnss_fmt))
                # A batch that was not filled means the generator is done.
                if not batch[-1]:
                    break

            ##################################################################
            parents = (qparent, sparent)
            if qparent in rdups:
                # Seeing the same pair twice is only logged; the newer
                # result replaces the older one.
                if parents in rdups_dic[qparent]:
                    logging.info(parents)
                rdups_dic[qparent][parents] = cnss_size
            elif sparent in rdups:
                if parents in rdups_dic[sparent]:
                    logging.info(parents)
                rdups_dic[sparent][parents] = cnss_size
            else:
                cnss_size.sort()
                cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
                qfeat = qbed.accn(qaccn)
                sfeat = sbed.accn(saccn)
                sys.stderr.write("FINAL: {0},{1},{2}\n".format(
                    qaccn, saccn, cns_number))
                write_new_dups(npair_file, ncns_file, nqlocaldups,
                               nslocaldups, cnss_size, qparent, sparent,
                               qfeat, sfeat, qdups, sdups)

        best_reps = best_repeats(rdups_dic)
        for (qparent, sparent), best in best_reps.items():
            # Unpacking also checks that the entry is one 6-field summary.
            _, _, _, qaccn, saccn, _ = best
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           [best], qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

        write_nolocaldups(qbed.path, nqlocaldups, qlocal_bed)
        write_nolocaldups(sbed.path, nslocaldups, slocal_bed)
        # NOTE(review): this reads the module-level ``options`` instead of
        # the ``pair_file`` argument, so main() only works when ``options``
        # exists globally.  Presumably options.pairs == pair_file -- confirm
        # against the caller before switching to the parameter.
        pairs_to_qa(npair_file, 'pair', qlocal_bed, slocal_bed,
                    "{0}.raw.filtered.local".format(
                        options.pairs.split(".")[0]))
    finally:
        # Assumes Pool is multiprocessing.Pool (or API compatible): release
        # the worker processes instead of leaking them until exit.
        pool.close()
        pool.join()
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad,
         spad, blast_path, mask='F', ncpu=8):
    """Blast all local-duplicate combinations of each pair and keep the best.

    For every ``(qparent, sparent)`` returned by ``get_pairs`` that
    ``skip_pair`` does not reject, each combination produced by ``get_dups``
    is blasted (bl2seq, ``ncpu`` commands at a time) and summarised as the
    tuple ``(-number_of_cnss, qstart, sstart, qaccn, saccn, formatted_cnss)``.

    * If one of the parents is in ``rdups`` the summaries are stored in
      ``rdups_dic`` and resolved after the loop through ``best_repeats``.
    * Otherwise the summaries are sorted (most cnss first, because the count
      is negated) and passed to ``write_new_dups`` immediately.

    Finally ``write_nolocaldups`` and ``pairs_to_qa`` are called on the
    working copies created by ``make_copy_of_file``.  This variant works on
    the ``.all.localdups`` / ``.all.nolocaldups.bed.local`` file names.

    Args:
        cns_file, pair_file: inputs copied with ``make_copy_of_file``;
            ``pair_file`` is also read by ``get_pairs``.
        qdups_path, sdups_path: inputs of ``parse_dups``.
        fmt: pair-file format, forwarded to ``get_pairs``.
        qbed, sbed: annotation objects; ``.filename``, ``.path`` and
            ``.accn(name)`` are used here.
        qpad, spad: forwarded to ``get_cmd`` and ``parse_blast``.
        blast_path: path of the bl2seq executable.
        mask: value given to the ``-F`` option of the command.
        ncpu: pool size and number of commands per batch.
    """
    pool = Pool(ncpu)
    try:
        # Command template.  The %(name)s placeholders are left untouched
        # here -- presumably get_cmd fills them in per pair.  Runs of spaces
        # between arguments are irrelevant to the shell.
        bl2seq = (
            "%s " % blast_path
            + "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask
            + " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s "
            "-I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' "
            "| grep -v 'WARNING' | grep -v 'ERROR' "
        )

        qfastas = get_masked_fastas(qbed)
        # Same file on both sides: reuse the fastas instead of redoing them.
        sfastas = (get_masked_fastas(sbed)
                   if qbed.filename != sbed.filename else qfastas)

        ################# file paths #####################
        qprefix = qbed.path.split(".")[0]
        sprefix = sbed.path.split(".")[0]
        qlocal_bed = "{0}.all.nolocaldups.bed.local".format(qprefix)
        slocal_bed = "{0}.all.nolocaldups.bed.local".format(sprefix)
        npair_file, nqlocaldups, nslocaldups, ncns_file = [
            make_copy_of_file(path) for path in (
                pair_file, qprefix + ".all.localdups",
                sprefix + ".all.localdups", cns_file)]
        ##########################################

        qdups = parse_dups(qdups_path)
        sdups = parse_dups(sdups_path)
        dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
        sys.stdout.write("%s %s\n" % (len(dups), len(rdups)))
        ldups = get_large_dups(dups, qdups, sdups)

        rdups_dic = defaultdict(dict)
        rdups_both = [(qparent, sparent) for qparent, sparent in dups
                      if qparent in rdups and sparent in rdups]

        def blast_batch(batch):
            """Run bl2seq for the real pairs of *batch* on the pool.

            Returns a list of ``(output, qfeat, sfeat)``.  get_cmd is called
            directly for the non-empty entries only, rather than relying on
            Python 2 map() padding the shorter list with None.
            """
            cmds = [c for c in (get_cmd(pair, bl2seq, qfastas, sfastas,
                                        qpad, spad)
                                for pair in batch if pair) if c]
            outputs = pool.map(commands.getoutput, [c[0] for c in cmds])
            return [(res, qfeat, sfeat)
                    for res, (_cmd, qfeat, sfeat) in zip(outputs, cmds)]

        for qparent, sparent in dups:
            if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
                continue

            cnss_size = []
            dups_gen = get_dups(get_all_dups(qdups, qparent),
                                get_all_dups(sdups, sparent), qbed, sbed)

            while True:
                # One combination per worker; None marks exhaustion.
                batch = [next(dups_gen, None) for _ in range(ncpu)]
                for res, qfeat, sfeat in blast_batch(batch):
                    orient = 1 if qfeat['strand'] == sfeat['strand'] else -1
                    if not res.strip():
                        cnss = []
                    else:
                        cnss = parse_blast(res, orient, qfeat, sfeat, qbed,
                                           sbed, qpad, spad)
                    sys.stderr.write("(%i)\n" % len(cnss))
                    cnss_fmt = ",".join(
                        ",".join(map(str, cns)) for cns in cnss)
                    # Negated count so that an ascending sort puts the
                    # combination with the most cnss first.
                    cnss_size.append((-len(cnss), qfeat["start"],
                                      sfeat["start"], qfeat["accn"],
                                      sfeat["accn"], cnss_fmt))
                # A batch that was not filled means the generator is done.
                if not batch[-1]:
                    break

            ##################################################################
            parents = (qparent, sparent)
            if qparent in rdups:
                # Seeing the same pair twice is only logged; the newer
                # result replaces the older one.
                if parents in rdups_dic[qparent]:
                    logging.info(parents)
                rdups_dic[qparent][parents] = cnss_size
            elif sparent in rdups:
                if parents in rdups_dic[sparent]:
                    logging.info(parents)
                rdups_dic[sparent][parents] = cnss_size
            else:
                cnss_size.sort()
                cns_number, _, _, qaccn, saccn, _ = cnss_size[0]
                qfeat = qbed.accn(qaccn)
                sfeat = sbed.accn(saccn)
                sys.stderr.write("FINAL: {0},{1},{2}\n".format(
                    qaccn, saccn, cns_number))
                write_new_dups(npair_file, ncns_file, nqlocaldups,
                               nslocaldups, cnss_size, qparent, sparent,
                               qfeat, sfeat, qdups, sdups)

        best_reps = best_repeats(rdups_dic)
        for (qparent, sparent), best in best_reps.items():
            # Unpacking also checks that the entry is one 6-field summary.
            _, _, _, qaccn, saccn, _ = best
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           [best], qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

        write_nolocaldups(qbed.path, nqlocaldups, qlocal_bed)
        write_nolocaldups(sbed.path, nslocaldups, slocal_bed)
        # NOTE(review): this reads the module-level ``options`` instead of
        # the ``pair_file`` argument, so main() only works when ``options``
        # exists globally.  Presumably options.pairs == pair_file -- confirm
        # against the caller before switching to the parameter.
        pairs_to_qa(npair_file, 'pair', qlocal_bed, slocal_bed,
                    "{0}.raw.filtered.local".format(
                        options.pairs.split(".")[0]))
    finally:
        # Assumes Pool is multiprocessing.Pool (or API compatible): release
        # the worker processes instead of leaking them until exit.
        pool.close()
        pool.join()