def write_new_pairs(pair_file_path,new_pairs,qbed_file_new,sbed_path_new):
    """Append the new pairs to the pair file, then convert it to qa file fmt.

    Every entry of ``new_pairs`` must provide the keys ``'qaccn'`` and
    ``'saccn'``; each one is appended to ``pair_file_path`` as a single
    ``qaccn<TAB>saccn`` line.  The updated pair file is then passed to
    ``pairs_to_qa`` along with ``'pair'``, the two bed arguments and
    ``<header>.raw.with_new.filtered`` as the last argument, where
    ``<header>`` is the part of ``pair_file_path`` before its first ".".
    """
    # The with-block guarantees the handle is flushed and closed before
    # pairs_to_qa is called, and also when a pair is missing a key or a
    # write fails (the manual open()/close() leaked the handle in that case).
    with open(pair_file_path,'a') as write_file:
        for pair in new_pairs:
            write_file.write("{0}\t{1}\n".format(pair['qaccn'],pair['saccn']))
    # NOTE(review): split(".")[0] cuts at the FIRST dot of the whole path, so
    # a path such as "./run/x.pairs" yields an empty header -- confirm callers
    # never pass dotted directory names.
    header = pair_file_path.split(".")[0]
    raw_file = "{0}.raw.with_new.filtered".format(header)
    pairs_to_qa(pair_file_path,'pair',qbed_file_new,sbed_path_new,raw_file)
def write_new_pairs(pair_file_path, new_pairs, qbed_file_new, sbed_path_new):
    """Append the new pairs to the pair file, then convert it to qa file fmt.

    Every entry of ``new_pairs`` must provide the keys ``'qaccn'`` and
    ``'saccn'``; each one is appended to ``pair_file_path`` as a single
    ``qaccn<TAB>saccn`` line.  The updated pair file is then passed to
    ``pairs_to_qa`` along with ``'pair'``, the two bed arguments and
    ``<header>.raw.with_new.filtered`` as the last argument, where
    ``<header>`` is the part of ``pair_file_path`` before its first ".".
    """
    # Using a context manager closes (and flushes) the file before
    # pairs_to_qa is called, and also if a pair lacks a key or a write
    # raises; the previous explicit close() was skipped on any exception.
    with open(pair_file_path, 'a') as write_file:
        for pair in new_pairs:
            new_line = "{0}\t{1}\n".format(pair['qaccn'], pair['saccn'])
            write_file.write(new_line)
    # NOTE(review): split(".")[0] cuts at the FIRST dot of the whole path, so
    # a path such as "./run/x.pairs" yields an empty header -- confirm callers
    # never pass dotted directory names.
    header = pair_file_path.split(".")[0]
    raw_file = "{0}.raw.with_new.filtered".format(header)
    pairs_to_qa(pair_file_path, 'pair', qbed_file_new, sbed_path_new,
                raw_file)
def main(cns_file, qdups_path, sdups_path, pair_file, fmt, qbed, sbed, qpad,
         spad, blast_path, unmasked_fasta, mask='F', ncpu=8):
    """Blast the candidate pairs of every duplicated parent pair and write
    out the best one.

    For each (qparent, sparent) returned by get_pairs that skip_pair does not
    reject, the candidate feature pairs yielded by get_dups are blasted with
    bl2seq in batches of ``ncpu`` and the output is parsed by parse_blast.
    Parent pairs with neither parent in ``rdups`` are written immediately
    through write_new_dups; the remaining ones are collected in a dict and
    written after best_repeats has been applied to it.  Finally
    write_nolocaldups is called for both bed files and the copied pair file
    is passed to pairs_to_qa.

    Parameters:
        cns_file: path handed to make_copy_of_file; the returned value is
            what write_new_dups receives.
        qdups_path, sdups_path: paths parsed with parse_dups.
        pair_file, fmt: forwarded to get_pairs; pair_file is also copied
            with make_copy_of_file.
        qbed, sbed: objects providing ``.path``, ``.filename`` and
            ``.accn(name)`` (presumably Bed objects -- confirm).
        qpad, spad: forwarded to get_cmd and parse_blast.
        blast_path: first token of the bl2seq command line.
        unmasked_fasta: forwarded unchanged to parse_blast.
        mask: value substituted for the ``-F`` option of bl2seq.
        ncpu: size of the worker pool and of each batch of candidate pairs.

    NOTE(review): this file contains a second ``def main`` without the
    ``unmasked_fasta`` parameter; if both really live in the same module the
    later definition replaces this one -- confirm which is intended.
    """
    pool = Pool(ncpu)
    # Only blast_path and mask are substituted here; the last literal keeps
    # its %(name)s placeholders (presumably filled in by get_cmd -- confirm).
    # The backslashes inside that literal continue the string across source
    # lines, so the indentation of those lines becomes part of the command
    # text as extra spaces.
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
           -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects report the same filename.
    sfastas = get_masked_fastas(
        sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    # Every path is derived from the text before the FIRST "." of the bed
    # path.  The two *nolocaldups_path names are not read again in this
    # function.
    qnolocaldups_path = qbed.path.split(".")[0] + ".nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".localdups"
    # All later writes use the values returned by make_copy_of_file instead
    # of the original paths.
    npair_file, nqlocaldups, nslocaldups, ncns_file = map(
        make_copy_of_file,
        [pair_file, qlocaldups_path, slocaldups_path, cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups, rdups = get_pairs(pair_file, fmt, qdups, sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups, qdups, sdups)

    # rdups_dic[parent][(qparent, sparent)] -> list of result tuples, filled
    # below for pairs whose query or subject parent is in rdups.
    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent, sparent) for qparent, sparent in dups
                  if qparent in rdups and sparent in rdups]
    for (qparent, sparent) in dups:
        if skip_pair(qparent, sparent, rdups, rdups_both, ldups):
            continue
        # One tuple per blasted candidate pair, accumulated over all batches.
        cnss_size = []
        qfeat_dups = get_all_dups(qdups, qparent)
        sfeat_dups = get_all_dups(sdups, sparent)
        # Seed with a truthy value so the while loop runs at least once.
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups, sfeat_dups, qbed, sbed)

        def get_dups_gen():
            # Return the next candidate pair, or None once the generator is
            # exhausted.
            try:
                return _get_dups_gen.next()
            except StopIteration:
                return None

        while any(pairs):
            # cnss_dups is assigned but not read anywhere in this function.
            cnss_dups = []
            # Pull up to ncpu candidates; exhausted slots hold None.
            pairs = [get_dups_gen() for i in range(ncpu)]
            ### this is for parallelization #########
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            # NOTE(review): the first list drops the None entries and can be
            # shorter than the *_map lists; Python 2 map() pads the shorter
            # list with None, so get_cmd may be called with None as its first
            # argument -- presumably it returns a falsy value that the
            # trailing "if c" removes; confirm.
            cmds = [
                c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,
                               qfastas_map, sfastas_map, qpad_map, spad_map)
                if c
            ]
            # c[0] is the command string; each one is run through
            # commands.getoutput by the pool.
            results = (
                r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                # 1 when both features are on the same strand, otherwise -1.
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                if not res.strip():
                    cnss = []
                else:
                    cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed,
                                       qpad, spad, unmasked_fasta)
                print >> sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str, l)), cnss))
                # The count is negated so that an ascending sort puts the
                # pair with the most entries first.
                cnss_size.append(
                    (len(cnss) * -1, qfeat["start"], sfeat["start"],
                     qfeat["accn"], sfeat["accn"], cnss_fmt))
            # Keep only the last slot: it is None exactly when the generator
            # ran dry within this batch, which ends the while loop.
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            # Log when this parent pair was already stored, then (re)store it.
            if (qparent, sparent) in rdups_dic[qparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[qparent].update({(qparent, sparent): cnss_size})
        elif sparent in rdups:
            if (qparent, sparent) in rdups_dic[sparent].keys():
                logging.info((qparent, sparent))
            rdups_dic[sparent].update({(qparent, sparent): cnss_size})
        else:
            cnss_size.sort()
            # NOTE(review): cnss_size is empty when no command was produced
            # for this parent pair, in which case the indexing below raises
            # IndexError -- confirm that cannot happen.
            cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = cnss_size[
                0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >> sys.stderr, "FINAL: {0},{1},{2}".format(
                qaccn, saccn, cns_number)
            write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                           cnss_size, qparent, sparent, qfeat, sfeat, qdups,
                           sdups)

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        qparent, sparent = dparents
        ### one or list? cnss[0]?
        # The unpacking below requires best_reps[dparents] to be a single
        # 6-item tuple; it is wrapped in a list for write_new_dups.
        cns_number, qfeat_start, sfeat_start, qaccn, saccn, largest_cnss = best_reps[
            dparents]
        qfeat = qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file, ncns_file, nqlocaldups, nslocaldups,
                       [best_reps[dparents]], qparent, sparent, qfeat, sfeat,
                       qdups, sdups)

    write_nolocaldups(
        qbed.path, nqlocaldups,
        "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(
        sbed.path, nslocaldups,
        "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    # NOTE(review): `options` is not a parameter of this function, so it must
    # exist as a module-level name at call time; `pair_file` may have been
    # intended here -- confirm.
    pairs_to_qa(npair_file, 'pair',
                "{0}.nolocaldups.bed.local".format(qbed.path.split(".")[0]),
                "{0}.nolocaldups.bed.local".format(sbed.path.split(".")[0]),
                "{0}.raw.filtered.local".format(options.pairs.split(".")[0]))
def main(cns_file,qdups_path,sdups_path,pair_file,fmt,qbed,sbed,qpad,spad,blast_path,mask='F',ncpu=8):
    """Blast the candidate pairs of every duplicated parent pair and write
    out the best one, using the ``.all.`` localdups/nolocaldups files.

    For each (qparent, sparent) returned by get_pairs that skip_pair does not
    reject, the candidate feature pairs yielded by get_dups are blasted with
    bl2seq in batches of ``ncpu`` and the output is parsed by parse_blast.
    Parent pairs with neither parent in ``rdups`` are written immediately
    through write_new_dups; the remaining ones are collected in a dict and
    written after best_repeats has been applied to it.  Finally
    write_nolocaldups is called for both bed files and the copied pair file
    is passed to pairs_to_qa.

    Parameters:
        cns_file: path handed to make_copy_of_file; the returned value is
            what write_new_dups receives.
        qdups_path, sdups_path: paths parsed with parse_dups.
        pair_file, fmt: forwarded to get_pairs; pair_file is also copied
            with make_copy_of_file.
        qbed, sbed: objects providing ``.path``, ``.filename`` and
            ``.accn(name)`` (presumably Bed objects -- confirm).
        qpad, spad: forwarded to get_cmd and parse_blast.
        blast_path: first token of the bl2seq command line.
        mask: value substituted for the ``-F`` option of bl2seq.
        ncpu: size of the worker pool and of each batch of candidate pairs.

    NOTE(review): this file contains another ``def main`` that additionally
    takes ``unmasked_fasta``; if both really live in the same module only the
    later definition is reachable -- confirm which is intended.
    """
    pool = Pool(ncpu)
    # Only blast_path and mask are substituted here; the last literal keeps
    # its %(name)s placeholders (presumably filled in by get_cmd -- confirm).
    # The backslashes inside that literal continue the string across source
    # lines, so the indentation of those lines becomes part of the command
    # text as extra spaces.
    bl2seq = "%s " % blast_path + \
        "-p blastn -D 1 -E 2 -q -2 -r 1 -G 5 -W 7 -F %s " % mask + \
        " -e %(e_value).2f -i %(qfasta)s -j %(sfasta)s \
           -I %(qstart)d,%(qstop)d -J %(sstart)d,%(sstop)d | grep -v '#' \
        | grep -v 'WARNING' | grep -v 'ERROR' "

    qfastas = get_masked_fastas(qbed)
    # Reuse the query fastas when both bed objects report the same filename.
    sfastas = get_masked_fastas(sbed) if qbed.filename != sbed.filename else qfastas

    ################# file paths #####################
    # Every path is derived from the text before the FIRST "." of the bed
    # path.  The two *nolocaldups_path names are not read again in this
    # function.
    qnolocaldups_path = qbed.path.split(".")[0] + ".all.nolocaldups.bed"
    snolocaldups_path = sbed.path.split(".")[0] + ".all.nolocaldups.bed"
    qlocaldups_path = qbed.path.split(".")[0] + ".all.localdups"
    slocaldups_path = sbed.path.split(".")[0] + ".all.localdups"
    # All later writes use the values returned by make_copy_of_file instead
    # of the original paths.
    npair_file,nqlocaldups,nslocaldups, ncns_file = map(make_copy_of_file,[pair_file,qlocaldups_path,slocaldups_path,cns_file])
    ##########################################

    qdups = parse_dups(qdups_path)
    sdups = parse_dups(sdups_path)
    dups,rdups = get_pairs(pair_file,fmt,qdups,sdups)
    print len(dups), len(rdups)
    ldups = get_large_dups(dups,qdups,sdups)

    # rdups_dic[parent][(qparent, sparent)] -> list of result tuples, filled
    # below for pairs whose query or subject parent is in rdups.
    rdups_dic = defaultdict(dict)
    rdups_both = [(qparent,sparent) for qparent,sparent in dups if qparent in rdups and sparent in rdups]
    for (qparent,sparent) in dups:
        if skip_pair(qparent,sparent,rdups,rdups_both,ldups):continue
        # One tuple per blasted candidate pair, accumulated over all batches.
        cnss_size = []
        qfeat_dups = get_all_dups(qdups,qparent)
        sfeat_dups = get_all_dups(sdups,sparent)
        # Seed with a truthy value so the while loop runs at least once.
        pairs = [True]
        _get_dups_gen = get_dups(qfeat_dups,sfeat_dups,qbed,sbed)

        def get_dups_gen():
            # Return the next candidate pair, or None once the generator is
            # exhausted.
            try: return _get_dups_gen.next()
            except StopIteration: return None

        while any(pairs):
            # cnss_dups is assigned but not read anywhere in this function.
            cnss_dups = []
            # Pull up to ncpu candidates; exhausted slots hold None.
            pairs = [get_dups_gen() for i in range(ncpu)]
            ### this is for parallelization #########
            spad_map = [spad] * len(pairs)
            qpad_map = [qpad] * len(pairs)
            sfastas_map = [sfastas] * len(pairs)
            qfastas_map = [qfastas] * len(pairs)
            bl2seq_map = [bl2seq] * len(pairs)
            ###################################
            # NOTE(review): the first list drops the None entries and can be
            # shorter than the *_map lists; Python 2 map() pads the shorter
            # list with None, so get_cmd may be called with None as its first
            # argument -- presumably it returns a falsy value that the
            # trailing "if c" removes; confirm.
            cmds = [c for c in map(get_cmd, [l for l in pairs if l], bl2seq_map,qfastas_map,sfastas_map,qpad_map,spad_map) if c]
            # c[0] is the command string; each one is run through
            # commands.getoutput by the pool.
            results = (r for r in pool.map(commands.getoutput, [c[0] for c in cmds]))
            for res, (cmd, qfeat, sfeat) in zip(results, cmds):
                # 1 when both features are on the same strand, otherwise -1.
                orient = qfeat['strand'] == sfeat['strand'] and 1 or -1
                if not res.strip(): cnss = []
                else: cnss = parse_blast(res, orient, qfeat, sfeat, qbed, sbed, qpad,spad)
                print >>sys.stderr, "(%i)" % len(cnss)
                cnss_fmt = ",".join(map(lambda l: ",".join(map(str,l)),cnss))
                # The count is negated so that an ascending sort puts the
                # pair with the most entries first.
                cnss_size.append((len(cnss)*-1,qfeat["start"],sfeat["start"],qfeat["accn"],sfeat["accn"],cnss_fmt))
            # Keep only the last slot: it is None exactly when the generator
            # ran dry within this batch, which ends the while loop.
            pairs = [pairs[-1]]
        ######################################################################
        if qparent in rdups:
            # Log when this parent pair was already stored, then (re)store it.
            if (qparent,sparent) in rdups_dic[qparent].keys(): logging.info((qparent,sparent))
            rdups_dic[qparent].update({(qparent,sparent):cnss_size})
        elif sparent in rdups:
            if (qparent,sparent) in rdups_dic[sparent].keys(): logging.info((qparent,sparent))
            rdups_dic[sparent].update({(qparent,sparent):cnss_size})
        else:
            cnss_size.sort()
            # NOTE(review): cnss_size is empty when no command was produced
            # for this parent pair, in which case the indexing below raises
            # IndexError -- confirm that cannot happen.
            cns_number,qfeat_start,sfeat_start,qaccn,saccn,largest_cnss = cnss_size[0]
            qfeat = qbed.accn(qaccn)
            sfeat = sbed.accn(saccn)
            print >>sys.stderr, "FINAL: {0},{1},{2}".format(qaccn,saccn,cns_number)
            write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,cnss_size,qparent,sparent,qfeat,sfeat,qdups,sdups)

    best_reps = best_repeats(rdups_dic)
    for dparents in best_reps.keys():
        #print dparents
        qparent,sparent = dparents
        #print parents,best_reps[parents]
        ### one or list? cnss[0]?
        # The unpacking below requires best_reps[dparents] to be a single
        # 6-item tuple; it is wrapped in a list for write_new_dups.
        cns_number,qfeat_start, sfeat_start,qaccn,saccn,largest_cnss = best_reps[dparents]
        qfeat= qbed.accn(qaccn)
        sfeat = sbed.accn(saccn)
        write_new_dups(npair_file,ncns_file,nqlocaldups,nslocaldups,[best_reps[dparents]],qparent,sparent,qfeat,sfeat,qdups,sdups)

    write_nolocaldups(qbed.path,nqlocaldups,"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]))
    write_nolocaldups(sbed.path,nslocaldups,"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]))
    # NOTE(review): `options` is not a parameter of this function, so it must
    # exist as a module-level name at call time; `pair_file` may have been
    # intended here -- confirm.
    pairs_to_qa(npair_file,'pair',"{0}.all.nolocaldups.bed.local".format(qbed.path.split(".")[0]),"{0}.all.nolocaldups.bed.local".format(sbed.path.split(".")[0]),"{0}.raw.filtered.local".format(options.pairs.split(".")[0]))