def main(gff_file, bed_corrected):
    gff = Gff(gff_file)
    bed = Bed(bed_corrected)
    gff_index = 0
    before = ''
    fp = open(gff_file + '.modified', 'w')
    for index in range(len(gff)):
        item = gff[index]
        if item.accn != 'UNDEF':
            if item.type == 'gene':
                gene_bed = str(bed[gff_index]).split('\t')[3]
                gene_list = gene_bed.split('|')
                paralist = item.accn.split(';')
                print(gene_bed, paralist)
                if len(gene_list) == 1 and len(paralist) == 2:
                    gene1, gene2 = paralist[0].split(',')[0], paralist[1].split(',')[0]
                    if gene_bed == gene1:
                        item.accn = paralist[0]
                        before = paralist[0]
                    elif gene_bed == gene2:
                        item.accn = paralist[1]
                        before = paralist[1]
                    else:
                        before = item.accn
                else:
                    before = item.accn
                gff_index += 1
            else:
                item.accn = before
        fp.write(str(item) + '\n')
    fp.close()
def main(qbed_file, sbed_file, blast_file, pairs_file, out_fh, padding, query=True):
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()
    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set([qaccn for qaccn, saccn in pairs]))
    else:
        qorthos = list(set([saccn for qaccn, saccn in pairs]))
    qaccns = get_pos(qorder, qorthos)
    blasts = blast_grouped(blast_file, query)
    qaccns.sort()
    best_hits = []
    for qi, q in enumerate(qaccns):
        if qi == 0:
            continue
        left_ortho = qaccns[qi - 1]
        right_ortho = qaccns[qi]
        if (right_ortho - left_ortho) == 1:
            # consecutive ortholog positions: no orthologs in between
            continue
        new_hits = get_best_hits(left_ortho, right_ortho, blasts, qbed, sbed,
                                 pairs_dict, padding)
        best_hits += new_hits
    write_best_hits(out_fh, best_hits)
    return best_hits
def main(qbed_file, sbed_file, blast_file, pairs_file, out_fh, padding, query=True):
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()
    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set([qaccn for qaccn, saccn in pairs]))
    else:
        qorthos = list(set([saccn for qaccn, saccn in pairs]))
    qaccns = get_pos(qorder, qorthos)
    qaccns.sort()
    flankers = []
    for qi, q in enumerate(qaccns):
        if qi == 0:
            continue
        if qi == len(qaccns) - 1:
            continue
        left_pos = q - 1
        right_pos = q + 1
        if left_pos in qaccns and right_pos in qaccns:
            #print qbed[q].accn
            flankers.append(qbed[q].accn)
    write_best_hits(out_fh, flankers)
    return flankers
def build_order(qbed_file, sbed_file):
    print >> sys.stderr, "Read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
class RawBed(object):
    """takes a line from habo's raw file and converts it to Brent's bed line"""
    def __init__(self, bed):
        self.bed = Bed(bed)
        self.order = self.bed.get_order()

    def raw_to_bed(self, raw_pos):
        """returns the bed fields for raw_pos as a dict"""
        bed_info = self.bed[raw_pos]
        d = {}
        d['start'] = bed_info.start
        d['end'] = bed_info.end
        d['seqid'] = bed_info.seqid
        d['accn'] = bed_info.accn
        args = bed_info.stuff
        d['strand'] = args[1]
        #d['locs'] = loc_conv(args[-2], args[-1])
        return d

    def accn_to_raw(self, accn):
        """returns the raw line inputs for accn"""
        pos = self.order[accn][0]
        seqid = self.order[accn][1].seqid
        return pos, seqid
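# A minimal usage sketch for RawBed, assuming a BED file whose fourth column
# holds accession names; the file name "genes.bed" and the accession
# "Os01g01010" below are hypothetical placeholders.
rb = RawBed("genes.bed")
pos, seqid = rb.accn_to_raw("Os01g01010")   # accession -> (raw position, seqid)
fields = rb.raw_to_bed(pos)                 # raw position -> dict of bed fields
print("%s %s %d-%d" % (fields['accn'], fields['seqid'], fields['start'], fields['end']))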
def main(blast_file, options):
    qbed_file, sbed_file = options.qbed, options.sbed
    sqlite = options.sqlite

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)

    blasts = sorted([BlastLine(line) for line in fp],
                    key=lambda b: b.score, reverse=True)

    all_data = []
    for b in blasts:
        query, subject = b.query, b.subject
        if query not in qorder or subject not in sorder:
            continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        all_data.append((qi, si))

    c = None
    if options.sqlite:
        conn = sqlite3.connect(options.sqlite)
        c = conn.cursor()
        c.execute("drop table if exists synteny")
        c.execute("create table synteny (query text, anchor text, gray varchar(1), "
                  "score integer, dr integer, orientation varchar(1), "
                  "qnote text, snote text)")

    batch_query(qbed, sbed, all_data, options, c=c, transpose=False)
    batch_query(qbed, sbed, all_data, options, c=c, transpose=True)

    if sqlite:
        c.execute("create index q on synteny (query)")
        conn.commit()
        c.close()
def write_nolocaldups(bed_path, localdups_file, out_name):
    bed = Bed(bed_path)
    children = []
    for line in open(localdups_file):
        dups = DupLine(line)
        children += dups.children

    print >> sys.stderr, "write tandem-filtered bed file {0}".format(out_name)
    fh = open(out_name, "w")
    for i, row in enumerate(bed):
        if row['accn'] in children:
            continue
        print >> fh, row
    fh.close()
def build_order(qbed_file, sbed_file):
    print("Read annotation files %s and %s" % (qbed_file, sbed_file), file=sys.stderr)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()
    return qbed, sbed, qorder, sorder
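# A minimal usage sketch for build_order, assuming "query.bed" and "subject.bed"
# exist on disk (the file names and accession below are hypothetical).
# get_order() maps each accession to its (positional index, feature) pair,
# as used throughout these scripts.
qbed, sbed, qorder, sorder = build_order("query.bed", "subject.bed")
qi, q = qorder["Os01g01010"]   # positional index and feature record for one accession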
def main(qfasta, sfasta, options):
    qfasta = Fasta(qfasta)
    sfasta = Fasta(sfasta)
    if not (options.qbed and options.sbed):
        anchors = PositionAnchor(options.anchors)
    else:
        qbed = Bed(options.qbed)
        sbed = Bed(options.sbed)
        anchors = Anchor(options.anchors, qbed, sbed)
    cpus = cpu_count()
    pool = Pool(cpus)
    for i, command_group in enumerate(
            anchors.gen_cmds(qfasta, sfasta, options.dist, options.cmd)):
        if not (i - 1) % 500:
            print >> sys.stderr, "complete: %.5f" % (((i - 1.0) * cpus) / len(anchors))
        for lines in pool.map(run_blast, command_group):
            for line in lines:
                line[6:10] = map(str, line[6:10])
                print "\t".join(line)
def main(qbed_file, sbed_file, pairs_file, out, query=True):
    out_fh = open(out, "wb")
    if query:
        qbed = Bed(qbed_file)
        sbed = Bed(sbed_file).get_order()
    else:
        qbed = Bed(sbed_file)
        sbed = Bed(qbed_file).get_order()
    qorder = qbed.get_order()
    pairs, pairs_dict = get_pairs(pairs_file, query)
    if query:
        qorthos = list(set([qaccn for qaccn, saccn in pairs]))
    else:
        qorthos = list(set([saccn for qaccn, saccn in pairs]))
    qaccns = get_pos(qorder, qorthos)
    for qi in qaccns:
        ### had to change from int search to term because of issues with
        ### merging, see Os12g12370
        left_ortho = qbed[qi - 1].accn in qorthos
        right_ortho = qbed[qi + 1].accn in qorthos
        line = "{0}\t{1}\t{2}\n".format(qbed[qi].accn, left_ortho, right_ortho)
        out_fh.write(line)
    out_fh.close()
class RawBed(object):
    """takes a line from habo's raw file and converts it to Brent's bed line"""
    def __init__(self, bed):
        self.bed = Bed(bed)
        self.order = self.bed.get_order()

    def raw_to_bed(self, raw_pos):
        """returns the bed fields for raw_pos as a dict"""
        bed_info = self.bed[raw_pos]
        d = {}
        d['start'] = bed_info.start
        d['end'] = bed_info.end
        d['seqid'] = bed_info.seqid
        d['accn'] = bed_info.accn
        args = bed_info.stuff
        d['strand'] = args[1]
        #d['locs'] = loc_conv(args[-2], args[-1])
        return d

    def accn_to_raw(self, accn):
        """returns the raw line inputs for accn"""
        pos = self.order[accn][0]
        seqid = self.order[accn][1].seqid
        return pos, seqid
def __init__(self, bed):
    self.bed = Bed(bed)
    self.order = self.bed.get_order()
def main(blast_file, options):
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    fp2 = []
    for x in fp:
        if x[0] == '#':
            continue
        fp2.append(x)
    blasts = sorted([BlastLine(line) for line in fp2],
                    key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']
        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts, flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts, flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w")

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w")
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if options.tandems_only:
            # just want to use this script as a tandem finder.
            sys.exit()

        # write out new .bed after tandem removal
        write_new_bed(qbed, qdups_to_mother)
        if not is_self:
            write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                                             qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        qnew_name = "%s.nolocaldups%s" % op.splitext(qbed.filename)
        snew_name = "%s.nolocaldups%s" % op.splitext(sbed.filename)
        qbed_new = Bed(qnew_name)
        sbed_new = Bed(snew_name)
        qorder = qbed_new.get_order()
        sorder = sbed_new.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    raw_name = "%s.raw" % op.splitext(blast_file)[0]
    raw_fh = open(raw_name, "w")
    write_raw(qorder, sorder, filtered_blasts, raw_fh)

    if options.write_filtered_blast:
        write_new_blast(filtered_blasts)
    ax.yaxis.set_major_formatter(formatter)
    plt.setp(ax.get_xticklabels() + ax.get_yticklabels(), color='gray', size=10)
    root.set_axis_off()
    print >> sys.stderr, "print image to %s" % image_name
    plt.savefig(image_name, dpi=600)


if __name__ == "__main__":
    import optparse

    parser = optparse.OptionParser(__doc__)
    parser.add_option("--qbed", dest="qbed", help="path to qbed")
    parser.add_option("--sbed", dest="sbed", help="path to sbed")
    (options, args) = parser.parse_args()

    if not (len(args) == 1 and options.qbed and options.sbed):
        sys.exit(parser.print_help())

    qbed = Bed(options.qbed)
    sbed = Bed(options.sbed)
    qa_file = args[0]
    image_name = op.splitext(qa_file)[0] + ".png"
    dotplot(qa_file, qbed, sbed, image_name)
def __init__(self, bed):
    self.bed = Bed(bed)
    self.order = self.bed.get_order()
def main(blast_file, options):
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)

    # mdb added 3/18/16 for Last v731
    blasts = []
    for line in fp:
        if not line.startswith("#"):
            blasts.append(BlastLine(line))
    blasts = sorted(blasts, key=lambda b: b.score, reverse=True)
    # mdb removed 3/18/16 for Last v731
    #blasts = sorted([BlastLine(line) for line in fp], \
    #                key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = options.strip_names
    for b in blasts:
        query, subject = b.query, b.subject
        #if ostrip:
        #    query, subject = gene_name(query), gene_name(subject)
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']
        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts, flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts, flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None

        if is_self:
            for s in standems:
                qtandems.join(*s)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_to_mother = qdups_to_mother
        else:
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
            sdups_to_mother = write_localdups(sdups_fh, standems, sbed)

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                                             qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]
        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")
    #write_raw(qorder, sorder, filtered_blasts, raw_fh)

    write_new_blast(filtered_blasts)
def main(anchor_file, blast_file, options):
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    _ = lambda x: x.rsplit(".", 1)[0]

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp],
                    key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _(b.query), _(b.subject)
        if query not in qorder or subject not in sorder:
            continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid
        filtered_blasts.append(b)

    all_anchors = collections.defaultdict(list)
    fp = file(anchor_file)
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()
        if a not in qorder or b not in sorder:
            continue
        qi, q = qorder[a]
        si, s = sorder[b]
        all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >>sys.stderr, chr_pair, len(anchors)
        if len(anchors) == 0:
            continue

        tree = cKDTree(anchors, leafsize=16)
        #print tree.data
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)
        #print [(d, idx) for (d, idx) in zip(dists, idxs) if idx!=tree.n]

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd == 0:
                continue  # same anchors
            if idx != tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >>fw, "\t".join((query, subject, "lifted"))
                j += 1

    print >>sys.stderr, j, "new pairs found"
def dotplot(ax, anchors, qbed, sbed, topn, axes):
    # modified from Haibao Tang's original function
    '''function to plot merged topn anchors.'''
    _ = lambda x: r"$\rm{%s}$" % x.replace(" ", r"\ ")
    get_order = lambda bed: dict((f['accn'], (i, f)) for (i, f) in enumerate(bed))
    get_len = lambda bed: sum([f.end for (i, f) in enumerate(bed)])

    qbed = Bed(qbed)
    sbed = Bed(sbed)
    xmax, ymax = len(qbed), get_len(sbed)
    print xmax, ymax
    qorder = get_order(qbed)
    chr_len = [f.end for (i, f) in enumerate(sbed)]

    # get topn hits
    data = []
    cur_q = ""
    for anchor in anchors:
        if anchor[0] != cur_q:
            n = 1
        if n > topn:
            continue
        try:
            qgene = qorder[anchor[0].split('.')[0]]
        except:
            continue
        if 'r' in anchor[0]:
            continue
        anchor[4] += sum(chr_len[0:(int(anchor[1].lstrip('chr0')) - 1)])
        if qgene[0] < xmax and anchor[4] < ymax:
            data.append((qgene[0], anchor[4]))
        cur_q = anchor[0]
        n += 1

    print 'data length: ', len(data)
    x, y = zip(*data)
    x, y = np.array(x, 'f') / xmax, np.array(y, 'f') / ymax
    ax.scatter(x, y, c='b', s=.5, lw=0, alpha=.8)
    ax.get_xaxis().set_ticks([])
    ax.get_yaxis().set_ticks([])

    xchr_labels, ychr_labels = [], []
    cbreaks = {}

    # plot the chromosome breaks
    for (seqid, beg, end) in get_breaks(qbed):
        if "random" in seqid:
            continue
        cbreaks[("query", seqid)] = (beg, end)
        xchr_labels.append((seqid, (beg + end) / 2))
        x, y = np.array([beg, beg], 'f') / xmax, [0, 1]
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
    ax.add_patch(Rectangle((.998, 0), .002, 1, lw=.2, color='y', fc='y',
                           fill=True, alpha=.8, zorder=10))

    get_breaks_subject = lambda bed: [[f.accn, f.start, f.end]
                                      for (i, f) in enumerate(bed)]
    chr_cum = 0
    for items in get_breaks_subject(sbed):
        seqid, beg, end = items
        beg += chr_cum
        end += chr_cum
        if "random" in seqid:
            continue
        cbreaks[("subject", seqid)] = (beg, end)
        ychr_labels.append((seqid, (beg + end) / 2))
        x, y = [0, 1], np.array([beg, beg], 'f') / ymax
        ax.plot(x, y, "-", color='y', alpha=.8, zorder=10)
        chr_cum = end
    ax.add_patch(Rectangle((0, 1), 1, .002, lw=.2, color='y', fc='y',
                           fill=True, alpha=.8, zorder=10))

    # plot the chromosome labels
    for label, pos in xchr_labels:
        x, y = pos * 1. / xmax - .015, 1.02
        if label[0] == "0":
            label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))
    for label, pos in ychr_labels:
        x, y = -.065, pos * 1. / ymax - .0065
        if label[0] == "0":
            label = label.replace("0", "")
        ax.text(x, y, _("%s" % label))

    # plot axis labels
    ax.text(.5, 1.06, _("%s" % axes[0]))
    ax.text(-.1, .5, _("%s" % axes[1]), rotation="vertical")
def main(anchor_file, blast_file, options):
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"

    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)
    qorder = qbed.get_order()
    sorder = sbed.get_order()

    _ = lambda x: x.rsplit(".", 1)[0]

    fp = file(blast_file)
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, sum(1 for line in fp))
    fp.seek(0)
    blasts = sorted([BlastLine(line) for line in fp],
                    key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = _(b.query), _(b.subject)
        if query not in qorder or subject not in sorder:
            continue
        qi, q = qorder[query]
        si, s = sorder[subject]
        if is_self and qi > si:
            # remove redundant a<->b to one side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q
        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key
        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid
        filtered_blasts.append(b)

    all_anchors = collections.defaultdict(list)
    fp = file(anchor_file)
    for row in fp:
        if row[0] == '#':
            continue
        a, b = row.split()
        if a not in qorder or b not in sorder:
            continue
        qi, q = qorder[a]
        si, s = sorder[b]
        all_anchors[(q.seqid, s.seqid)].append((qi, si))

    # grouping the hits based on chromosome pair for sending in find_nearby
    all_hits = collections.defaultdict(list)
    for b in filtered_blasts:
        all_hits[(b.qseqid, b.sseqid)].append((b.qi, b.si))

    # select hits that are close to the anchor list
    j = 0
    fw = sys.stdout
    for chr_pair in sorted(all_hits.keys()):
        hits = np.array(all_hits[chr_pair])
        anchors = np.array(all_anchors[chr_pair])

        print >> sys.stderr, chr_pair, len(anchors)
        if len(anchors) == 0:
            continue

        tree = cKDTree(anchors, leafsize=16)
        #print tree.data
        dists, idxs = tree.query(hits, p=1, distance_upper_bound=options.dist)
        #print [(d, idx) for (d, idx) in zip(dists, idxs) if idx!=tree.n]

        for i, (dd, idx) in enumerate(zip(dists, idxs)):
            if dd == 0:
                continue  # same anchors
            if idx != tree.n:
                qi, si = hits[i]
                query, subject = qbed[qi]["accn"], sbed[si]["accn"]
                print >> fw, "\t".join((query, subject, "lifted"))
                j += 1

    print >> sys.stderr, j, "new pairs found"