def main(blast_file, options):
    """Filter the hits in a BLAST file against two BED annotations.

    Hits are read from ``blast_file`` (lines starting with ``#`` are
    skipped), sorted by descending score, and reduced to the first
    (i.e. best-scoring) hit per (query, subject) pair whose names both
    appear in the BED orders.  The optional filters selected on
    ``options`` are then applied in this order: global density, local
    dups (tandem), repeat, cscore.  The surviving hits are handed to
    ``write_new_blast``.  Progress messages go to stderr.

    Attributes read from ``options``: ``qbed``, ``sbed``,
    ``global_density_ratio``, ``tandem_Nmax``, ``filter_repeats``,
    ``cscore`` and ``localdups``.  A falsy value disables the
    corresponding filter / output.

    NOTE: this code uses Python 2 print statements.
    """
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >>sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >>sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once: count every line, parse the non-comment ones.
    # The context manager guarantees the handle is closed, and no seek()
    # is needed, so non-seekable inputs work too.
    total_lines = 0
    blasts = []
    with open(blast_file) as fp:
        for line in fp:
            total_lines += 1
            # mdb added 3/18/16 for Last v731: skip '#' comment lines
            if not line.startswith("#"):
                blasts.append(BlastLine(line))
    print >>sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, total_lines)

    # Best score first, so the de-duplication below keeps the best hit.
    blasts.sort(key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = b.query, b.subject
        if query not in qorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >>sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >>sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >>sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                                  flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                                  flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only written when requested; otherwise
        # write_localdups receives None as its file handle.
        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None
        try:
            if is_self:
                # self-self: merge the subject-side groups into the query side
                for group in standems:
                    qtandems.join(*group)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
        finally:
            if qdups_fh is not None:
                qdups_fh.close()

        if is_self:
            sdups_to_mother = qdups_to_mother
        else:
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
            try:
                sdups_to_mother = write_localdups(sdups_fh, standems, sbed)
            finally:
                if sdups_fh is not None:
                    sdups_fh.close()

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                                             qdups_to_mother, sdups_to_mother))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

        # Drop the entries listed in the dups mappings from both annotations.
        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        # Refreshed orders; only the (currently disabled) write_raw output
        # below would consume them.
        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        # trailing comma: the next message continues on the same line
        print >>sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >>sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >>sys.stderr, "after filter (%d->%d)..." % \
                (before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")
    #write_raw(qorder, sorder, filtered_blasts, raw_fh)
    write_new_blast(filtered_blasts)
def main(blast_file, options):
    """Filter the hits in a BLAST file against two BED annotations.

    Hits are read from ``blast_file`` (lines starting with ``#`` are
    skipped), sorted by descending score, and reduced to the first
    (i.e. best-scoring) hit per (query, subject) pair whose names both
    appear in the BED orders.  The optional filters selected on
    ``options`` are then applied in this order: global density, local
    dups (tandem), repeat, cscore.  The surviving hits are handed to
    ``write_new_blast``.  Progress messages go to stderr.

    Attributes read from ``options``: ``qbed``, ``sbed``,
    ``global_density_ratio``, ``tandem_Nmax``, ``filter_repeats``,
    ``cscore`` and ``localdups``.  A falsy value disables the
    corresponding filter / output.

    NOTE: this code uses Python 2 print statements.
    """
    qbed_file, sbed_file = options.qbed, options.sbed

    # is this a self-self blast?
    is_self = (qbed_file == sbed_file)
    if is_self:
        print >> sys.stderr, "... looks like a self-self BLAST to me"

    global_density_ratio = options.global_density_ratio
    tandem_Nmax = options.tandem_Nmax
    filter_repeats = options.filter_repeats
    cscore = options.cscore
    localdups = options.localdups

    print >> sys.stderr, "read annotation files %s and %s" % (qbed_file, sbed_file)
    qbed = Bed(qbed_file)
    sbed = Bed(sbed_file)

    qorder = qbed.get_order()
    sorder = sbed.get_order()

    # Read the file once: count every line, parse the non-comment ones.
    # The context manager guarantees the handle is closed, and no seek()
    # is needed, so non-seekable inputs work too.
    total_lines = 0
    blasts = []
    with open(blast_file) as fp:
        for line in fp:
            total_lines += 1
            # mdb added 3/18/16 for Last v731: skip '#' comment lines
            if not line.startswith("#"):
                blasts.append(BlastLine(line))
    print >> sys.stderr, "read BLAST file %s (total %d lines)" % \
            (blast_file, total_lines)

    # Best score first, so the de-duplication below keeps the best hit.
    blasts.sort(key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    for b in blasts:
        query, subject = b.query, b.subject
        if query not in qorder:
            print >> sys.stderr, "WARNING: %s not in %s" % (query, qbed.filename)
            continue
        if subject not in sorder:
            print >> sys.stderr, "WARNING: %s not in %s" % (subject, sbed.filename)
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q['seqid'], s['seqid']

        filtered_blasts.append(b)

    if global_density_ratio:
        print >> sys.stderr, "running the global_density filter" + \
                "(global_density_ratio=%d)..." % options.global_density_ratio
        gene_count = len(qorder) + len(sorder)
        before_filter = len(filtered_blasts)
        filtered_blasts = filter_to_global_density(filtered_blasts, gene_count,
                                                   global_density_ratio)
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    if tandem_Nmax:
        print >> sys.stderr, "running the local dups filter (tandem_Nmax=%d)..." % tandem_Nmax

        qtandems = tandem_grouper(qbed, filtered_blasts,
                                  flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                                  flip=False, tandem_Nmax=tandem_Nmax)

        # The .localdups files are only written when requested; otherwise
        # write_localdups receives None as its file handle.
        qdups_fh = open(op.splitext(qbed_file)[0] + ".localdups", "w") if localdups else None
        try:
            if is_self:
                # self-self: merge the subject-side groups into the query side
                for group in standems:
                    qtandems.join(*group)
            qdups_to_mother = write_localdups(qdups_fh, qtandems, qbed)
        finally:
            if qdups_fh is not None:
                qdups_fh.close()

        if is_self:
            sdups_to_mother = qdups_to_mother
        else:
            sdups_fh = open(op.splitext(sbed_file)[0] + ".localdups", "w") if localdups else None
            try:
                sdups_to_mother = write_localdups(sdups_fh, standems, sbed)
            finally:
                if sdups_fh is not None:
                    sdups_fh.close()

        if localdups:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts,
                                             qdups_to_mother, sdups_to_mother))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

        # Drop the entries listed in the dups mappings from both annotations.
        qbed.beds = [x for x in qbed if x["accn"] not in qdups_to_mother]
        sbed.beds = [x for x in sbed if x["accn"] not in sdups_to_mother]

        # Refreshed orders; only the (currently disabled) write_raw output
        # below would consume them.
        qorder = qbed.get_order()
        sorder = sbed.get_order()

    if filter_repeats:
        before_filter = len(filtered_blasts)
        # trailing comma: the next message continues on the same line
        print >> sys.stderr, "running the repeat filter",
        filtered_blasts = list(filter_repeat(filtered_blasts))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    if cscore:
        before_filter = len(filtered_blasts)
        print >> sys.stderr, "running the cscore filter (cscore>=%.2f)..." % cscore
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        print >> sys.stderr, "after filter (%d->%d)..." % (
            before_filter, len(filtered_blasts))

    # this is the final output we will write to after BLAST filters
    #raw_name = "%s.raw" % op.splitext(blast_file)[0]
    #raw_fh = open(raw_name, "w")
    #write_raw(qorder, sorder, filtered_blasts, raw_fh)
    write_new_blast(filtered_blasts)