def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Looks up the query/subject BED files for `lastfile`, creates `odir`
    (via mkdir(odir, overwrite=True)) and changes the working directory into
    it, writes a three-column anchors file (query gene, subject gene,
    integer score) from the hits in `lastfile`, and symlinks the subject BED
    file into the new working directory.

    `tourfile` is accepted but not used by this function.

    Returns a tuple (anchorsfile, qbedfile, contig_to_beds), where qbedfile
    is an absolute path and contig_to_beds is dict(qbed.sub_beds()).
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    # Make the BED paths absolute because the working directory changes below.
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile, named after the first two dot-separated parts of the
    # LAST file's basename.
    # NOTE(review): `lastfile` is opened after the chdir above, so a relative
    # path would be resolved against `odir` -- confirm callers pass a path
    # that is still valid there.
    anchorsfile = ".".join(op.basename(lastfile).split(".", 2)[:2]) + ".anchors"
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            fields = (gene_name(b.query), gene_name(b.subject), str(int(b.score)))
            # fw.write works on both Python 2 and 3, unlike `print >> fw`.
            fw.write("\t".join(fields) + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def prepare_synteny(tourfile, lastfile, odir, p, opts):
    """
    Prepare synteny plots for movie().

    Steps, in order:
      1. resolve the query/subject BED files for `lastfile` and make their
         paths absolute;
      2. load the query BED and build contig_to_beds from qbed.sub_beds();
      3. create `odir` with mkdir(odir, overwrite=True) and chdir into it;
      4. write "<prefix>.anchors" with one tab-separated line per hit in
         `lastfile`: query gene, subject gene, integer score;
      5. symlink the subject BED into the current directory.

    `tourfile` is not used here.

    Returns (anchorsfile, qbedfile, contig_to_beds).
    """
    qbedfile, sbedfile = get_bed_filenames(lastfile, p, opts)
    qbedfile = op.abspath(qbedfile)
    sbedfile = op.abspath(sbedfile)

    qbed = Bed(qbedfile, sorted=False)
    contig_to_beds = dict(qbed.sub_beds())

    # Create a separate directory for the subplots and movie
    mkdir(odir, overwrite=True)
    os.chdir(odir)
    logging.debug("Change into subdir `{}`".format(odir))

    # Make anchorsfile: keep only the first two dot-separated name parts.
    prefix = ".".join(op.basename(lastfile).split(".", 2)[:2])
    anchorsfile = prefix + ".anchors"
    # The context manager guarantees the handle is closed even when parsing
    # the LAST file raises part-way through.
    with open(anchorsfile, "w") as fw:
        for b in Blast(lastfile):
            line = "\t".join(
                (gene_name(b.query), gene_name(b.subject), str(int(b.score)))
            )
            # Version-neutral replacement for the Python 2 `print >> fw`.
            fw.write(line + "\n")

    # Symlink sbed
    symlink(sbedfile, op.basename(sbedfile))

    return anchorsfile, qbedfile, contig_to_beds
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    # The docstring above is handed to OptionParser as the usage text, so its
    # wording is part of the program output.
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"

    # Collect unordered gene pairs. Each pair is stored smaller-name-first so
    # that a<->b and b<->a collapse into a single entry.
    seen = set()
    with open(lastfile) as fp:
        for row in fp:
            c = BlastLine(row)
            a, b = gene_name(c.query), gene_name(c.subject)
            if a > b:
                a, b = b, a
            seen.add((a, b))

    # Sorted so the table is reproducible between runs (set order is not).
    with open(blast_table, "w") as fw:
        for a, b in sorted(seen):
            fw.write("\t".join((a, b)) + "\n")
    logging.debug(
        "A total of {0} pairs written to `{1}`".format(len(seen), blast_table)
    )

    # Fixed settings appended after the per-genome stanzas.
    settings = (
        "cluster_type=colinear",
        "tandem_gap=10",
        "prob_cutoff=0.001",
        "gap_size=20",
        "cluster_gap=20",
        "q_value=0.9",
        "anchor_points=4",
        "alignment_method=gg2",
        "max_gaps_in_alignment=20",
        "output_path=i-adhore_out",
        "number_of_threads=4",
    )
    with open("config.txt", "w") as fw:
        for bedfile in bedfiles:
            pf, stanza = write_lst(bedfile)
            fw.write("genome={0}\n".format(pf))
            for seqid, fname in stanza:
                fw.write(" ".join((seqid, fname)) + "\n")
            # Blank line terminates each genome stanza.
            fw.write("\n")

        fw.write("blast_table={0}\n".format(blast_table))
        for setting in settings:
            fw.write(setting + "\n")
def iadhore(args):
    """
    %prog iadhore athaliana.athaliana.last athaliana.bed

    Wrap around iADHoRe.
    """
    # NOTE: the docstring doubles as the OptionParser usage text.
    p = OptionParser(iadhore.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    bedfiles = args[1:]
    blast_table = "blast_table.txt"

    # Gather gene pairs, normalising the order inside each pair so that
    # reciprocal hits are written only once.
    seen = set()
    with open(lastfile) as fp:  # previously this handle was never closed
        for row in fp:
            c = BlastLine(row)
            a, b = gene_name(c.query), gene_name(c.subject)
            if a > b:
                a, b = b, a
            seen.add((a, b))

    # Sorted for a reproducible table; set iteration order is arbitrary.
    with open(blast_table, "w") as fw:
        for a, b in sorted(seen):
            print("\t".join((a, b)), file=fw)
    logging.debug(
        "A total of {0} pairs written to `{1}`".format(len(seen), blast_table)
    )

    # Constant part of the configuration, written after the genome stanzas.
    settings = (
        "cluster_type=colinear",
        "tandem_gap=10",
        "prob_cutoff=0.001",
        "gap_size=20",
        "cluster_gap=20",
        "q_value=0.9",
        "anchor_points=4",
        "alignment_method=gg2",
        "max_gaps_in_alignment=20",
        "output_path=i-adhore_out",
        "number_of_threads=4",
    )
    with open("config.txt", "w") as fw:
        for bedfile in bedfiles:
            pf, stanza = write_lst(bedfile)
            print("genome={0}".format(pf), file=fw)
            for seqid, fname in stanza:
                print(" ".join((seqid, fname)), file=fw)
            # Blank line closes the stanza for this genome.
            print(file=fw)

        print("blast_table={0}".format(blast_table), file=fw)
        for setting in settings:
            print(setting, file=fw)
def __init__(self, row, strip_names=False):
    """
    Parse one comma-separated record.

    Column 0 is stored as `name`; columns 1-4 are converted with
    self.get_float() into `yn_ks`, `yn_ka`, `ng_ks` and `ng_ka`. `ks` is an
    alias of `ng_ks`. When the name contains ";", it is split into `gene_a`
    and `gene_b`, which are passed through gene_name() if `strip_names`
    is set.
    """
    fields = row.strip().split(",")
    self.name = fields[0]
    self.yn_ks, self.yn_ka, self.ng_ks, self.ng_ka = (
        self.get_float(fields[column]) for column in (1, 2, 3, 4)
    )
    self.ks = self.ng_ks

    # gene_a / gene_b are only defined for names of the form "a;b".
    if ";" not in self.name:
        return

    first, second = self.name.split(";")
    if strip_names:
        first, second = gene_name(first), gene_name(second)
    self.gene_a = first
    self.gene_b = second
def read_ks_file(ks_file, strip_names=False):
    """
    Load gene-pair records from a comma-separated Ks file.

    Rows whose first column is "name" are treated as the header and skipped,
    as are blank rows. The first column holds ";"-joined gene names; each is
    passed through gene_name() when `strip_names` is set. Every remaining
    column is converted to float, with -1 substituted for values that do not
    parse.

    Returns a list of KsLine records, one per data row.
    """
    import sys

    from jcvi.utils.cbook import gene_name

    # The csv module wants a binary handle on Python 2 but a text handle
    # opened with newline="" on Python 3.
    if sys.version_info[0] >= 3:
        handle = open(ks_file, "r", newline="")
    else:
        handle = open(ks_file, "rb")

    data = []
    with handle:
        for row in csv.reader(handle):
            if not row:  # blank line: csv yields an empty list
                continue
            if row[0] == "name":  # header
                continue

            if strip_names:
                row[0] = ";".join(gene_name(x) for x in row[0].split(";"))

            for i in range(1, len(row)):
                try:
                    row[i] = float(row[i])
                except ValueError:
                    # Unparseable value (e.g. empty or "NA") becomes -1.
                    row[i] = -1

            data.append(KsLine._make(row))

    logging.debug(
        'File `{0}` contains a total of {1} gene pairs'.format(ks_file, len(data))
    )

    return data
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Read the BLAST file and convert gene names into coordinates.

    `qorder` / `sorder` map a gene name to an (index, feature) pair. Hits are
    dropped when query equals subject, when either gene is missing from its
    order table, or when the (query, subject) pair was already seen. With
    `ostrip`, names are first passed through gene_name(). With `is_self`,
    each pair is flipped so the lower index comes first, and pairs on the
    same seqid fewer than 40 positions apart are discarded.

    Each kept hit gets qseqid/sseqid, qi/si and query/subject overwritten.
    Returns the list of kept hits.
    """
    kept = []
    visited = set()

    for hit in Blast(blast_file):
        qname, sname = hit.query, hit.subject
        if qname == sname:
            continue

        if ostrip:
            qname, sname = gene_name(qname), gene_name(sname)
        if not (qname in qorder and sname in sorder):
            continue

        qrank, qfeat = qorder[qname]
        srank, sfeat = sorder[sname]

        if is_self:
            # remove redundant a<->b to one side when doing self-self BLAST
            if qrank > srank:
                qname, sname = sname, qname
                qrank, srank = srank, qrank
                qfeat, sfeat = sfeat, qfeat
            # Too close to diagonal! possible tandem repeats
            if qfeat.seqid == sfeat.seqid and srank - qrank < 40:
                continue

        pair = (qname, sname)
        if pair in visited:
            continue
        visited.add(pair)

        hit.qseqid, hit.sseqid = qfeat.seqid, sfeat.seqid
        hit.qi, hit.si = qrank, srank
        hit.query, hit.subject = qname, sname
        kept.append(hit)

    logging.debug(
        "A total of {0} BLAST imported from `{1}`.".format(len(kept), blast_file)
    )

    return kept
def main(blast_file, cds_file, bed_file, N=3):
    """
    Group proximal paralogues found in a BLAST file.

    A hit is used when its `hitlen` is at least half (integer division) of
    the shorter of the two CDS lengths. Query and subject, normalised with
    gene_name(), are joined into one family when both lie on the same seqid
    and their BED indices differ by at most `N`.

    Families with two or more members are printed to stdout as
    comma-separated lines; a three-line summary goes to stderr.
    """
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    with open(blast_file) as fp:
        for row in fp:
            b = BlastLine(row)
            query_len = sizes[b.query]
            subject_len = sizes[b.subject]
            # `//` keeps the Python 2 integer-division threshold on Python 3.
            if b.hitlen < min(query_len, subject_len) // 2:
                continue

            query, subject = gene_name(b.query), gene_name(b.subject)
            qi, q = bed[query]
            si, s = bed[subject]

            if q.seqid == s.seqid and abs(qi - si) <= N:
                g.join(query, subject)

    # dump the grouper
    ngenes = 0
    families = []
    for group in sorted(g):
        if len(group) < 2:
            continue
        members = sorted(group)
        sys.stdout.write(",".join(members) + "\n")
        ngenes += len(members)
        families.append(members)
    nfamilies = len(families)

    # max() raises on an empty sequence, so report an empty family instead.
    longest_family = max(families, key=len) if families else []

    # generate reports
    sys.stderr.write("Proximal paralogues (dist=%d):\n" % N)
    sys.stderr.write("Total %d genes in %d families\n" % (ngenes, nfamilies))
    sys.stderr.write(
        "Longest families (%d): %s\n"
        % (len(longest_family), ",".join(longest_family))
    )
def main(blast_file, cds_file, bed_file, N=3):
    """
    Report families of proximal paralogues.

    Reads CDS lengths from `cds_file` and gene positions from `bed_file`,
    then scans `blast_file`. A hit is skipped when `hitlen` is below half
    (integer division) of the shorter CDS. Remaining query/subject pairs,
    after gene_name(), are joined when they share a seqid and sit within `N`
    positions of each other in the BED order.

    Output: one comma-separated line per family (size >= 2) on stdout and a
    three-line summary on stderr.
    """
    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file).order

    # filter the blast file
    g = Grouper()
    with open(blast_file) as fp:  # closed deterministically, even on errors
        for row in fp:
            b = BlastLine(row)
            shorter = min(sizes[b.query], sizes[b.subject])
            # Floor division reproduces the Python 2 `/ 2` on integers.
            if b.hitlen < shorter // 2:
                continue

            query, subject = gene_name(b.query), gene_name(b.subject)
            qi, q = bed[query]
            si, s = bed[subject]

            if q.seqid == s.seqid and abs(qi - si) <= N:
                g.join(query, subject)

    # dump the grouper
    families = [sorted(group) for group in sorted(g) if len(group) >= 2]
    for members in families:
        sys.stdout.write(",".join(members) + "\n")
    ngenes = sum(len(members) for members in families)
    nfamilies = len(families)

    # Guard against "no family found": max() would raise ValueError.
    longest_family = max(families, key=len) if families else []

    # generate reports
    report = (
        "Proximal paralogues (dist=%d):" % N,
        "Total %d genes in %d families" % (ngenes, nfamilies),
        "Longest families (%d): %s"
        % (len(longest_family), ",".join(longest_family)),
    )
    for line in report:
        sys.stderr.write(line + "\n")
def read_blast(blast_file, qorder, sorder, is_self=False, ostrip=True):
    """
    Read the BLAST file and convert gene names into coordinates.

    `qorder` / `sorder` map a gene name to an (index, feature) pair. A hit is
    skipped when query equals subject, when either gene is absent from its
    order table, or when its (query, subject) pair was already seen. With
    `ostrip`, names are first passed through gene_name(). With `is_self`, a
    pair whose query index exceeds its subject index is flipped.

    Each kept hit gets qseqid/sseqid, qi/si and query/subject overwritten.
    Returns the list of kept hits.
    """
    filtered_blast = []
    seen = set()

    # The context manager closes the handle; it used to be left open.
    with open(blast_file) as fp:
        for row in fp:
            b = BlastLine(row)
            query, subject = b.query, b.subject
            if query == subject:
                continue
            if ostrip:
                query, subject = gene_name(query), gene_name(subject)
            if query not in qorder or subject not in sorder:
                continue

            qi, q = qorder[query]
            si, s = sorder[subject]

            if is_self and qi > si:
                # remove redundant a<->b to one side when doing self-self BLAST
                query, subject = subject, query
                qi, si = si, qi
                q, s = s, q

            key = query, subject
            if key in seen:
                continue
            seen.add(key)

            b.qseqid, b.sseqid = q.seqid, s.seqid
            b.qi, b.si = qi, si
            b.query, b.subject = query, subject

            filtered_blast.append(b)

    logging.debug(
        "A total of {0} BLAST imported from `{1}`.".format(
            len(filtered_blast), blast_file
        )
    )

    return filtered_blast
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    # The docstring above is used as the OptionParser usage text.
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v", dest="inverse", default=False, action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = bool(opts.inverse)
    ostrip = opts.strip_names
    fw = must_open(opts.outfile, "w")

    ntotal = nkeep = 0
    try:
        ids = SetFile(idsfile)
        if ostrip:
            # Only the ids are normalised; BED accessions are compared as-is.
            ids = set(gene_name(x) for x in ids)

        for b in Bed(bedfile):
            ntotal += 1
            # Keep matches normally, non-matches when -v is given.
            if (b.accn in ids) != inverse:
                nkeep += 1
                # Version-neutral replacement for `print >> fw, b`.
                fw.write(str(b) + "\n")
    finally:
        fw.close()

    logging.debug(
        "Stats: {0} features kept.".format(percentage(nkeep, ntotal))
    )
def anchorline(self):
    """
    Return a tab-separated line: gene_name(gene_a), gene_name(gene_b) and
    `ks` formatted with three decimals.
    """
    first = gene_name(self.gene_a)
    second = gene_name(self.gene_b)
    ks_text = "{:.3f}".format(self.ks)
    return "\t".join((first, second, ks_text))
def benchmark(args):
    """
    %prog benchmark at bedfile

    Compare SynFind, MCScanx, iADHoRe and OrthoFinder against the truth.
    """
    # The docstring above is used as the OptionParser usage text.
    p = OptionParser(benchmark.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pf, bedfile = args
    # Input files are all derived from the prefix `pf`.
    truthfile = pf + ".truth"
    synfindfile = pf + ".synfind"
    mcscanxfile = pf + ".mcscanx"
    iadhorefile = pf + ".iadhore"
    orthofinderfile = pf + ".orthofinder"
    pivots = set([x.accn for x in Bed(bedfile)])

    # Every pair is stored as a sorted tuple so that (a, b) == (b, a).
    truth = set()
    with open(truthfile) as fp:
        for row in fp:
            a, b = row.strip().split("\t")[:2]
            pivots.add(a)
            truth.add(tuple(sorted((a, b))))
    logging.debug("Truth: {0} pairs".format(len(truth)))

    fp = open(synfindfile)
    try:
        benchmarkfile = pf + ".benchmark"
        fw = must_open(benchmarkfile, "w")
    except Exception:
        fp.close()
        raise

    try:
        synfind = set()
        with fp:
            for row in fp:
                atoms = row.strip().split("\t")
                query, hit, tag = atoms[:3]
                # Only rows tagged "S" are counted.
                if tag != "S":
                    continue
                synfind.add(tuple(sorted((query, hit))))
        calc_sensitivity_specificity(synfind, truth, "SynFind", fw)

        mcscanx = set()
        with open(mcscanxfile) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                # The gene pair follows the first ":" on the line.
                atoms = row.strip().split(":")[1].split()
                query, hit = atoms[:2]
                mcscanx.add(tuple(sorted((query, hit))))
        calc_sensitivity_specificity(mcscanx, truth, "MCScanX", fw)

        iadhore = set()
        with open(iadhorefile) as fp:
            # Skip the header; the default avoids StopIteration on an
            # empty file.
            next(fp, None)
            for row in fp:
                atoms = row.strip().split("\t")
                query, hit = atoms[3:5]
                iadhore.add(tuple(sorted((query, hit))))
        calc_sensitivity_specificity(iadhore, truth, "iADHoRe", fw)

        orthofinder = set()
        with open(orthofinderfile) as fp:
            next(fp, None)  # header
            for row in fp:
                row = row.replace('"', "")
                atoms = row.replace(",", " ").split()
                # Tokens starting with "OG" are dropped.
                genes = [x.strip() for x in atoms if not x.startswith("OG")]
                genes = [gene_name(x) for x in genes]
                pps = [x for x in genes if x in pivots]
                # Pair every pivot gene with every other gene on the row.
                for pivot in pps:
                    for g in genes:
                        if pivot == g:
                            continue
                        orthofinder.add(tuple(sorted((pivot, g))))
        calc_sensitivity_specificity(orthofinder, truth, "OrthoFinder", fw)
    finally:
        fw.close()
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """
    Group tandem (proximal) paralogues, or whole gene families.

    Parameters
      blast_file  BLAST tabular file, one hit per line.
      cds_file    FASTA file used to look up sequence lengths.
      bed_file    BED file giving the gene order.
      N           maximum index distance between family members; replaced by
                  1e5 when `genefam` is set.
      P           minimum hit length, as a percentage of the shorter
                  sequence.
      is_self     if true, gene pairs are taken directly from the hits; if
                  false, homolog groups are built first and BED neighbours
                  within `N` are joined when they are homologs of similar
                  length.
      evalue      maximum e-value of an accepted hit.
      strip_name  passed as second argument to gene_name().
      ofile       destination handed to must_open() for the family lines.
      genefam     group regardless of seqid / position.

    Families with at least two members are written to `ofile`; a summary is
    written to stderr. Returns the list of families (sorted member lists).
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    if is_self:
        # filter the blast file
        g = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                qi, q = order[query]
                si, s = order[subject]

                if abs(qi - si) <= N and b.evalue <= evalue:
                    if genefam or q.seqid == s.seqid:
                        g.join(query, subject)

    else:
        homologs = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                query_len = sizes[b.query]
                subject_len = sizes[b.subject]
                if b.hitlen < min(query_len, subject_len) * P / 100.:
                    continue
                if b.evalue > evalue:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                for x in range(1, N + 1):
                    # Stop at the start of the BED: a negative index would
                    # wrap around (or raise IndexError on a short BED).
                    if i - x < 0:
                        break
                    neighbor = bed[i - x]
                    if neighbor.seqid != atom.seqid:
                        continue
                    if not homologs.joined(neighbor.accn, atom.accn):
                        continue

                    leni = sizes[bed[i].accn]
                    lenx = sizes[neighbor.accn]
                    # Reject pairs whose lengths differ by more than
                    # (100 - P)% of the longer one.
                    if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                        continue
                    g.join(neighbor.accn, atom.accn)

    # dump the grouper
    # The handle is not closed here: the default target is sys.stderr.
    fw = must_open(ofile, "w")
    ngenes, nfamilies = 0, 0
    families = []
    for group in sorted(g):
        if len(group) >= 2:
            members = sorted(group)
            fw.write(",".join(members) + "\n")
            ngenes += len(members)
            nfamilies += 1
            families.append(members)

    # max() raises ValueError on an empty list; report an empty family.
    longest_family = max(families, key=len) if families else []

    # generate reports
    sys.stderr.write("Proximal paralogues (dist=%d):\n" % N)
    sys.stderr.write("Total %d genes in %d families\n" % (ngenes, nfamilies))
    sys.stderr.write(
        "Longest families (%d): %s\n"
        % (len(longest_family), ",".join(longest_family))
    )

    return families
def test_gene_name(input, output):
    """
    Check that gene_name(input) equals `output`.

    NOTE(review): the arguments are presumably supplied by a test
    parametrization that is not visible here.
    """
    from jcvi.utils.cbook import gene_name

    actual = gene_name(input)
    assert actual == output
def _loss_pick_hit(hits, proxy, as_range):
    """
    Return ("S", range) for the first hit overlapping `proxy`, otherwise
    ("NS", range of the first hit). `as_range` converts a hit to a tuple.
    """
    for hit in hits:
        hsp = as_range(hit)
        if range_overlap(proxy, hsp):
            return "S", hsp
    return "NS", as_range(hits[0])


def loss(args):
    """
    %prog loss a.b.i1.blocks [a.b-genomic.blast]

    Extract likely gene loss candidates between genome a and b.
    """
    # The docstring above is used as the OptionParser usage text.
    p = OptionParser(loss.__doc__)
    p.add_option("--bed", default=False, action="store_true",
                 help="Genomic BLAST is in bed format [default: %default]")
    p.add_option("--gdist", default=20, type="int",
                 help="Gene distance [default: %default]")
    p.add_option("--bdist", default=20000, type="int",
                 help="Base pair distance [default: %default]")
    p.set_beds()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    blocksfile = args[0]
    emptyblast = (len(args) == 1)
    if emptyblast:
        # No genomic BLAST given: use an empty placeholder, removed at the end.
        genomicblast = "empty.blast"
        sh("touch {0}".format(genomicblast))
    else:
        genomicblast = args[1]

    gdist, bdist = opts.gdist, opts.bdist
    qbed, sbed, qorder, sorder, is_self = check_beds(blocksfile, p, opts)

    # genetrack: query gene -> second column of the blocks file.
    # proxytrack: query gene -> (seqid, start, end) region.
    blocks = []
    genetrack = {}
    proxytrack = {}
    with open(blocksfile) as fp:
        for row in fp:
            a, b = row.split()
            genetrack[a] = b
            blocks.append((a, b))

    # Runs of consecutive rows sharing the same second column.
    data = [(key, list(rows))
            for key, rows in groupby(blocks, key=lambda x: x[-1])]

    imax = len(data) - 1
    for i, (key, rows) in enumerate(data):
        # Only interior runs of "." (no partner) are considered.
        if i == 0 or i == imax:
            continue
        if key != '.':
            continue

        before, br = data[i - 1]
        after, ar = data[i + 1]
        bi, bx = sorder[before]
        ai, ax = sorder[after]
        dist = abs(bi - ai)
        if bx.seqid != ax.seqid or dist > gdist:
            continue

        # Region spanned by the two flanking genes, padded by bdist.
        start, end = range_minmax(((bx.start, bx.end), (ax.start, ax.end)))
        start, end = max(start - bdist, 1), end + bdist
        proxy = (bx.seqid, start, end)
        for a, b in rows:
            proxytrack[a] = proxy

    tags = {}
    if opts.bed:
        bed = Bed(genomicblast, sorted=False)
        key = lambda x: gene_name(x.accn.rsplit(".", 1)[0])
        for query, bb in groupby(bed, key=key):
            bb = list(bb)
            if query not in proxytrack:
                continue

            tag, hsp = _loss_pick_hit(
                bb, proxytrack[query], lambda b: (b.seqid, b.start, b.end)
            )
            proxytrack[query] = hsp
            tags[query] = tag

    else:
        blast = Blast(genomicblast)
        for query, bb in blast.iter_hits():
            bb = list(bb)
            query = gene_name(query)
            if query not in proxytrack:
                continue

            tag, hsp = _loss_pick_hit(
                bb, proxytrack[query], lambda b: (b.subject, b.sstart, b.sstop)
            )
            proxytrack[query] = hsp
            tags[query] = tag

    for b in qbed:
        accn = b.accn
        target_region = genetrack[accn]
        if accn in proxytrack:
            target_region = region_str(proxytrack[accn])
            # [S]/[NS] come from the hit search; [NF] means no hit was seen.
            if accn in tags:
                ptag = "[{0}]".format(tags[accn])
            else:
                ptag = "[NF]"
            target_region = ptag + target_region

        # Version-neutral replacement for the Python 2 print statement.
        sys.stdout.write("\t".join((b.seqid, accn, target_region)) + "\n")

    if emptyblast:
        sh("rm -f {0}".format(genomicblast))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above is used as the OptionParser usage text.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option(
        "--cutoff",
        default=90,
        type="int",
        help="Coverage cutoff to call gene missing",
    )
    p.add_option(
        "--flank",
        default=2000,
        type="int",
        help="Get the seq of size on two ends",
    )
    p.add_option(
        "--maxsize",
        default=50000,
        type="int",
        help="Maximum size of patchers to be replaced",
    )
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is accepted on the command line but not used below.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are closed on exit; paste.bed and the .ids file were
    # previously left open.
    with open(extrabed, "w") as fw, open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes on the same side of the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set([gene_name(x.accn) for x in beds]))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): gi - 1 wraps to the last feature for the
                    # first gene and gi + 1 raises IndexError for the last
                    # gene -- confirm whether that can occur.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        print(
                            "\t".join(
                                str(x)
                                for x in (seqid, leftb.start, rightb.end, gb.accn)
                            ),
                            file=fwp,
                        )

                    print(leftb, file=fwe)
                    print(gb, file=fwe)
                    print(rightb, file=fwe)
                    print(
                        "L:{0} R:{1} [{2}]".format(
                            distance_to_left, distance_to_right, label
                        ),
                        file=fwe,
                    )
                    print(gb.accn, file=fw_ids)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                print(
                    "\t".join(str(x) for x in (seqid, rmin - 1, rmax, name)),
                    file=fw,
                )

    # extra.bed must be complete on disk before it is merged.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def pastegenes(args):
    """
    %prog pastegenes coverage.list old.genes.bed new.genes.bed old.assembly

    Paste in zero or low coverage genes.  For a set of neighboring genes
    missing, add the whole cassette as unplaced scaffolds. For singletons the
    program will try to make a patch.
    """
    # The docstring above is used as the OptionParser usage text.
    from jcvi.formats.base import DictFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(pastegenes.__doc__)
    p.add_option("--cutoff", default=90, type="int",
                 help="Coverage cutoff to call gene missing [default: %default]")
    p.add_option("--flank", default=2000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    p.add_option("--maxsize", default=50000, type="int",
                 help="Maximum size of patchers to be replaced [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    # `newbed` is accepted on the command line but not used below.
    coveragefile, oldbed, newbed, oldassembly = args
    cutoff = opts.cutoff
    flank = opts.flank
    maxsize = opts.maxsize

    coverage = DictFile(coveragefile, valuepos=2, cast=float)

    obed = Bed(oldbed)
    order = obed.order
    bed = [x for x in obed if x.accn in coverage]
    key = lambda x: coverage[x.accn] >= cutoff

    extrabed = "extra.bed"
    extendbed = "extend.bed"
    pastebed = "paste.bed"

    def emit(handle, text):
        # Version-neutral replacement for the Python 2 `print >> handle`.
        handle.write(str(text) + "\n")

    singletons, large, large_genes = 0, 0, 0
    # All four outputs are closed on exit; paste.bed and the .ids file were
    # previously left open.
    with open(extrabed, "w") as fw, open(extendbed, "w") as fwe, \
            open(pastebed, "w") as fwp, \
            open(extendbed + ".ids", "w") as fw_ids:
        for seqid, chrbed in groupby(bed, key=lambda x: x.seqid):
            chrbed = list(chrbed)
            # Runs of consecutive genes on the same side of the cutoff.
            for good, beds in groupby(chrbed, key=key):
                if good:
                    continue

                beds = list(beds)
                blocksize = len(set([gene_name(x.accn) for x in beds]))
                if blocksize == 1:
                    singletons += 1
                    accn = beds[0].accn
                    gi, gb = order[accn]
                    # NOTE(review): gi - 1 wraps to the last feature for the
                    # first gene and gi + 1 raises IndexError for the last
                    # gene -- confirm whether that can occur.
                    leftb = obed[gi - 1]
                    rightb = obed[gi + 1]
                    leftr = leftb.range
                    rightr = rightb.range
                    cur = gb.range
                    distance_to_left, oo = range_distance(leftr, cur)
                    distance_to_right, oo = range_distance(cur, rightr)
                    span, oo = range_distance(leftr, rightr)

                    if 0 < distance_to_left <= distance_to_right:
                        label = "LEFT"
                    else:
                        label = "RIGHT"

                    if 0 < span <= maxsize:
                        emit(fwp, "\t".join(
                            str(x)
                            for x in (seqid, leftb.start, rightb.end, gb.accn)
                        ))

                    emit(fwe, leftb)
                    emit(fwe, gb)
                    emit(fwe, rightb)
                    emit(fwe, "L:{0} R:{1} [{2}]".format(
                        distance_to_left, distance_to_right, label))
                    emit(fw_ids, gb.accn)
                    continue

                large += 1
                large_genes += blocksize

                ranges = [(x.start, x.end) for x in beds]
                rmin, rmax = range_minmax(ranges)
                rmin -= flank
                rmax += flank

                name = "-".join((beds[0].accn, beds[-1].accn))
                emit(fw, "\t".join(
                    str(x) for x in (seqid, rmin - 1, rmax, name)))

    # extra.bed must be complete on disk before it is merged.
    extrabed = mergeBed(extrabed, d=flank, nms=True)
    fastaFromBed(extrabed, oldassembly, name=True)
    summary([extrabed])

    logging.debug("Singleton blocks : {0}".format(singletons))
    logging.debug("Large blocks : {0} ({1} genes)".format(large, large_genes))
def blastfilter_main(blast_file, p, opts):
    """
    Filter a BLAST file and write the result to `<blast_file>.filtered`.

    Steps:
      1. Load all hits and visit them from highest to lowest score.
      2. Drop self hits, hits whose genes are missing from the BED order
         tables (a warning is logged for the first 100), and repeated
         (query, subject) pairs. In self-vs-self mode pairs are flipped so
         the lower index comes first.
      3. Optionally apply the C-score filter (opts.cscore).
      4. Optionally apply the local duplicates filter (opts.tandem_Nmax);
         with opts.tandems_only, `.localdups` files and new BED files are
         also written.

    `p` and `opts` are the option parser and parsed options, forwarded to
    check_beds().
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # Count the non-comment lines for the log message only.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    bl = Blast(blast_file)
    # Highest score first, so the best hit of each pair is the one kept.
    blasts = sorted(bl, key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        # Query is checked before subject; only the first miss is reported.
        missing = None
        if query not in qorder:
            missing = (query, qbed.filename)
        elif subject not in sorder:
            missing = (subject, sbed.filename)
        if missing is not None:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(*missing))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None
        sdups_fh = None
        try:
            if is_self:
                # Query and subject are the same genome: merge both groupers.
                for s in standems:
                    qtandems.join(*s)
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_to_mother = qdups_to_mother
            else:
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                        if opts.tandems_only else None
                sdups_to_mother = write_localdups(standems, sbed, sdups_fh)
        finally:
            # These handles used to be left open; close() is harmless if
            # they were already closed.
            for fh in (qdups_fh, sdups_fh):
                if fh is not None:
                    fh.close()

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)
def benchmark(args):
    """
    %prog benchmark at bedfile

    Compare SynFind, MCScanx, iADHoRe and OrthoFinder against the truth.
    """
    # The docstring above is used as the OptionParser usage text.
    p = OptionParser(benchmark.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pf, bedfile = args
    # Input files are all derived from the prefix `pf`.
    truthfile = pf + ".truth"
    synfindfile = pf + ".synfind"
    mcscanxfile = pf + ".mcscanx"
    iadhorefile = pf + ".iadhore"
    orthofinderfile = pf + ".orthofinder"
    pivots = set([x.accn for x in Bed(bedfile)])

    def as_pair(a, b):
        # Order-independent representation of a gene pair.
        return tuple(sorted((a, b)))

    truth = set()
    with open(truthfile) as fp:
        for row in fp:
            a, b = row.strip().split("\t")[:2]
            pivots.add(a)
            truth.add(as_pair(a, b))
    logging.debug("Truth: {0} pairs".format(len(truth)))

    fp = open(synfindfile)
    try:
        benchmarkfile = pf + ".benchmark"
        fw = must_open(benchmarkfile, "w")
    except Exception:
        fp.close()
        raise

    try:
        synfind = set()
        with fp:
            for row in fp:
                atoms = row.strip().split("\t")
                query, hit, tag = atoms[:3]
                # Only rows tagged "S" are counted.
                if tag != "S":
                    continue
                synfind.add(as_pair(query, hit))
        calc_sensitivity_specificity(synfind, truth, "SynFind", fw)

        mcscanx = set()
        with open(mcscanxfile) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                # The gene pair follows the first ":" on the line.
                atoms = row.strip().split(":")[1].split()
                query, hit = atoms[:2]
                mcscanx.add(as_pair(query, hit))
        calc_sensitivity_specificity(mcscanx, truth, "MCScanX", fw)

        iadhore = set()
        with open(iadhorefile) as fp:
            # next(fp, None) replaces the Python-2-only fp.next() and does
            # not raise on an empty file.
            next(fp, None)
            for row in fp:
                atoms = row.strip().split("\t")
                query, hit = atoms[3:5]
                iadhore.add(as_pair(query, hit))
        calc_sensitivity_specificity(iadhore, truth, "iADHoRe", fw)

        orthofinder = set()
        with open(orthofinderfile) as fp:
            next(fp, None)  # header
            for row in fp:
                row = row.replace('"', "")
                atoms = row.replace(",", " ").split()
                # Tokens starting with "OG" are dropped.
                genes = [x.strip() for x in atoms if not x.startswith("OG")]
                genes = [gene_name(x) for x in genes]
                pps = [x for x in genes if x in pivots]
                # Pair every pivot gene with every other gene on the row.
                for pivot in pps:
                    for g in genes:
                        if pivot == g:
                            continue
                        orthofinder.add(as_pair(pivot, g))
        calc_sensitivity_specificity(orthofinder, truth, "OrthoFinder", fw)
    finally:
        fw.close()
def tandem_main(blast_file, cds_file, bed_file, N=3, P=50, is_self=True, \
    evalue=.01, strip_name=".", ofile=sys.stderr, genefam=False):
    """
    Group tandem (proximal) paralogues, or whole gene families.

    A BLAST hit qualifies when its `hitlen` is at least `P` percent of the
    shorter of the two sequences and its e-value does not exceed `evalue`.
    Gene names are normalised with gene_name(name, strip_name).

    * is_self=True: qualifying pairs are joined when their BED indices are
      within `N` and, unless `genefam` is set, they share a seqid.
    * is_self=False: qualifying pairs first form homolog groups. With
      `genefam` those groups are the result; otherwise each gene is joined
      with any of its `N` preceding BED neighbours on the same seqid that is
      a homolog of similar length.

    With `genefam`, `N` is replaced by 1e5.

    Families with at least two members are written to `ofile` (opened with
    must_open) and summarised on stderr. Returns the list of families, each
    a sorted list of members.
    """
    if genefam:
        N = 1e5

    # get the sizes for the CDS first
    f = Fasta(cds_file)
    sizes = dict(f.itersizes())

    # retrieve the locations
    bed = Bed(bed_file)
    order = bed.order

    def long_enough(b):
        # Hit must cover at least P% of the shorter sequence.
        return b.hitlen >= min(sizes[b.query], sizes[b.subject]) * P / 100.

    if is_self:
        # filter the blast file
        g = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                if not long_enough(b):
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                qi, q = order[query]
                si, s = order[subject]

                if abs(qi - si) > N or b.evalue > evalue:
                    continue
                if genefam or q.seqid == s.seqid:
                    g.join(query, subject)

    else:
        homologs = Grouper()
        with open(blast_file) as fp:
            for row in fp:
                b = BlastLine(row)
                if not long_enough(b):
                    continue
                if b.evalue > evalue:
                    continue

                query = gene_name(b.query, strip_name)
                subject = gene_name(b.subject, strip_name)
                homologs.join(query, subject)

        if genefam:
            g = homologs
        else:
            g = Grouper()
            for i, atom in enumerate(bed):
                # Look back at most N features, never before the start of
                # the BED: a negative index would wrap around or raise.
                for x in range(1, min(N, i) + 1):
                    neighbor = bed[i - x]
                    if neighbor.seqid != atom.seqid:
                        continue
                    if not homologs.joined(neighbor.accn, atom.accn):
                        continue

                    leni = sizes[bed[i].accn]
                    lenx = sizes[neighbor.accn]
                    # Lengths may differ by at most (100 - P)% of the longer.
                    if abs(leni - lenx) > max(leni, lenx) * (1 - P / 100.):
                        continue
                    g.join(neighbor.accn, atom.accn)

    # dump the grouper
    # The handle is not closed here: the default target is sys.stderr.
    fw = must_open(ofile, "w")
    families = [sorted(group) for group in sorted(g) if len(group) >= 2]
    for members in families:
        fw.write(",".join(members) + "\n")
    ngenes = sum(len(members) for members in families)
    nfamilies = len(families)

    # max() raises ValueError on an empty list; report an empty family.
    longest_family = max(families, key=len) if families else []

    # generate reports
    sys.stderr.write("Proximal paralogues (dist=%d):\n" % N)
    sys.stderr.write("Total %d genes in %d families\n" % (ngenes, nfamilies))
    sys.stderr.write(
        "Longest families (%d): %s\n"
        % (len(longest_family), ",".join(longest_family))
    )

    return families
def cscore(args):
    """
    %prog cscore blastfile > cscoreOut

    See supplementary info for sea anemone genome paper, C-score formula:

        cscore(A,B) = score(A,B) /
             max(best score for A, best score for B)

    A C-score of one is the same as reciprocal best hit (RBH).

    Output file will be 3-column (query, subject, cscore). Use --cutoff to
    select a different cutoff.
    """
    # The docstring above is used as the OptionParser usage text.
    from jcvi.utils.cbook import gene_name

    p = OptionParser(cscore.__doc__)
    p.add_option("--cutoff", default=.9999, type="float",
                 help="Minimum C-score to report [default: %default]")
    p.add_option("--pct", default=False, action="store_true",
                 help="Also include pct as last column [default: %default]")
    p.add_option("--writeblast", default=False, action="store_true",
                 help="Also write filtered blast file [default: %default]")
    p.set_stripnames()
    p.set_outfile()
    opts, args = p.parse_args(args)

    ostrip = opts.strip_names
    writeblast = opts.writeblast
    outfile = opts.outfile

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    # Pass 1: best score seen for every gene, as query or as subject.
    blast = Blast(blastfile)
    logging.debug("Register best scores ..")
    best_score = defaultdict(float)
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        if score > best_score[query]:
            best_score[query] = score
        if score > best_score[subject]:
            best_score[subject] = score

    # Pass 2: keep, for each pair above the cutoff, the highest C-score.
    blast = Blast(blastfile)
    pairs = {}
    cutoff = opts.cutoff
    for b in blast:
        query, subject = b.query, b.subject
        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        score = b.score
        pctid = b.pctid
        best = max(best_score[query], best_score[subject])
        if not best:
            # Neither gene has a positive score: the C-score is undefined
            # and the division would raise ZeroDivisionError.
            continue
        s = score / best
        if s > cutoff:
            pair = (query, subject)
            if pair not in pairs or s > pairs[pair][0]:
                pairs[pair] = (s, pctid, b)

    fw = must_open(outfile, "w")
    fwb = must_open(outfile + ".filtered.blast", "w") if writeblast else None
    pct = opts.pct
    for (query, subject), (s, pctid, b) in sorted(pairs.items()):
        fields = [query, subject, "{0:.2f}".format(s)]
        if pct:
            fields.append("{0:.1f}".format(pctid))
        # fw.write works on both Python 2 and 3, unlike `print >> fw`.
        fw.write("\t".join(fields) + "\n")
        if fwb is not None:
            fwb.write(str(b) + "\n")
    fw.close()
    if fwb is not None:
        fwb.close()
def blastfilter_main(blast_file, p, opts):
    """
    Filter a BLAST file and write the result to `<blast_file>.filtered`.

    Hits are visited from highest to lowest score. Self hits, hits whose
    genes are absent from the BED order tables (a warning is logged for the
    first 100), and repeated (query, subject) pairs are dropped; in
    self-vs-self mode each pair is flipped so the lower index comes first.
    The survivors then go through the optional C-score filter (opts.cscore)
    and the optional local duplicates filter (opts.tandem_Nmax). With
    opts.tandems_only, `.localdups` files and new BED files are also written.

    `p` and `opts` are the option parser and parsed options, forwarded to
    check_beds().
    """
    qbed, sbed, qorder, sorder, is_self = check_beds(blast_file, p, opts)

    tandem_Nmax = opts.tandem_Nmax
    cscore = opts.cscore

    # Count the non-comment lines for the log message only; the handle is
    # now closed instead of being leaked.
    with open(blast_file) as fp:
        total_lines = sum(1 for line in fp if line[0] != '#')
    logging.debug("Load BLAST file `%s` (total %d lines)" % \
            (blast_file, total_lines))
    bl = Blast(blast_file)
    # Highest score first, so the best hit of each pair is the one kept.
    blasts = sorted(bl, key=lambda b: b.score, reverse=True)

    filtered_blasts = []
    seen = set()
    ostrip = opts.strip_names
    nwarnings = 0
    for b in blasts:
        query, subject = b.query, b.subject
        if query == subject:
            continue

        if ostrip:
            query, subject = gene_name(query), gene_name(subject)

        # Query is checked before subject; only the first miss is reported.
        if query not in qorder:
            unknown, where = query, qbed
        elif subject not in sorder:
            unknown, where = subject, sbed
        else:
            unknown = where = None

        if where is not None:
            if nwarnings < 100:
                logging.warning("{0} not in {1}".format(unknown, where.filename))
            elif nwarnings == 100:
                logging.warning("too many warnings.. suppressed")
            nwarnings += 1
            continue

        qi, q = qorder[query]
        si, s = sorder[subject]

        if is_self and qi > si:
            # move all hits to same side when doing self-self BLAST
            query, subject = subject, query
            qi, si = si, qi
            q, s = s, q

        key = query, subject
        if key in seen:
            continue
        seen.add(key)
        b.query, b.subject = key

        b.qi, b.si = qi, si
        b.qseqid, b.sseqid = q.seqid, s.seqid

        filtered_blasts.append(b)

    if cscore:
        before_filter = len(filtered_blasts)
        logging.debug("running the cscore filter (cscore>=%.2f) .." % cscore)
        filtered_blasts = list(filter_cscore(filtered_blasts, cscore=cscore))
        logging.debug("after filter (%d->%d) .." % (before_filter,
            len(filtered_blasts)))

    if tandem_Nmax:
        logging.debug("running the local dups filter (tandem_Nmax=%d) .." % \
                tandem_Nmax)

        qtandems = tandem_grouper(qbed, filtered_blasts,
                flip=True, tandem_Nmax=tandem_Nmax)
        standems = tandem_grouper(sbed, filtered_blasts,
                flip=False, tandem_Nmax=tandem_Nmax)

        qdups_fh = open(op.splitext(opts.qbed)[0] + ".localdups", "w") \
                if opts.tandems_only else None
        sdups_fh = None
        try:
            if is_self:
                # Query and subject are the same genome: merge both groupers.
                for s in standems:
                    qtandems.join(*s)
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_to_mother = qdups_to_mother
            else:
                qdups_to_mother = write_localdups(qtandems, qbed, qdups_fh)
                sdups_fh = open(op.splitext(opts.sbed)[0] + ".localdups", "w") \
                        if opts.tandems_only else None
                sdups_to_mother = write_localdups(standems, sbed, sdups_fh)
        finally:
            # These handles used to be left open; close() is harmless if
            # they were already closed.
            if qdups_fh is not None:
                qdups_fh.close()
            if sdups_fh is not None:
                sdups_fh.close()

        if opts.tandems_only:
            # write out new .bed after tandem removal
            write_new_bed(qbed, qdups_to_mother)
            if not is_self:
                write_new_bed(sbed, sdups_to_mother)

        before_filter = len(filtered_blasts)
        filtered_blasts = list(filter_tandem(filtered_blasts, \
                qdups_to_mother, sdups_to_mother))
        logging.debug("after filter (%d->%d) .." % \
                (before_filter, len(filtered_blasts)))

    blastfilteredfile = blast_file + ".filtered"
    with open(blastfilteredfile, "w") as fw:
        write_new_blast(filtered_blasts, fh=fw)