def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # developer notes are kept in comments rather than in the docstring.
    #
    # args: list of command-line tokens. After option parsing exactly two
    #       positionals must remain (folder, reference fasta); otherwise the
    #       help text is printed and the process exits.
    # Side effects: creates the work directory "pairs-<aligner>", writes one
    #       sample file per read file into it, temporarily changes the
    #       working directory while aligning, and writes "f.meta".
    # Returns: None.
    # Raises: ValueError when the selected aligner is neither bowtie nor bwa.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align
    else:
        # Without this guard `align` would stay unbound and only fail later,
        # in the middle of the first project, with an UnboundLocalError.
        raise ValueError("Unsupported aligner: {0}".format(aligner))

    folder, ref = args
    # Resolve the reference before any chdir() so it stays reachable from
    # inside the work directory.
    ref = get_abs_path(ref)
    messages = []
    for reads, prefix in iter_project(folder):
        # One sample file per mate; presumably first() extracts the first
        # `--firstN` records of the input into the -o target (TODO confirm).
        samplefq = []
        for i in range(2):
            sample = op.join(work, prefix + "_{0}.first.fastq".format(i + 1))
            samplefq.append(sample)
            first([str(opts.firstN)] + [reads[i]] + ["-o", sample])

        # The aligner is run from inside the work directory on basenames.
        # Always restore the original directory, even if alignment fails.
        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        # Presumably MP = mate pair, PE = paired end (TODO confirm).
        tag = "MP" if median > 1000 else "PE"

        # Round the median up to its first two significant digits, e.g.
        # 3456 -> "35" + "00" -> 3500, while 300 stays 300.
        # Assumes stats.median is an integer (int() below would reject a
        # decimal point) -- TODO confirm.
        digits = str(median)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        # Record "<original path>\t<suggested library file name>" per file.
        for i, xp in enumerate(reads):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            linkname = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, linkname)))

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # developer notes are kept in comments rather than in the docstring.
    #
    # args: list of command-line tokens. After option parsing exactly two
    #       positionals must remain (folder, reference fasta); otherwise the
    #       help text is printed and the process exits.
    # Side effects: creates the work directory "pairs-<aligner>", writes one
    #       sample file per project into it, temporarily changes the working
    #       directory while aligning, and writes "f.meta".
    # Returns: None.
    # Raises: ValueError when the selected aligner is not clc, bowtie or bwa.
    p = OptionParser(pairs.__doc__)
    p.set_firstN()
    p.set_mates()
    p.set_aligner()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # CLC output is parsed by the cas module; every other aligner goes
    # through the sam module.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

        if aligner == "bowtie":
            from jcvi.apps.bowtie import align
        elif aligner == "bwa":
            from jcvi.apps.bwa import align
        else:
            # Without this guard `align` would stay unbound and only fail
            # later, inside the project loop, with an UnboundLocalError.
            raise ValueError("Unsupported aligner: {0}".format(aligner))

    folder, ref = args
    # Resolve the reference before any chdir() so it stays reachable from
    # inside the work directory.
    ref = get_abs_path(ref)
    messages = []
    # NOTE(review): the second argument (2) to iter_project is opaque here;
    # presumably it is the expected number of files per project -- confirm.
    for reads, prefix in iter_project(folder, 2):
        # A single sample file per project; presumably first() extracts the
        # first `--firstN` records of all inputs into it (TODO confirm).
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + reads + ["-o", samplefq])

        # The aligner is run from inside the work directory on the basename.
        # Always restore the original directory, even if alignment fails.
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, logfile = align(align_args)
            bedfile, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        # Presumably MP = mate pair, PE = paired end (TODO confirm).
        tag = "MP" if median > 1000 else "PE"

        # Round the median up to its first two significant digits, e.g.
        # 3456 -> "35" + "00" -> 3500, while 300 stays 300.
        # Assumes stats.median is an integer (int() below would reject a
        # decimal point) -- TODO confirm.
        digits = str(median)
        head, tail = digits[:2], digits[2:]
        if tail and int(tail) != 0:
            head = str(int(head) + 1)
        lib = "{0}-{1}".format(tag, head + "0" * len(tail))

        # Record "<original path>\t<suggested library file name>" per file.
        for i, xp in enumerate(reads):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            linkname = "{0}-{1}.{2}.{3}".format(
                lib, prefix.replace("-", ""), i + 1, suffix
            )
            messages.append("\t".join(str(x) for x in (xp, linkname)))

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
def mates(args):
    """
    %prog mates bedfile

    Generate the mates file by inferring from the names.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # developer notes are kept in comments rather than in the docstring.
    #
    # args: list of command-line tokens; exactly one positional (the bed
    #       file) must remain after option parsing, otherwise the help text
    #       is printed and the process exits.
    # Writes "<stem>.mates" (one tab-separated pair of read names per line,
    #       optionally followed by the library name) and
    #       "<stem>.mates.bed" (the two bed records of every kept pair).
    # Returns: (matesfile, matesbedfile).
    p = OptionParser(mates.__doc__)
    p.add_option(
        "--lib",
        default=False,
        action="store_true",
        help="Output library information along with pairs [default: %default]",
    )
    p.add_option(
        "--nointra",
        default=False,
        action="store_true",
        help="Remove mates that are intra-scaffold [default: %default]",
    )
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    p.set_mates()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    rclip = opts.rclip

    # Reads are grouped by name with the last `rclip` characters removed;
    # rclip == 0 must use the full name because accn[:-0] would be empty.
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    # groupby() below only merges adjacent records, so this relies on Bed
    # ordering its records by `key` -- presumably what key= does; confirm.
    bed = Bed(bedfile, key=key)

    pf = bedfile.rsplit(".", 1)[0]
    matesfile = pf + ".mates"
    matesbedfile = matesfile + ".bed"
    lib = pf if opts.lib else None

    num_fragments = num_pairs = 0
    # Context managers guarantee both outputs are flushed and closed even
    # when pairs() or the grouping loop raises. The nesting keeps the
    # original creation order: the .mates file first, the .bed file only
    # after the optional library header has been written.
    with open(matesfile, "w") as fw:
        if lib:
            # pairs() is expected to return (bedfile, stats) here.
            bedfile, stats = pairs(
                [
                    bedfile,
                    "--rclip={0}".format(rclip),
                    "--cutoff={0}".format(opts.cutoff),
                ]
            )
            # Library distance window: mean +/- two standard deviations,
            # with the lower bound clamped to 1.
            sv = int(2 * stats.sd)
            mindist = max(stats.mean - sv, 1)
            maxdist = stats.mean + sv
            header = ("library", pf, mindist, maxdist)
            fw.write("\t".join(str(x) for x in header) + "\n")

        with open(matesbedfile, "w") as fwm:
            for _, lines in groupby(bed, key=key):
                lines = list(lines)
                # Anything other than exactly two records under one name is
                # counted as fragments and discarded.
                if len(lines) != 2:
                    num_fragments += len(lines)
                    continue

                a, b = lines

                if opts.nointra and a.seqid == b.seqid:
                    continue

                # --prefix keeps only pairs whose seqids share the text
                # before the first underscore, e.g. mth2-23j10_001 and
                # mth2-23j10_002 (contigs of the same BAC).
                if opts.prefix:
                    aprefix = a.seqid.split("_")[0]
                    bprefix = b.seqid.split("_")[0]
                    if aprefix != bprefix:
                        continue

                num_pairs += 1
                pair = [a.accn, b.accn]
                if lib:
                    pair.append(lib)
                fw.write("\t".join(pair) + "\n")

                fwm.write(str(a) + "\n")
                fwm.write(str(b) + "\n")

    logging.debug(
        "Discard {0} frags and write {1} pairs to `{2}` and `{3}`.".format(
            num_fragments, num_pairs, matesfile, matesbedfile
        )
    )

    return matesfile, matesbedfile
def mates(args):
    """
    %prog mates bedfile

    Generate the mates file by inferring from the names.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # developer notes are kept in comments rather than in the docstring.
    #
    # args: list of command-line tokens; exactly one positional (the bed
    #       file) must remain after option parsing, otherwise the help text
    #       is printed and the process exits.
    # Outputs: "<stem>.mates" holds one tab-separated pair of read names per
    #       line (plus the library name with --lib); "<stem>.mates.bed"
    #       holds the two bed records of every kept pair.
    # Returns: (matesfile, matesbedfile).
    p = OptionParser(mates.__doc__)
    p.add_option("--lib", default=False, action="store_true",
                 help="Output library information along with pairs [default: %default]")
    p.add_option("--nointra", default=False, action="store_true",
                 help="Remove mates that are intra-scaffold [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
                 help="Only keep links between IDs with same prefix [default: %default]")
    p.set_mates()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    rclip = opts.rclip

    # Group reads by name minus the trailing `rclip` characters; with
    # rclip == 0 the whole name is used, since accn[:-0] would be empty.
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    # groupby() only merges adjacent records, so the loop below relies on
    # Bed ordering its records by `key` -- presumably what key= does; confirm.
    bed = Bed(bedfile, key=key)

    pf = bedfile.rsplit(".", 1)[0]
    matesfile = pf + ".mates"
    matesbedfile = matesfile + ".bed"
    lib = pf if opts.lib else None

    num_fragments = num_pairs = 0
    # `with` closes both outputs even if pairs() or the loop raises. The
    # .bed output is opened only after the optional library header, which
    # matches the order in which the files were created before.
    with open(matesfile, "w") as fw:
        if lib:
            # pairs() is expected to return (bedfile, stats) here.
            bedfile, stats = pairs([bedfile,
                                    "--rclip={0}".format(rclip),
                                    "--cutoff={0}".format(opts.cutoff)])
            # Distance window is mean +/- 2 * sd, lower bound at least 1.
            sv = int(2 * stats.sd)
            mindist = max(stats.mean - sv, 1)
            maxdist = stats.mean + sv
            fields = ("library", pf, mindist, maxdist)
            fw.write("\t".join(str(x) for x in fields) + "\n")

        with open(matesbedfile, "w") as fwm:
            for _, group in groupby(bed, key=key):
                records = list(group)
                # Names without exactly two records count as fragments.
                if len(records) != 2:
                    num_fragments += len(records)
                    continue

                a, b = records

                if opts.nointra and a.seqid == b.seqid:
                    continue

                # --prefix keeps a pair only when both seqids share the text
                # before the first underscore, e.g. contigs of one BAC such
                # as mth2-23j10_001 and mth2-23j10_002.
                if opts.prefix:
                    if a.seqid.split("_")[0] != b.seqid.split("_")[0]:
                        continue

                num_pairs += 1
                pair = [a.accn, b.accn]
                if lib:
                    pair.append(lib)
                fw.write("\t".join(pair) + "\n")

                fwm.write(str(a) + "\n")
                fwm.write(str(b) + "\n")

    logging.debug("Discard {0} frags and write {1} pairs to `{2}` and `{3}`.".
                  format(num_fragments, num_pairs, matesfile, matesbedfile))

    return matesfile, matesbedfile
def link(args):
    """
    %prog link bedfile fastafile

    Construct contig links based on bed file.

    Use --prefix to limit the links between contigs that start with the same
    prefix_xxx.
    """
    # The docstring above is handed to OptionParser as the usage text, so
    # developer notes are kept in comments rather than in the docstring.
    #
    # args: list of command-line tokens; exactly two positionals (bed file,
    #       fasta file) must remain after option parsing, otherwise the help
    #       text is printed and the process exits.
    # Output: "<stem>.links", where <stem> is the bed file returned by
    #       pairs() without its last extension; one line per accepted link
    #       in the form "<read name stem>\t<ContigLink as string>".
    # Returns: None.
    p = OptionParser(link.__doc__)
    p.set_mates(rclip=1, mateorientation="+-")
    p.add_option(
        "--insert",
        type="int",
        default=0,
        help="Mean insert size [default: estimate from data]",
    )
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    p.add_option(
        "--debug",
        dest="debug",
        default=False,
        action="store_true",
        help="Print verbose info when checking mates [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    debug = opts.debug
    cutoff = opts.cutoff

    sizes = Sizes(fastafile)

    cutoffopt = "--cutoff={0}".format(cutoff)
    mateorientationopt = "--mateorientation={0}".format(opts.mateorientation)
    # pairs() is expected to return (bedfile, stats) here.
    bedfile, stats = pairs(
        [bedfile, cutoffopt, mateorientationopt, "--rclip={0}".format(opts.rclip)]
    )

    # Explicit options win; a value of 0 falls back to the estimate taken
    # from the pair statistics.
    maxcutoff = cutoff or stats.p2
    insert = opts.insert or stats.median
    logging.debug("Mate hangs must be <= {0}, --cutoff to override".format(maxcutoff))

    def mate_stem(bedline):
        # Read name without its final character.
        # NOTE(review): this is hard-coded to one character and ignores
        # --rclip, although pairs() above honors it -- confirm intent.
        return bedline.accn[:-1]

    linksfile = bedfile.rsplit(".", 1)[0] + ".links"

    # Context managers make sure the input is released and the links file
    # is flushed and closed; previously neither handle was ever closed.
    with open(bedfile) as fp, open(linksfile, "w") as fw:
        for a, b in pairwise(fp):
            # Criteria for a valid contig edge:
            # 1. forward/reverse reads do not map to the same scaffold
            #    (useful for linking)
            # 2. assuming innie (outie must be flipped first), order the
            #    contig pair
            # 3. calculate sequence hangs; valid hangs are smaller than the
            #    insert size
            a, b = BedLine(a), BedLine(b)

            # Consecutive lines that are not mates of each other are skipped.
            if mate_stem(a) != mate_stem(b):
                continue
            pe = mate_stem(a)

            # Intra-contig links carry no linking information.
            if a.seqid == b.seqid:
                continue

            # --prefix keeps only links whose seqids share the text before
            # the first underscore, e.g. contigs of the same BAC such as
            # mth2-23j10_001 and mth2-23j10_002.
            if opts.prefix:
                aprefix = a.seqid.split("_")[0]
                bprefix = b.seqid.split("_")[0]
                if aprefix != bprefix:
                    continue

            cl = ContigLink(a, b, insert=insert, cutoff=maxcutoff)
            if cl.flip_innie(sizes, debug=debug):
                fw.write("\t".join((pe, str(cl))) + "\n")