Example #1
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including BOWTIE and BWA.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; the details are documented in comments instead.
    #
    # args: list of command-line tokens; after option parsing exactly two
    #       positionals must remain: the project folder and the reference.
    # Side effects: creates the "pairs-<aligner>" working directory, writes
    #       sampled FASTQ files into it, runs the aligner from inside that
    #       directory, and finally writes "f.meta" holding one
    #       "<original file>\t<link name>" line per input file.
    # Returns None; exits through sys.exit() when the argument count is wrong.
    parser = OptionParser(pairs.__doc__)
    parser.set_firstN()
    parser.set_mates()
    parser.set_aligner()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    from jcvi.formats.sam import pairs as ps

    # Only these two aligners bind `align`; any other value of opts.aligner
    # would fail with NameError at the align() call below.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for fastqs, prefix in iter_project(folder):
        # Sample each of the two mate files into the working directory.
        samplefq = []
        for i in range(2):
            samplefq.append(
                op.join(work, prefix + "_{0}.first.fastq".format(i + 1)))
            first([str(opts.firstN)] + [fastqs[i]] + ["-o", samplefq[i]])

        # The aligner is run from inside the working directory. Always
        # return to the starting directory, even when alignment or the
        # pair statistics fail, so the caller is not left stranded in `work`.
        os.chdir(work)
        try:
            align_args = [ref] + [op.basename(fq) for fq in samplefq]
            outfile, _ = align(align_args)
            _, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median *up* to two significant digits, e.g. 3512 -> 3600.
        # NOTE(review): int(sf) assumes str(median) is a plain integer
        # literal -- confirm stats.median is never a float.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""),
                                            i + 1, suffix)
            messages.append("\t".join(str(x) for x in (xp, link)))

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Example #2
0
def pairs(args):
    """
    %prog pairs folder reference.fasta

    Estimate insert size distribution. Compatible with a variety of aligners,
    including CLC, BOWTIE and BWA.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; the details are documented in comments instead.
    #
    # args: list of command-line tokens; after option parsing exactly two
    #       positionals must remain: the project folder and the reference.
    # Side effects: creates the "pairs-<aligner>" working directory, writes a
    #       sampled FASTQ file per project entry into it, runs the aligner
    #       from inside that directory, and finally writes "f.meta" holding
    #       one "<original file>\t<link name>" line per input file.
    # Returns None; exits through sys.exit() when the argument count is wrong.
    parser = OptionParser(pairs.__doc__)
    parser.set_firstN()
    parser.set_mates()
    parser.set_aligner()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    cwd = os.getcwd()
    aligner = opts.aligner
    work = "-".join(("pairs", aligner))
    mkdir(work)

    # CLC output is parsed by the CAS reader; every other aligner goes
    # through the SAM reader.
    if aligner == "clc":
        from jcvi.apps.clc import align
        from jcvi.formats.cas import pairs as ps
    else:
        from jcvi.formats.sam import pairs as ps

    # Any aligner other than clc/bowtie/bwa leaves `align` unbound and would
    # fail with NameError at the align() call below.
    if aligner == "bowtie":
        from jcvi.apps.bowtie import align
    elif aligner == "bwa":
        from jcvi.apps.bwa import align

    folder, ref = args
    ref = get_abs_path(ref)
    messages = []
    for fastqs, prefix in iter_project(folder, 2):
        # `fastqs` is concatenated with lists below, so it must be a list.
        samplefq = op.join(work, prefix + ".first.fastq")
        first([str(opts.firstN)] + fastqs + ["-o", samplefq])

        # The aligner is run from inside the working directory. Always
        # return to the starting directory, even when alignment or the
        # pair statistics fail, so the caller is not left stranded in `work`.
        os.chdir(work)
        try:
            align_args = [ref, op.basename(samplefq)]
            outfile, _ = align(align_args)
            _, stats = ps([outfile, "--rclip={0}".format(opts.rclip)])
        finally:
            os.chdir(cwd)

        median = stats.median
        tag = "MP" if median > 1000 else "PE"
        # Round the median *up* to two significant digits, e.g. 3512 -> 3600.
        # NOTE(review): int(sf) assumes str(median) is a plain integer
        # literal -- confirm stats.median is never a float.
        median = str(median)
        pf, sf = median[:2], median[2:]
        if sf and int(sf) != 0:
            pf = str(int(pf) + 1)  # Get the first two effective digits
        lib = "{0}-{1}".format(tag, pf + "0" * len(sf))
        for i, xp in enumerate(fastqs):
            suffix = "fastq.gz" if xp.endswith(".gz") else "fastq"
            link = "{0}-{1}.{2}.{3}".format(lib, prefix.replace("-", ""), i + 1, suffix)
            messages.append("\t".join(str(x) for x in (xp, link)))

    messages = "\n".join(messages)
    write_file("f.meta", messages, tee=True)
Example #3
0
File: bed.py Project: radaniba/jcvi
def mates(args):
    """
    %prog mates bedfile

    Generate the mates file by inferring from the names.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; the details are documented in comments instead.
    #
    # args: list of command-line tokens; exactly one positional (the BED
    #       file) must remain after option parsing.
    # Writes two files next to the input: "<prefix>.mates" (tab-separated
    # accession pairs, optionally followed by the library name) and
    # "<prefix>.mates.bed" (the BED records of the kept pairs).
    # Returns the tuple (matesfile, matesbedfile); exits through sys.exit()
    # when the argument count is wrong.
    p = OptionParser(mates.__doc__)
    p.add_option(
        "--lib",
        default=False,
        action="store_true",
        help="Output library information along with pairs [default: %default]")
    p.add_option(
        "--nointra",
        default=False,
        action="store_true",
        help="Remove mates that are intra-scaffold [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]"
    )
    p.set_mates()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    rclip = opts.rclip

    # Mates are matched on the accession with the last `rclip` characters
    # removed (or on the full accession when rclip is 0).
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    bed = Bed(bedfile, key=key)

    pf = bedfile.rsplit(".", 1)[0]
    matesfile = pf + ".mates"
    matesbedfile = matesfile + ".bed"
    lib = pf if opts.lib else None

    num_fragments = num_pairs = 0
    # Context managers guarantee both outputs are closed even when pairs()
    # or the grouping loop raises.
    with open(matesfile, "w") as fw:
        if lib:
            # Library header: distance window of mean -/+ int(2 * sd),
            # with the lower bound clamped to at least 1.
            _, stats = pairs([bedfile,
                              "--rclip={0}".format(rclip),
                              "--cutoff={0}".format(opts.cutoff)])
            sv = int(2 * stats.sd)
            mindist = max(stats.mean - sv, 1)
            maxdist = stats.mean + sv
            fw.write("\t".join(
                str(x) for x in ("library", pf, mindist, maxdist)) + "\n")

        with open(matesbedfile, "w") as fwm:
            # groupby only merges *adjacent* records; presumably
            # Bed(bedfile, key=key) yields them ordered by `key` -- verify.
            for _, lines in groupby(bed, key=key):
                lines = list(lines)
                if len(lines) != 2:
                    # Anything that is not exactly a pair counts as fragments.
                    num_fragments += len(lines)
                    continue

                a, b = lines

                if opts.nointra and a.seqid == b.seqid:
                    continue

                # Use --prefix to limit the links between seqids with the
                # same prefix. For example, contigs of the same BAC,
                # mth2-23j10_001, mth-23j10_002
                if opts.prefix:
                    aprefix = a.seqid.split("_")[0]
                    bprefix = b.seqid.split("_")[0]
                    if aprefix != bprefix:
                        continue

                num_pairs += 1
                pair = [a.accn, b.accn]
                if lib:
                    pair.append(lib)
                fw.write("\t".join(pair) + "\n")

                fwm.write(str(a) + "\n")
                fwm.write(str(b) + "\n")

    logging.debug("Discard {0} frags and write {1} pairs to `{2}` and `{3}`.".\
            format(num_fragments, num_pairs, matesfile, matesbedfile))

    return matesfile, matesbedfile
Example #4
0
File: bed.py Project: yangjl/jcvi
def mates(args):
    """
    %prog mates bedfile

    Generate the mates file by inferring from the names.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is kept verbatim; the details are documented in comments instead.
    #
    # args: list of command-line tokens; exactly one positional (the BED
    #       file) must remain after option parsing.
    # Writes two files next to the input: "<prefix>.mates" (tab-separated
    # accession pairs, optionally followed by the library name) and
    # "<prefix>.mates.bed" (the BED records of the kept pairs).
    # Returns the tuple (matesfile, matesbedfile); exits through sys.exit()
    # when the argument count is wrong.
    p = OptionParser(mates.__doc__)
    p.add_option("--lib", default=False, action="store_true",
            help="Output library information along with pairs [default: %default]")
    p.add_option("--nointra", default=False, action="store_true",
            help="Remove mates that are intra-scaffold [default: %default]")
    p.add_option("--prefix", default=False, action="store_true",
            help="Only keep links between IDs with same prefix [default: %default]")
    p.set_mates()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    rclip = opts.rclip

    # Mates are matched on the accession with the last `rclip` characters
    # removed (or on the full accession when rclip is 0).
    key = (lambda x: x.accn[:-rclip]) if rclip else (lambda x: x.accn)
    bed = Bed(bedfile, key=key)

    pf = bedfile.rsplit(".", 1)[0]
    matesfile = pf + ".mates"
    matesbedfile = matesfile + ".bed"
    lib = pf if opts.lib else None

    num_fragments = num_pairs = 0
    # Context managers guarantee both outputs are closed even when pairs()
    # or the grouping loop raises.
    with open(matesfile, "w") as fw:
        if lib:
            # Library header: distance window of mean -/+ int(2 * sd),
            # with the lower bound clamped to at least 1.
            _, stats = pairs([bedfile,
                              "--rclip={0}".format(rclip),
                              "--cutoff={0}".format(opts.cutoff)])
            sv = int(2 * stats.sd)
            mindist = max(stats.mean - sv, 1)
            maxdist = stats.mean + sv
            header = ("library", pf, mindist, maxdist)
            fw.write("\t".join(str(x) for x in header) + "\n")

        with open(matesbedfile, "w") as fwm:
            # groupby only merges *adjacent* records; presumably
            # Bed(bedfile, key=key) yields them ordered by `key` -- verify.
            for _, lines in groupby(bed, key=key):
                lines = list(lines)
                if len(lines) != 2:
                    # Anything that is not exactly a pair counts as fragments.
                    num_fragments += len(lines)
                    continue

                a, b = lines

                if opts.nointra and a.seqid == b.seqid:
                    continue

                # Use --prefix to limit the links between seqids with the
                # same prefix. For example, contigs of the same BAC,
                # mth2-23j10_001, mth-23j10_002
                if opts.prefix:
                    aprefix = a.seqid.split("_")[0]
                    bprefix = b.seqid.split("_")[0]
                    if aprefix != bprefix:
                        continue

                num_pairs += 1
                pair = [a.accn, b.accn]
                if lib:
                    pair.append(lib)
                fw.write("\t".join(pair) + "\n")

                fwm.write(str(a) + "\n")
                fwm.write(str(b) + "\n")

    logging.debug("Discard {0} frags and write {1} pairs to `{2}` and `{3}`.".\
            format(num_fragments, num_pairs, matesfile, matesbedfile))

    return matesfile, matesbedfile
Example #5
0
File: sopra.py Project: rrane/jcvi
def link(args):
    """
    %prog link bedfile fastafile

    Construct contig links based on bed file. Use --prefix to limit the links
    between contigs that start with the same prefix_xxx.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # args: list of command-line tokens; exactly two positionals (bedfile,
    #       fastafile) must remain after option parsing, otherwise the help
    #       is printed and the process exits.
    # Output: "<bedfile minus last extension>.links", one tab-separated line
    #       per accepted contig link.
    p = OptionParser(link.__doc__)
    p.set_mates(rclip=1, mateorientation="+-")
    p.add_option("--insert", type="int", default=0, help="Mean insert size [default: estimate from data]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    p.add_option(
        "--debug",
        dest="debug",
        default=False,
        action="store_true",
        help="Print verbose info when checking mates [default: %default]",
    )
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, fastafile = args
    debug = opts.debug
    cutoff = opts.cutoff

    sizes = Sizes(fastafile)

    # Run the pairs step first; note that `bedfile` is rebound to the file
    # name it returns, and that rebound name is what gets read below.
    cutoffopt = "--cutoff={0}".format(cutoff)
    mateorientationopt = "--mateorientation={0}".format(opts.mateorientation)
    bedfile, stats = pairs([bedfile, cutoffopt, mateorientationopt, "--rclip={0}".format(opts.rclip)])

    # A falsy (0/None) --cutoff falls back to stats.p2; a falsy --insert
    # falls back to the median reported by the pairs step.
    maxcutoff = cutoff or stats.p2
    insert = opts.insert or stats.median
    logging.debug("Mate hangs must be <= {0}, --cutoff to override".format(maxcutoff))

    # Read-pair name: the accession without its final character.
    # NOTE(review): this always strips exactly one character and ignores
    # opts.rclip -- confirm that is intended.
    rs = lambda x: x.accn[:-1]

    # NOTE(review): neither fp nor fw is closed within the lines visible
    # here -- confirm they are closed (or add context managers).
    fp = open(bedfile)
    linksfile = bedfile.rsplit(".", 1)[0] + ".links"
    fw = open(linksfile, "w")

    # `pairwise` is defined elsewhere; presumably it yields consecutive
    # line pairs from the file -- TODO confirm.
    for a, b in pairwise(fp):
        """
        Criteria for valid contig edge
        1. for/rev do not mapping to the same scaffold (useful for linking)
        2. assuming innie (outie must be flipped first), order the contig pair
        3. calculate sequence hangs, valid hangs are smaller than insert size
        """
        a, b = BedLine(a), BedLine(b)

        # Skip line pairs that do not belong to the same read pair.
        if rs(a) != rs(b):
            continue
        pe = rs(a)

        # Intra-contig links
        if a.seqid == b.seqid:
            continue

        # Use --prefix to limit the links between seqids with the same prefix
        # For example, contigs of the same BAC, mth2-23j10_001, mth-23j10_002
        if opts.prefix:
            aprefix = a.seqid.split("_")[0]
            bprefix = b.seqid.split("_")[0]
            if aprefix != bprefix:
                continue

        # Only links for which flip_innie() returns a truthy value are written.
        cl = ContigLink(a, b, insert=insert, cutoff=maxcutoff)
        if cl.flip_innie(sizes, debug=debug):
            print >> fw, "\t".join((pe, str(cl)))