Ejemplo n.º 1
0
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    # The docstring above is also the usage text handed to OptionParser, so
    # it is kept verbatim.
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    # print(..., file=...) replaces the Python 2-only `print >>` statement,
    # which raises TypeError when run under Python 3.
    print("File `{0}` contains {1} chains.".format(chainfile, len(chain)),
          file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".format(
        human_size(ungapped), human_size(dt), human_size(dq)),
        file=sys.stderr)

    # fsummary() returns three values per fasta; only the first one is used,
    # as the denominator of the "mapped" percentage.
    oldreal, _, _ = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".format(
        oldfasta, percentage(ungapped, oldreal)), file=sys.stderr)

    newreal, _, _ = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".format(
        newfasta, percentage(ungapped, newreal)), file=sys.stderr)
Ejemplo n.º 2
0
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    # NOTE: the docstring doubles as the OptionParser usage string; do not
    # reword it without intending to change the --help output.
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq

    # All reporting goes to stderr through the print() function; the former
    # `print >> sys.stderr, ...` form only works on Python 2.
    print(
        "File `{0}` contains {1} chains.".format(chainfile, len(chain)),
        file=sys.stderr,
    )
    print(
        "ungapped={0} dt={1} dq={2}".format(
            human_size(ungapped), human_size(dt), human_size(dq)),
        file=sys.stderr,
    )

    # Report, for the old and then the new fasta, the ungapped chain size as
    # a percentage of the first value returned by fsummary().
    oldreal, _, _ = fsummary([oldfasta, "--outfile=/dev/null"])
    print(
        "Old fasta (`{0}`) mapped: {1}".format(
            oldfasta, percentage(ungapped, oldreal)),
        file=sys.stderr,
    )

    newreal, _, _ = fsummary([newfasta, "--outfile=/dev/null"])
    print(
        "New fasta (`{0}`) mapped: {1}".format(
            newfasta, percentage(ungapped, newreal)),
        file=sys.stderr,
    )
Ejemplo n.º 3
0
def venn(args):
    """
    %prog venn *.benchmark

    Display benchmark results as Venn diagram.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from matplotlib_venn import venn2

    p = OptionParser(venn.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="9x9")

    if len(args) < 1:
        sys.exit(not p.print_help())

    bcs = args
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    pad = .02
    ystart = 1
    ywidth = 1. / len(bcs)  # one horizontal band per benchmark file
    tags = ("Bowers", "YGOB", "Schnable")
    # zip() stops at the shorter input, so at most len(tags) files are drawn.
    for bc, tag in zip(bcs, tags):
        # Each row holds four whitespace-separated fields:
        # prog, pcounts, tcounts, shared (the last three are integers).
        data = []
        with open(bc) as fp:  # closed as soon as the rows are parsed
            for row in fp:
                prog, pcounts, tcounts, shared = row.split()
                data.append((prog, int(pcounts), int(tcounts), int(shared)))
        xstart = 0
        # An empty benchmark file draws nothing; avoid dividing by zero.
        xwidth = 1. / len(data) if data else 1.
        for prog, pcounts, tcounts, shared in data:
            # Two-set diagram: prog-only, tag-only, and the intersection.
            a, b, c = pcounts - shared, tcounts - shared, shared
            ax = fig.add_axes([xstart + pad, ystart - ywidth + pad,
                               xwidth - 2 * pad, ywidth - 2 * pad])
            venn2(subsets=(a, b, c), set_labels=(prog, tag), ax=ax)
            # Sn is shared/tcounts and Pu is shared/pcounts.
            message = "Sn={0} Pu={1}".\
                format(percentage(shared, tcounts, precision=0, mode=-1),
                       percentage(shared, pcounts, precision=0, mode=-1))
            print(message, file=sys.stderr)
            ax.text(.5, .92, latex(message), ha="center", va="center",
                    transform=ax.transAxes, color='b')
            ax.set_axis_off()
            xstart += xwidth
        ystart -= ywidth

    # Panel letters and titles are hard-coded for a three-row figure.
    panel_labels(root, ((.04, .96, "A"), (.04, .96 - ywidth, "B"),
                  (.04, .96 - 2 * ywidth, "C")))
    panel_labels(root, ((.5, .98, "A. thaliana duplicates"),
                        (.5, .98 - ywidth, "14 Yeast genomes"),
                        (.5, .98 - 2 * ywidth, "4 Grass genomes")))
    normalize_axes(root)
    savefig("venn.pdf", dpi=opts.dpi)
Ejemplo n.º 4
0
def venn(args):
    """
    %prog venn *.benchmark

    Display benchmark results as Venn diagram.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    from matplotlib_venn import venn2

    p = OptionParser(venn.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="9x9")

    if len(args) < 1:
        sys.exit(not p.print_help())

    bcs = args
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    pad = .02
    ystart = 1
    ywidth = 1. / len(bcs)  # height of the band given to each input file
    tags = ("Bowers", "YGOB", "Schnable")
    # Files beyond the third are ignored because zip() truncates to `tags`.
    for bc, tag in zip(bcs, tags):
        data = []
        # Context manager so the benchmark file is closed after parsing.
        with open(bc) as fp:
            for row in fp:
                # Four whitespace-separated fields per row.
                prog, pcounts, tcounts, shared = row.split()
                data.append((prog, int(pcounts), int(tcounts), int(shared)))
        xstart = 0
        # Guard against an empty file, which used to raise ZeroDivisionError.
        xwidth = 1. / len(data) if data else 1.
        for prog, pcounts, tcounts, shared in data:
            # Subsets for venn2: only-prog, only-tag, shared.
            a, b, c = pcounts - shared, tcounts - shared, shared
            ax = fig.add_axes([xstart + pad, ystart - ywidth + pad,
                               xwidth - 2 * pad, ywidth - 2 * pad])
            venn2(subsets=(a, b, c), set_labels=(prog, tag), ax=ax)
            message = "Sn={0} Pu={1}".\
                format(percentage(shared, tcounts, precision=0, mode=-1),
                       percentage(shared, pcounts, precision=0, mode=-1))
            # print() function instead of the Python 2-only `print >>` form.
            print(message, file=sys.stderr)
            ax.text(.5, .92, latex(message), ha="center", va="center",
                    transform=ax.transAxes, color='b')
            ax.set_axis_off()
            xstart += xwidth
        ystart -= ywidth

    # Labels below assume the three-row layout implied by `tags`.
    panel_labels(root, ((.04, .96, "A"), (.04, .96 - ywidth, "B"),
                  (.04, .96 - 2 * ywidth, "C")))
    panel_labels(root, ((.5, .98, "A. thaliana duplicates"),
                        (.5, .98 - ywidth, "14 Yeast genomes"),
                        (.5, .98 - 2 * ywidth, "4 Grass genomes")))
    normalize_axes(root)
    savefig("venn.pdf", dpi=opts.dpi)
Ejemplo n.º 5
0
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fillfile, ) = args
    scaffolds = 0
    gaps = []
    # Lines starting with ">" are counted as scaffold headers; every other
    # line is parsed as one gap record.
    with open(fillfile) as fp:  # ensures the handle is closed after parsing
        for row in fp:
            if row.startswith(">"):
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    print("{0} scaffolds in total".format(scaffolds), file=sys.stderr)

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    print(
        "Closed gaps: {0} size: {1} bp".format(
            percentage(len(closed), totalgaps), thousands(closedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in closed])
    print(ss, file=sys.stderr)

    ss = SummaryStats([x.delta for x in closed])
    print("Delta:", ss, file=sys.stderr)

    print(
        "Remaining gaps: {0} size: {1} bp".format(
            percentage(len(notClosed), totalgaps), thousands(notClosedbp)),
        file=sys.stderr,
    )
    ss = SummaryStats([x.after for x in notClosed])
    print(ss, file=sys.stderr)
Ejemplo n.º 6
0
 def print_stats(self):
     """Write the AL50 line and four identity percentages to stderr.

     Each percentage is `percentage(self.identicals, X)` where X is, in
     order, the query coverage, reference coverage, query span and
     reference span stored on the instance.
     """
     denominators = (
         ("Query coverage: {}", self.qrycovered),
         ("Reference coverage: {}", self.refcovered),
         ("Query span: {}", self.qryspan),
         ("Reference span: {}", self.refspan),
     )
     report = [
         "AL50 (>=50% of bases in alignment blocks >= this size): {}".format(
             self.AL50
         )
     ]
     for template, denominator in denominators:
         report.append(template.format(percentage(self.identicals, denominator)))
     # One write, one line per statistic.
     print("\n".join(report), file=sys.stderr)
Ejemplo n.º 7
0
def filter(args):
    """
    %prog filter frgfile idsfile

    Removes the reads from frgfile that are indicated as duplicates in the
    clstrfile (generated by CD-HIT-454). `idsfile` includes a set of names to
    include in the filtered frgfile. See apps.cdhit.ids().
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(filter.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    frgfile, idsfile = args
    assert frgfile.endswith(".frg")

    # One allowed read name per line of idsfile.
    with open(idsfile) as fp:
        allowed = set(x.strip() for x in fp)
    logging.debug("A total of {0} allowed ids loaded.".format(len(allowed)))

    # Swap only the trailing extension; str.replace() would also rewrite any
    # earlier ".frg" occurring in the path.
    newfrgfile = frgfile[:-len(".frg")] + ".filtered.frg"

    nfrags, discarded_frags = 0, 0
    nmates, discarded_mates = 0, 0
    # Both handles are closed (and the output flushed) when the block exits.
    with open(frgfile) as fp, open(newfrgfile, "w") as fw:
        for rec in iter_records(fp):
            if rec.type == "FRG":
                readname = rec.get_field("acc")
                # Drop any trailing "a"/"b" characters before the lookup.
                readname = readname.rstrip("ab")
                nfrags += 1
                if readname not in allowed:
                    discarded_frags += 1
                    continue
            if rec.type == "LKG":
                readname = rec.get_field("frg")
                readname = readname.rstrip("ab")
                nmates += 1
                if readname not in allowed:
                    discarded_mates += 1
                    continue
            # Records of any other type are always copied through.
            print(rec, file=fw)

    # Print out a summary
    survived_frags = nfrags - discarded_frags
    survived_mates = nmates - discarded_mates
    print("Survived fragments: {0}".format(
        percentage(survived_frags, nfrags)), file=sys.stderr)
    print("Survived mates: {0}".format(
        percentage(survived_mates, nmates)), file=sys.stderr)
Ejemplo n.º 8
0
Archivo: bed.py Proyecto: radaniba/jcvi
def fix(args):
    """
    %prog fix bedfile > newbedfile

    Fix non-standard bed files. One typical problem is start > end.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(fix.__doc__)
    p.add_option("--minspan",
                 default=0,
                 type="int",
                 help="Enforce minimum span [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    minspan = opts.minspan
    nfixed = nfiltered = ntotal = 0
    with open(bedfile) as fp:  # input handle is closed when the loop ends
        fw = must_open(opts.outfile, "w")
        for row in fp:
            atoms = row.strip().split("\t")
            assert len(atoms) >= 3, "Must be at least 3 columns"
            seqid, start, end = atoms[:3]
            start, end = int(start), int(end)
            orientation = '+'
            if start > end:
                # Swap the coordinates and remember the row was reversed.
                start, end = end, start
                orientation = '-'
                nfixed += 1

            atoms[1:3] = [str(start), str(end)]
            # NOTE(review): index 6 is the 7th column, while the standard BED
            # layout keeps the strand in the 6th column (index 5) -- confirm
            # which column is intended before changing this.
            if len(atoms) > 6:
                atoms[6] = orientation
            line = "\t".join(atoms)
            b = BedLine(line)

            # nfiltered counts the rows that pass --minspan and are written.
            if b.span >= minspan:
                print(b, file=fw)
                nfiltered += 1

            ntotal += 1

    if nfixed:
        logging.debug("Total fixed: {0}".format(percentage(nfixed, ntotal)))
    if nfiltered:
        logging.debug("Total filtered: {0}".format(
            percentage(nfiltered, ntotal)))
Ejemplo n.º 9
0
 def print_stats(self):
     """Print AL50 plus coverage/span identity percentages to stderr.

     Five lines are emitted in one write: the AL50 value, then
     `percentage(self.identicals, X)` for the query coverage, reference
     coverage, query span and reference span.
     """
     qrycovered, refcovered = self.qrycovered, self.refcovered
     qryspan, refspan = self.qryspan, self.refspan

     def line(template, denominator):
         # Format one "<label>: <percentage>" row.
         return template.format(percentage(self.identicals, denominator))

     messages = (
         "AL50 (>=50% of bases in alignment blocks >= this size): {}".format(
             self.AL50),
         line("Query coverage: {}", qrycovered),
         line("Reference coverage: {}", refcovered),
         line("Query span: {}", qryspan),
         line("Reference span: {}", refspan),
     )
     print("\n".join(messages), file=sys.stderr)
Ejemplo n.º 10
0
Archivo: bed.py Proyecto: radaniba/jcvi
def filter(args):
    """
    %prog filter bedfile

    Filter the bedfile to retain records between certain size range.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=0,
                 type="int",
                 help="Minimum feature length")
    # Help text previously said "Minimum" for this upper bound.
    p.add_option("--maxsize",
                 default=1000000000,
                 type="int",
                 help="Maximum feature length")
    p.add_option(
        "--minaccn",
        type="int",
        help="Minimum value of accn, useful to filter based on coverage")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    fp = must_open(bedfile)
    fw = must_open(opts.outfile, "w")
    minsize, maxsize = opts.minsize, opts.maxsize
    minaccn = opts.minaccn
    total = []  # span of every record read
    keep = []   # span of every record written
    for row in fp:
        b = BedLine(row)
        span = b.span
        total.append(span)
        if not minsize <= span <= maxsize:
            continue
        # The accn check only applies when --minaccn is given and non-zero.
        if minaccn and int(b.accn) < minaccn:
            continue
        # print() function instead of the Python 2-only `print >> fw` form.
        print(b, file=fw)
        keep.append(span)

    logging.debug("Stats: {0} features kept.".\
                    format(percentage(len(keep), len(total))))
    logging.debug("Stats: {0} bases kept.".\
                    format(percentage(sum(keep), sum(total))))
Ejemplo n.º 11
0
def gc(seqs):
    """Return `percentage` of G+C among the A/C/G/T characters of `seqs`.

    Counting is case-insensitive; characters other than A, C, G and T do
    not contribute to the denominator.
    """
    gc_bases = acgt_bases = 0
    for seq in seqs:
        seq = seq.upper()
        strong = seq.count('G') + seq.count('C')
        weak = seq.count('A') + seq.count('T')
        gc_bases += strong
        acgt_bases += strong + weak
    return percentage(gc_bases, acgt_bases, precision=0, mode=-1)
Ejemplo n.º 12
0
def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Plot a histogram of `data` binned by int(log(value, base)).

    bins is a dictionary with key: log(x, base), value: counts. Each bin is
    handed to asciiplot() as the range (base ** k, base ** (k + 1)) together
    with its count.

    data: sized iterable of numbers; log() is applied to every value.
    base: logarithm base used for binning.
    ascii: accepted but not used by this function.
    title: title forwarded to asciiplot().
    summary: when true, first report len(data) as a percentage of sum(data)
        on stderr.
    """
    from jcvi.utils.cbook import percentage

    if summary:
        unique = len(data)
        total = sum(data)

        # Print out a distribution; print() replaces the Python 2-only
        # `print >> sys.stderr` statement.
        print("Unique: {0}".format(percentage(unique, total)), file=sys.stderr)

    bins = defaultdict(int)
    for d in data:
        logd = int(log(d, base))
        bins[logd] += 1

    x, y = [], []
    for size, number in sorted(bins.items()):
        lb, ub = base ** size, base ** (size + 1)
        x.append((lb, ub))
        y.append(number)

    asciiplot(x, y, title=title)
Ejemplo n.º 13
0
def gc(seqs):
    """Compute the G+C fraction of `seqs` as formatted by `percentage`.

    Sequences are upper-cased first; only A, C, G and T are tallied, so any
    other character is ignored in both numerator and denominator.
    """
    tally = {'A': 0, 'C': 0, 'G': 0, 'T': 0}
    for sequence in seqs:
        upper = sequence.upper()
        for base in tally:
            tally[base] += upper.count(base)
    gc_count = tally['G'] + tally['C']
    acgt_count = sum(tally.values())
    return percentage(gc_count, acgt_count, precision=0, mode=-1)
Ejemplo n.º 14
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]  # output directory: name up to the first "."
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Write a copy of the bed file that keeps only the first row seen for
        # each `longname`.
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed even if a row fails to parse.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register three commands with the make manager: the index build, the
    # STR info table, and a copy of the bed file as index.info.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 15
0
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()
    for rec in iter_fastq(fastqfile):
        # A None record ends the stream; test for it before counting so it is
        # not included in the read total.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        # print() function instead of the Python 2-only `print >> fw` form.
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".\
                  format(percentage(nduplicates, nreads)))
Ejemplo n.º 16
0
def mismatches(args):
    """
    %prog mismatches blastfile

    Print out histogram of mismatches of HSPs, usually for evaluating SNP level.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import percentage
    from jcvi.graphics.histogram import stem_leaf_plot

    parser = OptionParser(mismatches.__doc__)
    opts, args = parser.parse_args(args)
    if len(args) != 1:
        sys.exit(not parser.print_help())

    (blastfile,) = args

    # One value per best hit: mismatches plus gaps.
    best_hits = Blast(blastfile).iter_best_hit()
    data = [hit.nmismatch + hit.ngaps for _, hit in best_hits]

    # Share of best hits carrying at least one mismatch or gap.
    npolymorphic = sum(1 for value in data if value != 0)
    title = "Polymorphic sites: {0}".format(percentage(npolymorphic, len(data)))
    stem_leaf_plot(data, 0, 20, 20, title=title)
Ejemplo n.º 17
0
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    # Sample keys that already have a .seg file under the upload location:
    # the basename up to its first ".".
    computed = set(op.basename(x).split(".")[0] for x in glob_s3(store))

    # Generate a bunch of cn commands
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    with open(samples) as fp:  # closed once every row has been handled
        for row in fp:
            row = row.strip()
            if not row:
                # Tolerate blank lines instead of failing on the unpack below.
                continue
            samplekey, path = row.split(",")
            ntotal += 1
            if samplekey in computed:
                nskipped += 1
                continue
            print(" ".join((cmd, samplekey, path)))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
Ejemplo n.º 18
0
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # A None record ends the stream; check before counting so it is not
        # added to the read total.
        if rec is None:
            break
        nreads += 1
        if rec.seq.endswith(sf):
            # print() function instead of the Python 2-only `print >> fw`.
            print(rec, file=fw)
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".\
                  format(sf, percentage(nselected, nreads)))
Ejemplo n.º 19
0
Archivo: cnv.py Proyecto: xuanblo/jcvi
def batchcn(args):
    """
    %prog batchcn workdir samples.csv

    Run CNV segmentation caller in batch mode. Scans a workdir.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    p = OptionParser(batchcn.__doc__)
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    workdir, samples = args
    upload = opts.upload
    store = upload + "/{}/*.seg".format(workdir)
    # Keys of samples whose .seg result already exists in the store.
    computed = [op.basename(x).split(".")[0] for x in glob_s3(store)]
    computed = set(computed)

    # Generate a bunch of cn commands
    nskipped = ntotal = 0
    cmd = "python -m jcvi.variation.cnv cn --hmm --cleanup {}".format(workdir)
    with open(samples) as fp:
        for row in fp:
            row = row.strip()
            if not row:
                # Skip blank lines; they cannot be split into key and path.
                continue
            samplekey, path = row.split(",")
            ntotal += 1
            if samplekey in computed:
                nskipped += 1
                continue
            # print() call: the bare `print "..."` statement is a syntax
            # error on Python 3.
            print(" ".join((cmd, samplekey, path)))

    logging.debug("Skipped: {}".format(percentage(nskipped, ntotal)))
Ejemplo n.º 20
0
def suffix(args):
    """
    %prog suffix fastqfile CAG

    Filter reads based on suffix.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    from jcvi.utils.cbook import percentage

    p = OptionParser(suffix.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastqfile, sf = args
    fw = must_open(opts.outfile, "w")
    nreads = nselected = 0
    for rec in iter_fastq(fastqfile):
        # Stop on the None record before counting it as a read.
        if rec is None:
            break
        nreads += 1
        # Keep only reads whose sequence ends with the requested suffix.
        if rec.seq.endswith(sf):
            print(rec, file=fw)  # was the Python 2-only `print >> fw, rec`
            nselected += 1
    logging.debug("Selected reads with suffix {0}: {1}".format(
        sf, percentage(nselected, nreads)))
Ejemplo n.º 21
0
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best",
                 default=1,
                 type="int",
                 help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    seen = defaultdict(int)  # query -> number of rows already written
    retained = total = 0
    with open(m4file) as fp:  # input handle is closed on exit
        fw = must_open(opts.outfile, "w")
        try:
            for row in fp:
                r = M4Line(row)
                total += 1
                # Progress report every 100000 input rows.
                if total % 100000 == 0:
                    logging.debug("Retained {0} lines".\
                                    format(percentage(retained, total)))
                # Keep the first `best` rows seen for each query, in file
                # order. .get() avoids inserting keys for rejected rows.
                if seen.get(r.query, 0) < best:
                    fw.write(row)
                    seen[r.query] += 1
                    retained += 1
        finally:
            # Close the output even when a row fails to parse.
            fw.close()
Ejemplo n.º 22
0
def gc(seqs):
    """Return the G+C percentage of `seqs`, formatted by `percentage`.

    Each sequence is upper-cased; the denominator is the number of A, C, G
    and T characters, so anything else is excluded.
    """
    n_gc = n_acgt = 0
    for raw in seqs:
        bases = raw.upper()
        g_and_c = bases.count("G") + bases.count("C")
        a_and_t = bases.count("A") + bases.count("T")
        n_gc += g_and_c
        n_acgt += g_and_c + a_and_t
    return percentage(n_gc, n_acgt, precision=0, mode=-1)
Ejemplo n.º 23
0
Archivo: bed.py Proyecto: radaniba/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    with open(sortedbedfile) as fp:  # closed once all pairs are processed
        # Compare each line of the sorted file with the line that follows it.
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            # Both ranges are given "+" orientation for the distance call.
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            dist, _ = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:
                # print() call: `print dist` is a syntax error on Python 3.
                print(dist)
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
Ejemplo n.º 24
0
def filterm4(args):
    """
    %prog filterm4 sample.m4 > filtered.m4

    Filter .m4 file after blasr is run. As blasr takes a long time to run,
    changing -bestn is undesirable. This screens the m4 file to retain top hits.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    p = OptionParser(filterm4.__doc__)
    p.add_option("--best", default=1, type="int", help="Only retain best N hits")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    m4file, = args
    best = opts.best
    # Number of rows already written for each query.
    seen = defaultdict(int)
    retained = total = 0
    # The input is managed by `with`; the output is closed in `finally` so a
    # parse error does not leave it open.
    with open(m4file) as fp:
        fw = must_open(opts.outfile, "w")
        try:
            for row in fp:
                r = M4Line(row)
                total += 1
                if total % 100000 == 0:
                    # Periodic progress message.
                    logging.debug("Retained {0} lines".\
                                    format(percentage(retained, total)))
                # Rows are kept in file order until a query has `best` rows.
                if seen.get(r.query, 0) < best:
                    fw.write(row)
                    seen[r.query] += 1
                    retained += 1
        finally:
            fw.close()
Ejemplo n.º 25
0
def mismatches(args):
    """
    %prog mismatches blastfile

    Print out histogram of mismatches of HSPs, usually for evaluating SNP level.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import percentage
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(mismatches.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blastfile, = args

    # One entry per best hit: mismatches plus gaps. The unused `matches`
    # counter and the unused query name are dropped.
    b = Blast(blastfile)
    data = [bline.nmismatch + bline.ngaps for _, bline in b.iter_best_hit()]

    # Best hits with at least one mismatch or gap.
    nonzeros = [x for x in data if x != 0]
    title = "Polymorphic sites: {0}".\
            format(percentage(len(nonzeros), len(data)))
    stem_leaf_plot(data, 0, 20, 20, title=title)
Ejemplo n.º 26
0
def loghistogram(data, base=2, ascii=True, title="Counts", summary=False):
    """
    Bin `data` on a logarithmic scale and draw the counts with asciiplot().

    bins is a dictionary with key: log(x, base), value: counts. The key is
    int(log(x, base)); each bin is plotted as the range
    (base ** key, base ** (key + 1)).

    data: sized iterable of numbers passed through log().
    base: base of the logarithm.
    ascii: present in the signature but not read here.
    title: forwarded to asciiplot().
    summary: if true, print len(data) relative to sum(data) on stderr first.
    """
    from jcvi.utils.cbook import percentage

    if summary:
        unique = len(data)
        total = sum(data)

        # Print out a distribution. Uses the print() function; the previous
        # `print >> sys.stderr` statement fails on Python 3.
        print("Unique: {0}".format(percentage(unique, total)), file=sys.stderr)

    bins = defaultdict(int)
    for d in data:
        bins[int(log(d, base))] += 1

    # Build the plot inputs in ascending bin order.
    x, y = [], []
    for size, number in sorted(bins.items()):
        x.append((base ** size, base ** (size + 1)))
        y.append(number)

    asciiplot(x, y, title=title)
Ejemplo n.º 27
0
Archivo: bed.py Proyecto: yangjl/jcvi
def distance(args):
    """
    %prog distance bedfile

    Calculate distance between bed features. The output file is a list of
    distances, which can be used to plot histogram, etc.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    from jcvi.utils.iter import pairwise

    p = OptionParser(distance.__doc__)
    p.add_option("--distmode", default="ss", choices=("ss", "ee"),
            help="Distance mode between paired reads. ss is outer distance, " \
                 "ee is inner distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    sortedbedfile = sort([bedfile])
    valid = total = 0
    # Context manager so the sorted file is closed after the scan.
    with open(sortedbedfile) as fp:
        # Each line is compared with its successor in the sorted file.
        for a, b in pairwise(fp):
            a = BedLine(a)
            b = BedLine(b)
            ar = (a.seqid, a.start, a.end, "+")
            br = (b.seqid, b.start, b.end, "+")
            # Only the distance is used; the second return value is ignored.
            dist, _ = range_distance(ar, br, distmode=opts.distmode)
            total += 1
            if dist > 0:
                # Only strictly positive distances are printed and counted.
                print(dist)  # was the Python 2-only `print dist`
                valid += 1

    logging.debug("Total valid (> 0) distances: {0}.".\
                  format(percentage(valid, total)))
Ejemplo n.º 28
0
def uniq(args):
    """
    %prog uniq fastqfile

    Retain only first instance of duplicate reads. Duplicate is defined as
    having the same read name.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    p = OptionParser(uniq.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    fw = must_open(opts.outfile, "w")
    nduplicates = nreads = 0
    seen = set()  # read names already written
    for rec in iter_fastq(fastqfile):
        # Stop on the None record *before* counting it; counting first made
        # the reported total one too high.
        if rec is None:
            break
        nreads += 1
        name = rec.name
        if name in seen:
            nduplicates += 1
            continue
        seen.add(name)
        print(rec, file=fw)
    logging.debug("Removed duplicate reads: {}".\
                  format(percentage(nduplicates, nreads)))
Ejemplo n.º 29
0
Archivo: bed.py Proyecto: yangjl/jcvi
def fix(args):
    """
    %prog fix bedfile > newbedfile

    Fix non-standard bed files. One typical problem is start > end.
    """
    # NOTE: the docstring doubles as the OptionParser usage string.
    p = OptionParser(fix.__doc__)
    p.add_option("--minspan", default=0, type="int",
                 help="Enforce minimum span [default: %default]")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    minspan = opts.minspan
    nfixed = nfiltered = ntotal = 0
    # Context manager so the input file is closed after the scan.
    with open(bedfile) as fp:
        fw = must_open(opts.outfile, "w")
        for row in fp:
            atoms = row.strip().split("\t")
            assert len(atoms) >= 3, "Must be at least 3 columns"
            seqid, start, end = atoms[:3]
            start, end = int(start), int(end)
            orientation = '+'
            if start > end:
                # Reversed interval: swap and record the orientation as "-".
                start, end = end, start
                orientation = '-'
                nfixed += 1

            atoms[1:3] = [str(start), str(end)]
            # NOTE(review): this writes the 7th column (index 6); standard BED
            # stores the strand in the 6th column (index 5) -- confirm intent.
            if len(atoms) > 6:
                atoms[6] = orientation
            line = "\t".join(atoms)
            b = BedLine(line)

            # Rows meeting --minspan are written; nfiltered counts those.
            if b.span >= minspan:
                print(b, file=fw)  # was the Python 2-only `print >> fw, b`
                nfiltered += 1

            ntotal += 1

    if nfixed:
        logging.debug("Total fixed: {0}".format(percentage(nfixed, ntotal)))
    if nfiltered:
        logging.debug("Total filtered: {0}".format(percentage(nfiltered, ntotal)))
Ejemplo n.º 30
0
def stats(args):
    """
    %prog stats blocksfile

    Provide statistics for MCscan-style blocks. The count of homologs in each
    pivot gene is recorded.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import percentage

    p = OptionParser(stats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blocksfile, = args
    counts = defaultdict(int)  # number of non-"." hits -> number of rows
    total = orthologous = 0
    with open(blocksfile) as fp:  # closed once all rows are tallied
        for row in fp:
            atoms = row.rstrip().split("\t")
            # Column 0 is skipped; "." marks an empty column.
            hits = [x for x in atoms[1:] if x != '.']
            counts[len(hits)] += 1
            total += 1
            # A row counts as orthologous when its second column is filled.
            if atoms[1] != '.':
                orthologous += 1

    print("Total lines: {0}".format(total), file=sys.stderr)
    for i, n in sorted(counts.items()):
        print("Count {0}: {1}".format(i, percentage(n, total)),
              file=sys.stderr)

    print(file=sys.stderr)

    # Second table: the same distribution restricted to rows with >= 1 hit.
    matches = sum(n for i, n in counts.items() if i != 0)
    print("Total lines with matches: {0}".\
                format(percentage(matches, total)), file=sys.stderr)
    for i, n in sorted(counts.items()):
        if i == 0:
            continue

        print("Count {0}: {1}".format(i, percentage(n, matches)),
              file=sys.stderr)

    print(file=sys.stderr)
    print("Orthologous matches: {0}".\
                format(percentage(orthologous, matches)), file=sys.stderr)
Ejemplo n.º 31
0
def fillstats(args):
    """
    %prog fillstats genome.fill

    Build stats on .fill file from GapCloser.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import SummaryStats, percentage, thousands

    p = OptionParser(fillstats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fillfile, = args
    scaffolds = 0
    gaps = []
    # Context manager so the handle is released even if a row is malformed.
    with open(fillfile) as fp:
        for row in fp:
            # Lines starting with '>' are counted as scaffolds; every other
            # line is parsed as one gap record.
            if row[0] == ">":
                scaffolds += 1
                continue
            gaps.append(FillLine(row))

    # Stream writes instead of the Python 2 print statement, so the function
    # parses and behaves the same on Python 2 and 3.
    report = sys.stderr.write
    report("{0} scaffolds in total\n".format(scaffolds))

    closed = [x for x in gaps if x.closed]
    closedbp = sum(x.before for x in closed)
    notClosed = [x for x in gaps if not x.closed]
    notClosedbp = sum(x.before for x in notClosed)

    totalgaps = len(closed) + len(notClosed)

    report("Closed gaps: {0} size: {1} bp\n".format(
        percentage(len(closed), totalgaps), thousands(closedbp)))
    ss = SummaryStats([x.after for x in closed])
    report(str(ss) + "\n")

    ss = SummaryStats([x.delta for x in closed])
    report("Delta: " + str(ss) + "\n")

    report("Remaining gaps: {0} size: {1} bp\n".format(
        percentage(len(notClosed), totalgaps), thousands(notClosedbp)))
    ss = SummaryStats([x.after for x in notClosed])
    report(str(ss) + "\n")
Ejemplo n.º 32
0
def stats(args):
    """
    %prog stats blocksfile

    Provide statistics for MCscan-style blocks. The count of homologs in each
    pivot gene is recorded.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.utils.cbook import percentage

    p = OptionParser(stats.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    blocksfile, = args
    counts = defaultdict(int)  # number of non-'.' columns -> number of rows
    total = orthologous = 0
    # Context manager so the handle is released even if a row is malformed.
    with open(blocksfile) as fp:
        for row in fp:
            atoms = row.rstrip().split("\t")
            # Column 0 is skipped; '.' marks an empty slot in the others.
            hits = [x for x in atoms[1:] if x != '.']
            counts[len(hits)] += 1
            total += 1
            # A filled first column is what the report calls "orthologous".
            if atoms[1] != '.':
                orthologous += 1

    # Stream writes instead of the Python 2 print statement, so the function
    # parses and behaves the same on Python 2 and 3.
    report = sys.stderr.write
    report("Total lines: {0}\n".format(total))
    for i, n in sorted(counts.items()):
        report("Count {0}: {1}\n".format(i, percentage(n, total)))

    report("\n")

    # Second breakdown: same histogram, restricted to rows with >= 1 hit.
    matches = sum(n for i, n in counts.items() if i != 0)
    report("Total lines with matches: {0}\n".format(
        percentage(matches, total)))
    for i, n in sorted(counts.items()):
        if i == 0:
            continue

        report("Count {0}: {1}\n".format(i, percentage(n, matches)))

    report("\n")
    report("Orthologous matches: {0}\n".format(
        percentage(orthologous, matches)))
Ejemplo n.º 33
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    # Output directory / index prefix is the FASTA name up to the first dot.
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Both handles are closed on every path, including parse errors.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                # Only the first record carrying a given longname is kept.
                if name in seen:
                    continue
                seen.add(name)
                # write() rather than the Python 2 `print >>` statement.
                newbed.write(str(r) + "\n")
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make rules: the lobSTR reference, the STR info table, and a copy
    # of the bed file stored alongside the index.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    # NOTE(review): the rule is keyed on trfbed while the command copies
    # newbedfile; confirm this is intended when --notreds is given.
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 34
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq",
                 action="store_true",
                 default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        # `total` counts input records, `retained` counts emitted records;
        # one input record may yield several (or zero) outputs.
        retained = total = 0
        # Both handles are closed on every path, including parse errors.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # write() rather than the Python 2 `print >>` statement.
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make rules: the lobSTR reference, the STR info table, and a copy
    # of the original bed file stored alongside the index.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 35
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gff_file, ref = args
    s = Fasta(ref)
    g = make_index(gff_file)
    geneseqs, exonseqs, intronseqs = [], [], []  # Calc % GC
    for f in g.features_of_type("gene"):
        fid = f.id
        fseq = s.sequence({'chr': f.chrom, 'start': f.start, 'stop': f.stop})
        geneseqs.append(fseq)
        # A set collapses exons that share coordinates, so each distinct
        # interval under this gene is counted once.
        exons = set((c.chrom, c.start, c.stop) for c in g.children(fid, 2)
                    if c.featuretype == "exon")
        exons = list(exons)
        for chrom, start, stop in exons:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            exonseqs.append(fseq)
        # Introns are taken as whatever range_interleave returns for the exons.
        introns = range_interleave(exons)
        for chrom, start, stop in introns:
            fseq = s.sequence({'chr': chrom, 'start': start, 'stop': stop})
            intronseqs.append(fseq)

    r = {}  # Report, keyed by (feature type, statistic name)
    for t, tseqs in zip(("Gene", "Exon", "Intron"),
                        (geneseqs, exonseqs, intronseqs)):
        tsizes = [len(x) for x in tseqs]
        tsummary = SummaryStats(tsizes, dtype="int")
        r[t, "Number"] = tsummary.size
        r[t, "Average size (bp)"] = tsummary.mean
        r[t, "Median size (bp)"] = tsummary.median
        r[t, "Total length (Mb)"] = human_size(tsummary.sum,
                                               precision=0,
                                               target="Mb")
        r[t, "% of genome"] = percentage(tsummary.sum,
                                         s.totalsize,
                                         precision=0,
                                         mode=-1)
        r[t, "% GC"] = gc(tseqs)

    # Stream write instead of the Python 2 print statement, so the function
    # parses and behaves the same on Python 2 and 3.
    sys.stderr.write(str(tabulate(r)) + "\n")
Ejemplo n.º 36
0
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(summary.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (fastafile, ) = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50  # percent of a record that must be masked to flag it
    for key, seq in f.iteritems():
        # Anything other than upper-case A/G/C/T counts as masked.
        masked = sum(1 for base in seq if base not in "AGCT")
        seqlen = len(seq)
        # A zero-length record cannot exceed the cutoff; the guard avoids a
        # ZeroDivisionError on such records.
        if seqlen and masked * 100.0 / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    print(
        "Total masked bases: {0}".format(percentage(allmasked, allbases)),
        file=sys.stderr,
    )
    print(
        "Total masked sequences (contain > {0}% masked): {1}".format(
            cutoff, percentage(maskedseqnum, seqnum)),
        file=sys.stderr,
    )
Ejemplo n.º 37
0
Archivo: bed.py Proyecto: yangjl/jcvi
def filter(args):
    """
    %prog filter bedfile

    Filter the bedfile to retain records between certain size range.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=0, type="int",
                 help="Minimum feature length")
    # Help text previously read "Minimum", which described the wrong bound.
    p.add_option("--maxsize", default=1000000000, type="int",
                 help="Maximum feature length")
    p.add_option("--minaccn", type="int",
                 help="Minimum value of accn, useful to filter based on coverage")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bedfile, = args
    fp = must_open(bedfile)
    fw = must_open(opts.outfile, "w")
    minsize, maxsize = opts.minsize, opts.maxsize
    minaccn = opts.minaccn
    total = []  # spans of every record read
    keep = []   # spans of the records written out
    for row in fp:
        b = BedLine(row)
        span = b.span
        total.append(span)
        if not minsize <= span <= maxsize:
            continue
        # The accn filter applies only when --minaccn is given and non-zero.
        if minaccn and int(b.accn) < minaccn:
            continue
        # write() rather than the Python 2 `print >>` statement.
        fw.write(str(b) + "\n")
        keep.append(span)

    logging.debug("Stats: {0} features kept.".
                  format(percentage(len(keep), len(total))))
    logging.debug("Stats: {0} bases kept.".
                  format(percentage(sum(keep), sum(total))))
Ejemplo n.º 38
0
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    computed = ls_s3(store)
    # The leading words of every emitted command are constant; build once.
    base_cmd = "python -m jcvi.variation.str lobstr".split()
    skipped = total = 0
    # Context manager so the file is closed even if a row fails to parse.
    with open(samplesfile) as fp:
        for row in fp:
            total += 1
            sample, s3file = row.strip().split(",")[:2]
            # Sample names have the form <exec_id>_<sample_id>.
            exec_id, sample_id = sample.split("_")
            bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

            gzfile = sample + ".{0}.vcf.gz".format("hg38")
            # Skip samples whose output name is already in the store listing.
            if gzfile in computed:
                skipped += 1
                continue

            print(
                opts.sep.join(
                    base_cmd
                    + [
                        "hg38",
                        "--input_bam_path",
                        bamfile,
                        "--output_path",
                        store,
                        "--sample_id",
                        sample_id,
                        "--workflow_execution_id",
                        exec_id,
                        "--lobstr_home",
                        opts.lobstr_home,
                        "--workdir",
                        opts.workdir,
                    ]
                )
            )
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
Ejemplo n.º 39
0
Archivo: str.py Proyecto: Hensonmw/jcvi
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        # `total` counts input records, `retained` counts emitted records;
        # one input record may yield several (or zero) outputs.
        retained = total = 0
        # Both handles are closed on every path, including parse errors.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # write() rather than the Python 2 `print >>` statement.
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Three make rules: the lobSTR reference, the STR info table, and a copy
    # of the original bed file stored alongside the index.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Ejemplo n.º 40
0
    def header(self):
        """Return a multi-line text summary of the read pairing and libraries.

        The text has six lines (paired-read percentage, library names,
        library stats, r1, r2, libs) and no trailing newline.
        """
        from jcvi.utils.cbook import percentage

        # Each pair contributes two reads, hence npairs * 2 against nreads.
        paired = percentage(self.npairs * 2, self.nreads)
        libnames = ", ".join(self.libnames)
        parts = [
            "Number of paired reads: {0}\n".format(paired),
            "Libraries: {0}\n".format(libnames),
            "LibraryStats: {0}\n".format(self.libstats),
            "r1: {0}\n".format(self.r1),
            "r2: {0}\n".format(self.r2),
            "libs: {0}".format(self.libs),
        ]
        return "".join(parts)
Ejemplo n.º 41
0
 def export_table(self, r, mapname, total):
     """Fill the `mapname` column of report mapping `r` with this map's stats.

     `r` is mutated in place; each entry is keyed by (row label, mapname).
     `total` is the denominator for the "Total bases" percentage.
     """
     # Marker density is reported as 0 when no bases are covered, which
     # also avoids dividing by zero.
     if self.total_bases:
         density = self.num_markers * 1e6 / self.total_bases
     else:
         density = 0

     r["Markers (unique)", mapname] = self.num_markers
     r["Markers per Mb", mapname] = density
     r["Scaffolds", mapname] = self.num_scaffolds
     r["N50 Scaffolds", mapname] = self.num_n50_scaffolds
     r["Total bases", mapname] = percentage(self.total_bases, total, mode=1)
     r["Scaffolds with 1 marker", mapname] = self.scaffold_1m
     r["Scaffolds with 2 markers", mapname] = self.scaffold_2m
     r["Scaffolds with 3 markers", mapname] = self.scaffold_3m
     r["Scaffolds with >=4 markers", mapname] = self.scaffold_4m
Ejemplo n.º 42
0
 def export_table(self, r, mapname, total):
     """Write this map's statistics into report `r` under column `mapname`.

     `r` is updated in place with (row label, mapname) keys; `total` is the
     base count that "Total bases" is expressed against.
     """
     bases = self.total_bases
     # Guard the density so a map with zero bases reports 0 instead of
     # raising ZeroDivisionError.
     per_mb = self.num_markers * 1e6 / bases if bases else 0

     r["Markers (unique)", mapname] = self.num_markers
     r["Markers per Mb", mapname] = per_mb
     r["Scaffolds", mapname] = self.num_scaffolds
     r["N50 Scaffolds", mapname] = self.num_n50_scaffolds
     r["Total bases", mapname] = percentage(self.total_bases, total, mode=1)
     r["Scaffolds with 1 marker", mapname] = self.scaffold_1m
     r["Scaffolds with 2 markers", mapname] = self.scaffold_2m
     r["Scaffolds with 3 markers", mapname] = self.scaffold_3m
     r["Scaffolds with >=4 markers", mapname] = self.scaffold_4m
Ejemplo n.º 43
0
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize",
                 default=2,
                 type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # Per-file prefix (s000, s001, ...) keeps record ids unique when
        # several inputs are merged into one output.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                # A singleton counts as one cluster holding one read and is
                # never written out.
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the leading id token from the description before writing.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".format(
            nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file with only singletons (or no records) has no multi-read
        # clusters; report an average of 0 rather than dividing by zero.
        avg = creads / cclusters if cclusters else 0
        logging.debug(
            "Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
            format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".format(
        percentage(totalassembled, totalreads)))
Ejemplo n.º 44
0
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samplesfile, = args
    store = opts.output_path
    computed = ls_s3(store)
    # The leading words of every emitted command are constant; build once.
    base_cmd = "python -m jcvi.variation.str lobstr".split()
    skipped = total = 0
    # Context manager so the file is closed even if a row fails to parse.
    with open(samplesfile) as fp:
        for row in fp:
            total += 1
            sample, s3file = row.strip().split(",")[:2]
            # Sample names have the form <exec_id>_<sample_id>.
            exec_id, sample_id = sample.split("_")
            bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

            gzfile = sample + ".{0}.vcf.gz".format("hg38")
            # Skip samples whose output name is already in the store listing.
            if gzfile in computed:
                skipped += 1
                continue

            cmd = opts.sep.join(
                base_cmd
                + [
                    "hg38",
                    "--input_bam_path",
                    bamfile,
                    "--output_path",
                    store,
                    "--sample_id",
                    sample_id,
                    "--workflow_execution_id",
                    exec_id,
                    "--lobstr_home",
                    opts.lobstr_home,
                    "--workdir",
                    opts.workdir,
                ]
            )
            # Single-argument print(...) produces the same output under the
            # Python 2 statement and the Python 3 function.
            print(cmd)
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
Ejemplo n.º 45
0
Archivo: mask.py Proyecto: bennyyu/jcvi
def summary(args):
    """
    %prog summary fastafile

    Report the number of bases and sequences masked.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(summary.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    f = Fasta(fastafile, index=False)

    halfmaskedseqs = set()
    allmasked = 0
    allbases = 0
    cutoff = 50  # percent of a record that must be masked to flag it
    for key, seq in f.iteritems():
        # Anything other than upper-case A/G/C/T counts as masked.
        masked = sum(1 for base in seq if base not in "AGCT")
        seqlen = len(seq)
        # A zero-length record cannot exceed the cutoff; the guard avoids a
        # ZeroDivisionError on such records.
        if seqlen and masked * 100. / seqlen > cutoff:
            halfmaskedseqs.add(key)
        allmasked += masked
        allbases += seqlen

    seqnum = len(f)
    maskedseqnum = len(halfmaskedseqs)

    # Stream writes instead of the Python 2 print statement, so the function
    # parses and behaves the same on Python 2 and 3.
    sys.stderr.write("Total masked bases: {0}\n".format(
        percentage(allmasked, allbases)))
    sys.stderr.write(
        "Total masked sequences (contain > {0}% masked): {1}\n".format(
            cutoff, percentage(maskedseqnum, seqnum)))
Ejemplo n.º 46
0
def validate(args):
    """
    %prog validate imputed.vcf withheld.vcf

    Validate imputation against withheld variants.
    """
    # The docstring above is also the usage text handed to OptionParser.
    p = OptionParser(validate.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    imputed, withheld = args

    # Truth set: (seqid, pos) -> genotype from the withheld file.
    register = {}
    # Context manager so the handle is released even if a row is malformed.
    with open(withheld) as fp:
        for row in fp:
            if row[0] == "#":
                continue
            v = VcfLine(row)
            register[(v.seqid, v.pos)] = v.genotype

    logging.debug("Imported {0} records from `{1}`".
                  format(len(register), withheld))

    fp = must_open(imputed)
    hit = concordant = 0
    seen = set()
    for row in fp:
        if row[0] == "#":
            continue
        v = VcfLine(row)
        site = (v.seqid, v.pos)
        # Only the first record at each site is evaluated.
        if site in seen:
            continue
        seen.add(site)
        if site not in register:
            continue
        truth = register[site]
        # The call is the first ':'-separated field of the genotype. A
        # separate name is used so the `imputed` path is not overwritten.
        call = v.genotype.split(":")[0]
        if "|" in call:
            # Phased call: sort the alleles and join with '/' so it can be
            # compared against the truth genotype.
            call = "/".join(sorted(call.split("|")))
        hit += 1
        if truth == call:
            concordant += 1
        else:
            print(row.strip(), "truth={0}".format(truth), file=sys.stderr)

    logging.debug("Total concordant: {0}".
                  format(percentage(concordant, hit)))
Ejemplo n.º 47
0
def filter(args):
    """
    %prog filter *.consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    # The docstring above is also the usage text handed to OptionParser.
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=2, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastafiles = args
    minsize = opts.minsize
    totalreads = totalassembled = 0
    fw = must_open(opts.outfile, "w")
    for i, fastafile in enumerate(fastafiles):
        f = Fasta(fastafile, lazy=True)
        # Per-file prefix (s000, s001, ...) keeps record ids unique when
        # several inputs are merged into one output.
        pf = "s{0:03d}".format(i)
        nreads = nsingletons = nclusters = 0
        for desc, rec in f.iterdescriptions_ordered():
            nclusters += 1
            if desc.startswith("singleton"):
                # A singleton counts as one cluster holding one read and is
                # never written out.
                nsingletons += 1
                nreads += 1
                continue
            # consensus_for_cluster_0 with 63 sequences
            name, w, size, seqs = desc.split()
            assert w == "with"
            size = int(size)
            nreads += size
            if size < minsize:
                continue
            # Drop the leading id token from the description before writing.
            rec.description = rec.description.split(None, 1)[-1]
            rec.id = pf + "_" + rec.id
            SeqIO.write(rec, fw, "fasta")
        logging.debug("Scanned {0} clusters with {1} reads ..".
                      format(nclusters, nreads))
        cclusters, creads = nclusters - nsingletons, nreads - nsingletons
        # A file with only singletons (or no records) has no multi-read
        # clusters; report an average of 0 rather than dividing by zero.
        avg = creads / cclusters if cclusters else 0
        logging.debug("Saved {0} clusters (min={1}) with {2} reads (avg:{3}) [{4}]".
                      format(cclusters, minsize, creads, avg, pf))
        totalreads += nreads
        totalassembled += nreads - nsingletons
    logging.debug("Total assembled: {0}".
                  format(percentage(totalassembled, totalreads)))
Ejemplo n.º 48
0
def summary(args):
    """
    %prog summary gffile fastafile

    Print summary stats, including:
    - Gene/Exon/Intron
    - Number
    - Average size (bp)
    - Median size (bp)
    - Total length (Mb)
    - % of genome
    - % GC
    """
    # The docstring above is also the usage text handed to OptionParser.
    parser = OptionParser(summary.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    gff_file, ref = args
    genome = Fasta(ref)
    index = make_index(gff_file)

    def fetch(chrom, start, stop):
        # Pull one interval out of the reference FASTA.
        return genome.sequence({'chr': chrom, 'start': start, 'stop': stop})

    # Sequences gathered per feature type; used for both sizes and % GC.
    geneseqs = []
    exonseqs = []
    intronseqs = []
    for gene in index.features_of_type("gene"):
        geneseqs.append(fetch(gene.chrom, gene.start, gene.stop))
        # A set collapses exons that share coordinates, so each distinct
        # interval under this gene is counted once.
        exons = list(set((child.chrom, child.start, child.stop)
                         for child in index.children(gene.id, 2)
                         if child.featuretype == "exon"))
        for interval in exons:
            exonseqs.append(fetch(*interval))
        # Introns are taken as whatever range_interleave returns for the exons.
        for interval in range_interleave(exons):
            intronseqs.append(fetch(*interval))

    report = {}  # keyed by (feature type, statistic name)
    labelled = (("Gene", geneseqs), ("Exon", exonseqs), ("Intron", intronseqs))
    for label, seqs in labelled:
        lengths = [len(x) for x in seqs]
        stats_ = SummaryStats(lengths, dtype="int")
        report[label, "Number"] = stats_.size
        report[label, "Average size (bp)"] = stats_.mean
        report[label, "Median size (bp)"] = stats_.median
        report[label, "Total length (Mb)"] = human_size(
            stats_.sum, precision=0, target="Mb")
        report[label, "% of genome"] = percentage(
            stats_.sum, genome.totalsize, precision=0, mode=-1)
        report[label, "% GC"] = gc(seqs)

    print(tabulate(report), file=sys.stderr)
Ejemplo n.º 49
0
Archivo: vcf.py Proyecto: xuanblo/jcvi
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    # The docstring above is also the usage text handed to OptionParser.
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    # Show usage on a wrong argument count instead of failing on the
    # unpack below with a bare ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row[0] == '#':
            continue
        # Third column is unpacked but unused; named so it does not shadow
        # the `id` builtin.
        seqid, pos, _vid, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        # Sites on sequences missing from the FASTA still count in `total`.
        if seqid not in genome:
            continue
        # `pos - 1` converts the 1-based VCF position to a 0-based index.
        true_ref = genome[seqid][pos - 1]
        if total % 100000 == 0:
            # Stream write instead of the Python 2 print statement.
            sys.stderr.write("{0} sites parsed\n".format(total))
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
Ejemplo n.º 50
0
Archivo: vcf.py Proyecto: zjwang6/jcvi
def validate(args):
    """
    %prog validate input.vcf genome.fasta

    Fasta validation of vcf file.
    """
    # The docstring above is also the usage text handed to OptionParser.
    import pyfasta

    p = OptionParser(validate.__doc__)
    p.add_option("--prefix", help="Add prefix to seqid")
    opts, args = p.parse_args(args)

    # Show usage on a wrong argument count instead of failing on the
    # unpack below with a bare ValueError.
    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, fastafile = args
    pf = opts.prefix
    genome = pyfasta.Fasta(fastafile, record_class=pyfasta.MemoryRecord)
    fp = must_open(vcffile)
    match_ref = match_alt = total = 0
    for row in fp:
        if row[0] == "#":
            continue
        # Third column is unpacked but unused; named so it does not shadow
        # the `id` builtin.
        seqid, pos, _vid, ref, alt = row.split()[:5]
        total += 1
        if pf:
            seqid = pf + seqid
        pos = int(pos)
        # Sites on sequences missing from the FASTA still count in `total`.
        if seqid not in genome:
            continue
        # `pos - 1` converts the 1-based VCF position to a 0-based index.
        true_ref = genome[seqid][pos - 1]
        if total % 100000 == 0:
            print(total, "sites parsed", file=sys.stderr)
        if ref == true_ref:
            match_ref += 1
        elif alt == true_ref:
            match_alt += 1

    logging.debug("Match REF: {}".format(percentage(match_ref, total)))
    logging.debug("Match ALT: {}".format(percentage(match_alt, total)))
Ejemplo n.º 51
0
def query_links(abed, bbed):
    """Report to stderr how many links of `abed` also occur in `bbed`.

    A link of `bbed` matches either as stored or reverse-complemented
    (endpoints swapped, orientations flipped). The links of `abed` that are
    not recovered are printed last.
    """
    abedlinks = abed.links
    bbedlinks = bbed.links

    # Orientation flip used for the reverse complement; '?' maps to '-'.
    flip = {"+": "-", "?": "-", "-": "+"}

    # Start from the links as stored, then add each one reverse-complemented.
    bxbedlinks = bbedlinks[:]
    for (left, left_strand), (right, right_strand) in bbedlinks:
        left_flipped = flip[left_strand]
        right_flipped = flip[right_strand]
        bxbedlinks.append(((right, right_flipped), (left, left_flipped)))

    atotal = len(abedlinks)
    print("Total links in {0}: {1}".format(abed.filename, atotal), file=sys.stderr)

    wanted = set(abedlinks)
    available = set(bxbedlinks)
    recovered = wanted & available
    print("Recovered {0}".format(percentage(len(recovered), atotal)), file=sys.stderr)
    print(wanted - available, file=sys.stderr)
Ejemplo n.º 52
0
def mitocompile(args):
    """
    %prog mitocompile *.vcf.gz

    Extract information about deletions in vcf file.
    """
    # The docstring above is also the usage text handed to OptionParser; the
    # command name in it previously read "mitcompile".
    from jcvi.formats.vcf import VcfLine

    p = OptionParser(mitocompile.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    vcfs = args
    print("\t".join("vcf samplekey depth seqid pos alt svlen pe sr".split()))
    for i, vcf in enumerate(vcfs):
        if (i + 1) % 100 == 0:
            logging.debug("Process `{}` [{}]".format(vcf, percentage(i + 1, len(vcfs))))
        # The depth file sits next to the VCF and shares its name stem.
        depthfile = vcf.replace(".sv.vcf.gz", ".depth")
        fp = must_open(depthfile)
        # next(fp) works on Python 2 and 3; the .next() method is
        # Python 2 only. The first line holds "<name> <depth>".
        chrm, depth = next(fp).split()
        depth = int(float(depth))
        samplekey = op.basename(vcf).split("_")[0]

        fp = must_open(vcf)
        for row in fp:
            if row[0] == "#":
                continue
            v = VcfLine(row)
            # INFO is a ';'-separated list of KEY=VALUE pairs. It is parsed
            # directly because urllib's parse_qsl no longer splits on ';'
            # in current Python. As before, flags without '=' and empty
            # values are dropped.
            info = {}
            for field in v.info.split(";"):
                key, eq, value = field.partition("=")
                if eq and value:
                    info[key] = value
            print(
                "\t".join(
                    str(x)
                    for x in (
                        vcf,
                        samplekey,
                        depth,
                        v.seqid,
                        v.pos,
                        v.alt,
                        info.get("SVLEN"),
                        info["PE"],
                        info["SR"],
                    )
                )
            )
Ejemplo n.º 53
0
def query_links(abed, bbed):
    """Report to stderr how many links of `abed` also occur in `bbed`.

    A link of `bbed` matches either as stored or reverse-complemented
    (endpoints swapped, orientations flipped). The links of `abed` that are
    not recovered are written last.
    """
    abedlinks = abed.links
    bbedlinks = bbed.links
    # Orientation flip used for the reverse complement; '?' maps to '-'.
    flip = {"+": "-", "?": "-", "-": "+"}
    # Reverse complement bbedlinks
    bxbedlinks = bbedlinks[:]
    for (a, ai), (b, bi) in bbedlinks:
        ai = flip[ai]
        bi = flip[bi]
        bxbedlinks.append(((b, bi), (a, ai)))

    # Stream writes instead of the Python 2 print statement, so the function
    # parses and behaves the same on Python 2 and 3.
    report = sys.stderr.write
    atotal = len(abedlinks)
    report("Total links in {0}: {1}\n".format(abed.filename, atotal))
    recovered = set(abedlinks) & set(bxbedlinks)
    report("Recovered {0}\n".format(percentage(len(recovered), atotal)))
    report(str(set(abedlinks) - set(bxbedlinks)) + "\n")
Ejemplo n.º 54
0
def range_depth(ranges, size, verbose=True):
    """
    Overlay ranges on a region of length `size` and summarize the depth
    (ploidy) of the resulting intervals.

    Args:
        ranges: iterable of (a, b) endpoint pairs; every endpoint must
            satisfy 0 <= endpoint < size. May be empty, in which case the
            whole region is reported at depth 0.
        size: length of the region the ranges are laid over.
        verbose: when true, write one "Depth d: ..." line per observed depth
            to stderr.

    Returns:
        A tuple (depthstore, depthdetails). `depthstore` maps each depth to
        the total length found at that depth; `depthdetails` is a list of
        (start, end, depth) segments running from 0 to `size`.

    Raises:
        AssertionError: if an endpoint falls outside [0, size) or the depth
            bookkeeping does not balance.
    """
    from jcvi.utils.iter import pairwise
    from jcvi.utils.cbook import percentage

    # Make endpoints
    endpoints = []
    for a, b in ranges:
        endpoints.append((a, LEFT))
        endpoints.append((b, RIGHT))
    endpoints.sort()

    depth = 0
    depthstore = defaultdict(int)

    if endpoints:
        # The list is sorted, so the extremes are its first and last items.
        vstart, vend = endpoints[0][0], endpoints[-1][0]

        assert 0 <= vstart < size
        assert 0 <= vend < size

        # Leading stretch before the first range starts
        depthstore[depth] += vstart
        depthdetails = [(0, vstart, depth)]

        # Walk consecutive endpoints; the tag of the left endpoint decides
        # whether the segment that follows it is one deeper or shallower.
        for (a, atag), (b, btag) in pairwise(endpoints):
            if atag == LEFT:
                depth += 1
            elif atag == RIGHT:
                depth -= 1
            depthstore[depth] += b - a
            depthdetails.append((a, b, depth))

        # The last endpoint closes the final open range.
        assert btag == RIGHT
        depth -= 1

        assert depth == 0
        # Trailing stretch after the last range ends
        depthstore[depth] += size - vend
        depthdetails.append((vend, size, depth))
    else:
        # No ranges at all: the entire region is uncovered.
        depthstore[depth] += size
        depthdetails = [(0, size, depth)]

    assert sum(depthstore.values()) == size
    if verbose:
        # sys.stderr.write behaves the same on Python 2 and Python 3.
        for level, count in sorted(depthstore.items()):
            sys.stderr.write(
                "Depth {0}: {1}\n".format(level, percentage(count, size)))

    return depthstore, depthdetails
Ejemplo n.º 55
0
Archivo: bed.py Proyecto: radaniba/jcvi
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    # NOTE: the docstring above doubles as the command-line usage text
    # (it is handed to OptionParser), so it is kept verbatim.
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v",
                 dest="inverse",
                 default=False,
                 action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = opts.inverse

    # Load the inputs before opening the output, so a bad input path does
    # not leave a truncated output file behind.
    ids = SetFile(idsfile)
    if opts.strip_names:
        ids = set(gene_name(x) for x in ids)
    bed = Bed(bedfile)

    ntotal = nkeep = 0
    fw = must_open(opts.outfile, "w")
    try:
        for b in bed:
            ntotal += 1
            # Keep features whose accession is listed, or the complement
            # of that set when -v was given.
            keep = b.accn in ids
            if inverse:
                keep = not keep

            if keep:
                nkeep += 1
                # write() works on both Python 2 and 3, unlike the
                # Python 2-only ``print >> fw`` statement.
                fw.write(str(b) + "\n")
    finally:
        fw.close()

    logging.debug("Stats: {0} features kept.".\
                    format(percentage(nkeep, ntotal)))
Ejemplo n.º 56
0
Archivo: bed.py Proyecto: yangjl/jcvi
def some(args):
    """
    %prog some bedfile idsfile > newbedfile

    Retrieve a subset of bed features given a list of ids.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage
    # message, so its text is left untouched.
    from jcvi.formats.base import SetFile
    from jcvi.utils.cbook import gene_name

    p = OptionParser(some.__doc__)
    p.add_option("-v", dest="inverse", default=False, action="store_true",
                 help="Get the inverse, like grep -v [default: %default]")
    p.set_outfile()
    p.set_stripnames()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bedfile, idsfile = args
    inverse = opts.inverse
    ostrip = opts.strip_names

    # Read the id list and bed file first; the output handle is opened only
    # once both inputs have loaded successfully.
    ids = SetFile(idsfile)
    if ostrip:
        ids = set(gene_name(x) for x in ids)
    bed = Bed(bedfile)

    ntotal = nkeep = 0
    fw = must_open(opts.outfile, "w")
    try:
        for b in bed:
            ntotal += 1
            keep = b.accn in ids
            if inverse:
                # -v: select the features that are NOT in the id list
                keep = not keep

            if keep:
                nkeep += 1
                # Version-neutral replacement for ``print >> fw, b``
                fw.write(str(b) + "\n")
    finally:
        # Always release the output handle, even if iteration fails.
        fw.close()

    logging.debug("Stats: {0} features kept.".\
                    format(percentage(nkeep, ntotal)))
Ejemplo n.º 57
0
def gaps(args):
    """
    %prog gaps idsfile fractionationfile gapsbed

    Check gene locations against gaps. `idsfile` contains a list of IDs to query
    into `fractionationfile` in order to get expected locations.
    """
    # NOTE: the docstring above is used as the OptionParser usage text and
    # is therefore kept verbatim.
    from jcvi.formats.base import DictFile
    from jcvi.apps.base import popen
    from jcvi.utils.cbook import percentage

    p = OptionParser(gaps.__doc__)
    p.add_option("--bdist", default=0, type="int",
                 help="Base pair distance [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idsfile, frfile, gapsbed = args
    bdist = opts.bdist
    d = DictFile(frfile, keypos=1, valuepos=2)
    bedfile = idsfile + ".bed"

    # Write one BED line per id; both handles are closed even if a lookup
    # fails part-way through.
    total = 0
    with open(idsfile) as fp, open(bedfile, "w") as fw:
        for row in fp:
            accn = row.strip()
            if not accn:
                # Ignore blank lines (e.g. a trailing newline in the list)
                continue
            hit = d[accn]
            tag, pos = get_tag(hit, None)
            seqid, start, end = pos
            # Pad the interval by bdist on both sides, clamping at 1
            start, end = max(start - bdist, 1), end + bdist
            # BED start is written as start - 1
            fields = (seqid, start - 1, end, accn)
            fw.write("\t".join(str(x) for x in fields) + "\n")
            total += 1

    # Count the intervals that do NOT overlap any gap (-v), then derive the
    # number that do.
    cmd = "intersectBed -a {0} -b {1} -v | wc -l".format(bedfile, gapsbed)
    not_in_gaps = int(popen(cmd).read())
    in_gaps = total - not_in_gaps
    # sys.stderr.write works on both Python 2 and Python 3.
    sys.stderr.write(
        "Ids in gaps: {0}\n".format(percentage(in_gaps, total)))