Example #1
0
def epoch(args):
    """
    %prog epoch

    Illustrate the methods used in Maggie's epoch paper, in particular, how to
    classify S/G/F/FB/FN for the genes.
    """
    # The docstring above is handed to OptionParser as the usage text.
    # `args` is the list of command-line tokens for this sub-command; parse
    # those (not sys.argv) and use this function's own docstring for usage.
    p = OptionParser(epoch.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (6, 4))
    root = fig.add_axes([0, 0, 1, 1])

    # Separators: one horizontal rule at mid-height, two vertical rules in the
    # upper half (three cells) and three in the lower half.
    linestyle = dict(lw=2, color="b", alpha=.2, zorder=2)
    root.plot((0, 1), (.5, .5), "--", **linestyle)
    for x in (1. / 3, 2. / 3):
        root.plot((x, x), (.5, 1), "--", **linestyle)
    for x in (1. / 6, 3. / 6, 5. / 6):
        root.plot((x, x), (0, .5), "--", **linestyle)

    # Diagrams: (x, y, class label, caption); three on top, two below
    diagrams = (
        (1. / 6, 3. / 4, "S", "syntenic"),
        (3. / 6, 3. / 4, "F", "missing, with both flankers"),
        (5. / 6, 3. / 4, "G", "missing, with one flanker"),
        (2. / 6, 1. / 4, "FB", "has non-coding matches"),
        (4. / 6, 1. / 4, "FN", "syntenic region has gap"),
    )
    for x, y, label, caption in diagrams:
        plot_diagram(root, x, y, label, caption)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # NOTE(review): fname() is presumably derived from the calling function,
    # so it is kept as a direct call from this body -- confirm.
    figname = fname() + ".pdf"
    savefig(figname, dpi=300)
Example #2
0
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from web page.
    """
    # args: command-line tokens; exactly one positional (the page URL).
    # Prints one absolute URL per matching tag to stdout.
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    # --img switches from <a href=...> to <img src=...>
    tag, attr = ("img", "src") if opts.img else ("a", "href")

    htmlfile = download(url)
    with open(htmlfile) as fp:  # do not leak the handle
        page = fp.read()
    soup = BeautifulSoup(page)

    for node in soup.findAll(tag):
        target = node.get(attr)
        if target is None:
            # Tag without the attribute (e.g. <a name="...">): joining None
            # would just echo the page URL itself, which is not a link.
            continue
        print(urljoin(url, target))
Example #3
0
File: age.py Project: xuanblo/jcvi
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    # args: command-line tokens; one or more folders. For each folder the
    # first `*-traits.json` match is loaded, and `skin_rgb` / `eye_rgb` are
    # added from the L/A/B values. Everything is rendered into `report.html`.
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        with open(targets[0]) as fp:
            js = json.load(fp)
        for trait in ("skin", "eye"):
            lab = js["traits"][trait + "-color"]
            js[trait + "_rgb"] = make_rgb(lab["L"], lab["A"], lab["B"])
        samples.append(js)

    reportfile = "report.html"
    template = Template(traits_template)
    # write() instead of the Python 2-only `print >>` statement
    with open(reportfile, "w") as fw:
        fw.write(template.render(samples=samples) + "\n")
    logging.debug("Report written to `{}`".format(reportfile))
Example #4
0
def bed(args):
    """
    %prog bed genes.ids

    Get gene bed from phytozome. `genes.ids` contains the list of gene you want
    to pull from Phytozome. Write output to .bed file.
    """
    # args: command-line tokens; exactly one positional (the ids file).
    # Output goes to the ids file name with its last extension replaced
    # by `.bed`.
    p = OptionParser(bed.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    with open(idsfile) as fp:
        ids = set(x.strip() for x in fp)
    data = get_bed_from_phytozome(list(ids))

    pf = idsfile.rsplit(".", 1)[0]
    bedfile = pf + ".bed"
    # Count what is actually written: the enumerate index was unbound when
    # `data` was empty and also counted the blank rows that are skipped.
    nrecords = 0
    with open(bedfile, "w") as fw:
        for row in data:
            row = row.strip()
            if row == "":
                continue

            print(row, file=fw)
            nrecords += 1

    logging.debug("A total of {0} records written to `{1}`.".format(nrecords, bedfile))
Example #5
0
File: age.py Project: xuanblo/jcvi
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    # args: command-line tokens; one or more sample folders. Each folder
    # contributes one row, combining the first row of
    # `telomeres*/tel_lengths.txt` with the keys of the first
    # `ccn*/*.ccn.json`. Rows are written tab-separated to opts.outfile.
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # `.ix` was removed from pandas; the first row by position is what
        # is wanted here.
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        with open(filename) as fp:
            js = json.load(fp)
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Example #6
0
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against Genbank file and determines
    whether or not to flip the sequence. This is useful before updates of the
    sequences to make sure the same orientation is used.
    """
    # args: command-line tokens; exactly one positional (the FASTA file).
    # Output goes to the input name with its last extension replaced by
    # `.flipped.fasta`.
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    tmpfasta = "a.fasta"  # scratch file holding the single current record
    with open(outfastafile, "w") as fo:
        f = Fasta(fastafile, lazy=True)
        for name, rec in f.iteritems_ordered():
            with open(tmpfasta, "w") as fw:
                SeqIO.write([rec], fw, "fasta")

            try:
                o = overlap([tmpfasta, name])
                if o.orientation == '-':
                    rec.seq = rec.seq.reverse_complement()

                SeqIO.write([rec], fo, "fasta")
            finally:
                # Never leave the scratch file behind, even if overlap() fails
                os.remove(tmpfasta)
Example #7
0
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # args: command-line tokens; two positionals (pairs file, folder holding
    # `<id>.fa` files). Prints one `goldenpath overlap` command per pair to
    # stdout; the commands write into an `overlaps/` folder created here.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    cmds = []
    with open(pairsfile) as fp:
        mkdir("overlaps")
        for row in fp:
            atoms = row.split()
            if not atoms:
                # tolerate blank lines (e.g. a trailing newline)
                continue
            a, b = atoms[:2]
            oa = op.join(outdir, a + ".fa")
            ob = op.join(outdir, b + ".fa")
            cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
            cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
            cmds.append(cmd)

    # single-argument print() behaves the same on Python 2 and 3
    print("\n".join(cmds))
Example #8
0
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    # args: command-line tokens; one or more GFF file names. For every entry
    # of the module-level `metrics`, reads `<metric>/<prefix>.txt` (one
    # integer per line) for each GFF and prints a table to stderr.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for gff_file in gff_files:
            pf = op.basename(gff_file).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            with open(numberfile) as fp:
                ar = [int(row.strip()) for row in fp]
            # Keyed by (prefix, statistic name); avoids shadowing the builtin
            # `sum` and the zip(*...) unpack that fails on empty stats.
            for stat, val in SummaryStats(ar).todict().items():
                table[(pf, stat)] = val

        # write() instead of the Python 2-only `print >>` statement
        sys.stderr.write("{0}\n".format(tabulate(table)))
Example #9
0
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (i.e. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(not p.print_help())

    # One data file per GFF inside each metric folder: <metric>/<prefix>.txt
    prefixes = [op.basename(gff).split(".")[0] for gff in args]

    # Per-metric plot settings, paired positionally with the module-level
    # `metrics` (a note left in the original lists them as Exon_Length,
    # Intron_Length, Gene_Length, Exon_Count).
    settings = zip(
        metrics,
        ("red", "green", "blue", "black"),  # fill color
        (1000, 1000, 4000, 20),              # upper x limit
        ("bp", "bp", "bp", "number"),        # x-axis label
    )
    for metric, fill, upper, unit in settings:
        logging.debug("Parsing files in `{0}`..".format(metric))
        numberfiles = [op.join(metric, pf + ".txt") for pf in prefixes]

        histogram_multiple(numberfiles, 0, upper, unit, metric,
                           bins=opts.bins, facet=True, fill=fill,
                           prefix=metric + ".")
Example #10
0
File: ca.py Project: arvin580/jcvi
def unitigs(args):
    """
    %prog unitigs best.edges

    Reads Celera Assembler's "best.edges" and extract all unitigs.
    """
    # args: command-line tokens; exactly one positional (best.edges file).
    # Prints one unitig per line to stdout as `|`-joined read ids.
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    # G is used as a mapping (items()/get()) -- presumably read -> best
    # partner; verify against read_graph().
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    # items() exists on Python 2 and 3; iteritems() is Python 2 only
    for k, v in G.items():
        # keep only reciprocal pairs
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    # connected_component_subgraphs() was removed from networkx; building the
    # subgraph per component is the portable spelling.
    for nodes in nx.connected_components(H):
        h = H.subgraph(nodes)
        # only components with exactly two degree-1 end points are reported
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
Example #11
0
File: ca.py Project: arvin580/jcvi
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    # args: command-line tokens; exactly one positional, the action to run
    # over the `xml*` files found in the current folder.
    p = OptionParser(tracedb.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        # print_help() returns None; `not None` makes a usage error exit with
        # a failing status instead of 0.
        sys.exit(not p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg"), \
        "action must be one of xml, lib, frg (got `{0}`)".format(action)

    CMD = "tracedb-to-frg.pl"
    xmls = glob("xml*")

    if action == "xml":
        # one background job per file, output discarded
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        # a single job over all files
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        # one background job per file
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
Example #12
0
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    # With --prefix the representatives come from the prefix-aware iterator
    if opts.prefix:
        reps = cf.iter_reps_prefix(prefix=opts.prefix)
    else:
        reps = cf.iter_reps()
    reads = list(reps)

    # Output sits next to the input: `.clstr` -> `.ids`
    idsfile = clstrfile.replace(".clstr", ".ids")
    with open(idsfile, "w") as fw:
        for i, name in reads:
            print("\t".join(map(str, (i, name))), file=fw)

        logging.debug("A total of {0} unique reads written to `{1}`.".\
                format(len(reads), idsfile))

    return idsfile
Example #13
0
def csv(args):
    """
    %prog csv excelfile

    Convert EXCEL to csv file.
    """
    from xlrd import open_workbook

    # args: command-line tokens; exactly one positional (the workbook).
    # All sheets are written, one after another, into a single file named
    # after the workbook with its last extension replaced by `.csv`.
    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    # `with` closes the output; write() replaces the Python 2-only
    # `print >>` statement
    with open(csvfile, "w") as fw:
        for s in wb.sheets():
            sys.stderr.write("Sheet: {0}\n".format(s.name))
            for row in range(s.nrows):
                values = [s.cell(row, col).value for col in range(s.ncols)]
                fw.write(sep.join(str(x) for x in values) + "\n")
Example #14
0
def passthrough(args):
    """
    %prog passthrough chrY.vcf chrY.new.vcf

    Pass through Y and MT vcf.
    """
    p = OptionParser(passthrough.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, newvcffile = args
    # Position in this list selects which of the three probabilities is 1
    genotypes = ["0/0", "0/1", "1/1"]
    fp = open(vcffile)
    fw = open(newvcffile, "w")
    for row in fp:
        if row.startswith("#"):
            # header lines are copied through
            print(row.strip(), file=fw)
            continue

        v = VcfLine(row)
        v.filter = "PASS"
        v.format = "GT:GP"
        # index() raises ValueError for a genotype outside the list
        called = genotypes.index(v.genotype)
        probs = [int(k == called) for k in range(len(genotypes))]
        gp = ",".join("{0:.3f}".format(x) for x in probs)
        # phased separator, then the one-hot probabilities
        v.genotype = v.genotype.replace("/", "|") + ":{0}".format(gp)
        print(v, file=fw)
    fw.close()
Example #15
0
def agp(args):
    """
    %prog agp <fastafile|sizesfile>

    Convert the sizes file to a trivial AGP file.
    """
    from jcvi.formats.agp import OO

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sizes = Sizes(args[0])
    # The output name is based on the file name Sizes reports
    agpfile = "{0}.agp".format(sizes.filename.rsplit(".", 1)[0])
    with open(agpfile, "w") as fw:
        layout = OO()  # Without a filename
        # every contig is added under its own name
        for ctg, size in sizes.iter_sizes():
            layout.add(ctg, ctg, size)

        layout.write_AGP(fw)

    logging.debug("AGP file written to `{0}`.".format(agpfile))
    return agpfile
Example #16
0
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    # args: command-line tokens; one or more anchor files. Every anchor pair
    # joins its two members into one group. Each group is written as one
    # comma-separated, sorted line to opts.outfile, which is returned.
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, _ in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    # write() instead of the Python 2-only `print >>` statement
    for g in groups:
        fw.write(",".join(sorted(g)) + "\n")
    fw.close()

    return outfile
Example #17
0
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, seqid, window = args
    window = int(window)
    megabase = 1000000
    # 1-based window number -> one-megabase BED interval
    start, end = (window - 1) * megabase, window * megabase

    bedfile = "sample.bed"
    region = Bed()
    region.add("\t".join(str(x) for x in (seqid, start, end)))
    region.print_to_file(bedfile)

    # Names (4th column) of the mappings that fall inside the window
    idsfile = "query.ids"
    sh("intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile),
       outfile=idsfile)

    # Subject: the window cut from the MTR fasta; query: the selected records
    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    sh("faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta))

    sh("nucmer {0} {1}".format(sfasta, qfasta))

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(seqid, window))
Example #18
0
File: vcf.py Project: Hensonmw/jcvi
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert impute2 output to vcf file. Imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    # args: command-line tokens; three positionals (impute2 file, fasta file,
    # chromosome name). Prints the VCF header stanza and one record per
    # distinct position to stdout.
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    impute2file, fastafile, seqid = args
    fasta = Fasta(fastafile)
    # single-argument print() behaves the same on Python 2 and 3
    print(get_vcfstanza(fastafile, fasta))
    seen = set()
    with open(impute2file) as fp:
        for row in fp:
            snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
            pos = int(pos)
            if pos in seen:
                # only the first record per position is kept
                continue
            seen.add(pos)
            # genotype with the highest probability
            code = max((float(aa), "0/0"), (float(ab), "0/1"), (float(bb), "1/1"))[-1]
            # first column equal to the chromosome name -> "PR", else "IM"
            tag = "PR" if snp_id == seqid else "IM"
            print("\t".join(str(x) for x in
                    (seqid, pos, rsid, ref, alt, ".", ".", tag,
                    "GT:GP", code + ":" + ",".join((aa, ab, bb)))))
Example #19
0
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    # args: command-line tokens; exactly one positional (the FASTA file).
    # Step 1 sorts the reads by length, step 2 clusters them at the requested
    # identity; each step is skipped when need_update() says so.
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    # [0] instead of a two-target unpack: a name without an extension no
    # longer raises ValueError, and the unused suffix is not bound.
    sortedfastafile = fastafile.rsplit(".", 1)[0] + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
Example #20
0
File: vcf.py Project: Hensonmw/jcvi
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    # args: command-line tokens; exactly one positional (the VCF file).
    # Header lines are echoed; for runs of consecutive records sharing a
    # position, the one with the highest INFO `R2` value is printed.
    # NOTE(review): the usage text says "first entry" but the code keeps the
    # best-R2 entry; grouping is by position only -- confirm both are
    # intended.
    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    def r2(v):
        # INFO is `;`-separated key=value pairs. parse_qs no longer splits on
        # `;` in current Python (and lives in a different module on Python
        # 2), so the field is parsed directly.
        for field in v.info.split(";"):
            key, eq, value = field.partition("=")
            if key == "R2" and eq and value:
                return float(value)
        raise KeyError("R2")

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print(row.strip())
            continue
        data.append(VcfLine(row))

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            # singletons are printed without requiring an R2 field
            print(vv[0])
            continue
        print(max(vv, key=r2))
Example #21
0
File: vcf.py Project: Hensonmw/jcvi
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample subset of vcf file.
    """
    from random import random

    # args: command-line tokens; two positionals (VCF file, fraction to keep).
    # Each record goes to `<prefix>.kept.vcf` with probability `ratio`,
    # otherwise to `<prefix>.withheld.vcf`. Header lines are written to the
    # kept file only.
    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    nkept = nwithheld = 0
    # `with` closes all three handles; write() replaces the Python 2-only
    # `print >>` statement
    with open(vcffile) as fp, open(kept, "w") as fwk, \
            open(withheld, "w") as fww:
        for row in fp:
            line = row.strip() + "\n"
            if row[0] == '#':
                fwk.write(line)
                continue
            if random() < ratio:
                nkept += 1
                fwk.write(line)
            else:
                nwithheld += 1
                fww.write(line)
    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
Example #22
0
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    # args: command-line tokens; three positionals. One single-block chain is
    # written per non-gap AGP line, to the AGP file name with its last
    # extension replaced by `.chain`.
    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    chain_id = 0  # running chain number (avoids shadowing the builtin `id`)
    # `with` closes the output even on error; write() replaces the Python
    # 2-only `print >>` statement
    with open(chainfile, "w") as fw:
        for a in agp:
            if a.is_gap:
                continue

            # Target = component; the start is shifted down by one
            tName = a.component_id
            tSize = componentsizes[tName]
            tStart = a.component_beg - 1
            tEnd = a.component_end

            # Query = object; minus-strand coordinates are mirrored against
            # the object size before the start is shifted down by one
            qName = a.object
            qSize = objectsizes[qName]
            qStrand = "-" if a.orientation == "-" else "+"
            qStart = a.object_beg
            qEnd = a.object_end
            if qStrand == '-':
                qStart, qEnd = qSize - qEnd + 1, qSize - qStart + 1
            qStart -= 1

            chain_id += 1
            headerline = "\t".join(str(x) for x in (
                 chain, score, tName, tSize, tStrand, tStart,
                 tEnd, qName, qSize, qStrand, qStart, qEnd, chain_id
            ))
            # header, the single block size, then a blank separator line
            fw.write(headerline + "\n")
            fw.write("{0}\n".format(a.object_span))
            fw.write("\n")

    logging.debug("File written to `{0}`.".format(chainfile))
Example #23
0
def genestatus(args):
    """
    %prog genestatus diploid.gff3.exon.ids

    Tag genes based on translation from GMAP models, using fasta.translate()
    --ids.
    """
    # args: command-line tokens; exactly one positional (the ids file).
    # Prints `gene<TAB>tag` per gene to stdout. A gene is the part of the
    # first field before the first "."; its tag is the best among its
    # consecutive entries: complete > partial > pseudogene.
    p = OptionParser(genestatus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    data = get_tags(idsfile)
    key = lambda x: x[0].split(".")[0]
    for gene, cc in groupby(data, key=key):
        tags = [x[-1] for x in cc]
        if "complete" in tags:
            tag = "complete"
        elif "partial" in tags:
            tag = "partial"
        else:
            tag = "pseudogene"
        # single-argument print() behaves the same on Python 2 and 3
        print("\t".join((gene, tag)))
Example #24
0
def pasteprepare(args):
    """
    %prog pasteprepare bacs.fasta

    Prepare sequences for paste.
    """
    # args: command-line tokens; exactly one positional (the FASTA file).
    # Writes `<prefix>.ext.bed` with a left ("L") and right ("R") flank
    # interval per sequence, then extracts those intervals.
    p = OptionParser(pasteprepare.__doc__)
    p.add_option("--flank", default=5000, type="int",
                 help="Get the seq of size on two ends [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    goodfasta, = args
    flank = opts.flank
    pf = goodfasta.rsplit(".", 1)[0]
    extbed = pf + ".ext.bed"

    sizes = Sizes(goodfasta)
    # `with` closes the file even on error; write() replaces the Python
    # 2-only `print >>` statement
    with open(extbed, "w") as fw:
        for bac, size in sizes.iter_sizes():
            # flanks are clipped to the sequence length
            left = (bac, 0, min(flank, size), bac + "L")
            right = (bac, max(size - flank, 0), size, bac + "R")
            for interval in (left, right):
                fw.write("\t".join(str(x) for x in interval) + "\n")

    fastaFromBed(extbed, goodfasta, name=True)
Example #25
0
def spades(args):
    """
    %prog spades folder

    Run automated SPADES.
    """
    from jcvi.formats.fastq import readlen

    # args: command-line tokens; exactly one positional (the project folder).
    # Prints one `spades.py` command per read pair found in the folder.
    p = OptionParser(spades.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # also covers more than one argument, which used to crash on unpack
        sys.exit(not p.print_help())

    folder, = args
    for pair, pf in iter_project(folder, 2):
        rl = readlen([pair[0], "--silent"])

        # <http://spades.bioinf.spbau.ru/release3.1.0/manual.html#sec3.4>
        # The longer-read test must come first, otherwise it can never match.
        kmers = None
        if rl >= 250:
            kmers = "21,33,55,77,99,127"
        elif rl >= 150:
            kmers = "21,33,55,77"

        cmd = "spades.py"
        if kmers:
            cmd += " -k {0}".format(kmers)
        cmd += " --careful"
        cmd += " --pe1-1 {0} --pe1-2 {1}".format(*pair)
        cmd += " -o {0}_spades".format(pf)
        # single-argument print() behaves the same on Python 2 and 3
        print(cmd)
Example #26
0
def mergecsv(args):
    """
    %prog mergecsv *.csv

    Combine CSV into binary array.
    """
    # args: command-line tokens; one or more CSV files of comma-separated
    # integers. Values are read as consecutive pairs (x1, x2) and folded into
    # x1 * 1000 + x2, with negative results set to -1. All arrays are
    # concatenated into `data.bin` (int32) and the sample keys (file name up
    # to the first ".") are listed in `samples`.
    p = OptionParser(mergecsv.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    csvfiles = args
    arrays = []
    samplekeys = []
    for csvfile in csvfiles:
        samplekey = op.basename(csvfile).split(".")[0]
        a = np.fromfile(csvfile, sep=",", dtype=np.int32)
        a = a[::2] * 1000 + a[1::2]
        a[a < 0] = -1
        arrays.append(a)
        samplekeys.append(samplekey)
        # write() instead of the Python 2-only `print >>` statement
        sys.stderr.write("{0} {1}\n".format(samplekey, a))
    sys.stderr.write("Merging\n")
    b = np.concatenate(arrays)
    b.tofile("data.bin")

    with open("samples", "w") as fw:
        fw.write("\n".join(samplekeys) + "\n")
Example #27
0
def merge(args):
    """
    %prog merge folder1 ...

    Consolidate split contents in the folders. The folders can be generated by
    the split() process and several samples may be in separate fastq files. This
    program merges them.
    """
    p = OptionParser(merge.__doc__)
    p.set_outdir(outdir="outdir")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    outdir = opts.outdir
    mkdir(outdir)

    def sample_of(path):
        # sample name = file name up to the first "."
        return op.basename(path).split(".")[0]

    # Collect every `*.*.fastq` from all folders; sorting by sample makes the
    # files of one sample adjacent for groupby
    fastqs = sorted(
        flatten(glob("{0}/*.*.fastq".format(x)) for x in args),
        key=sample_of)
    for sample, members in groupby(fastqs, key=sample_of):
        target = op.join(outdir, "{0}.fastq".format(sample))
        FileMerger(list(members), outfile=target).merge(checkexists=True)
Example #28
0
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    # args: command-line tokens; exactly one positional (the simple file).
    # The first line is skipped; the remaining lines are grouped by their
    # first column, and each group must hold exactly two lines (A then B)
    # whose fifth column is an integer span. Totals go to stderr.
    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    with open(simplefile) as fp:
        data = [x.split() for x in fp]
    spans = []
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        spans.append((int(a[4]), int(b[4])))
    # Comprehensions instead of zip(*spans): no unpack error with zero blocks
    aspans = [a for a, b in spans]
    bspans = [b for a, b in spans]
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    # write() instead of the Python 2-only `print >>` statement
    report = (
        "For a total of {0} blocks:".format(len(dspans)),
        "Sum of A: {0}".format(sum(aspans)),
        "Sum of B: {0}".format(sum(bspans)),
        "Sum of Delta: {0} ({1})".format(sum(dspans), s),
    )
    for line in report:
        sys.stderr.write(line + "\n")
Example #29
0
def liftover(args):
    """
    %prog liftover lobstr_v3.0.2_hg38_ref.bed hg38.upper.fa

    LiftOver CODIS/Y-STR markers.
    """
    # args: command-line tokens; two positionals (reference BED, genome
    # FASTA). For each BED line the motif and counts are recomputed from the
    # genome sequence; the edited lines are printed to stdout in natural sort
    # order of (seqid, start).
    p = OptionParser(liftover.__doc__)
    p.add_option("--checkvalid", default=False, action="store_true",
                help="Check minscore, period and length")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    refbed, fastafile = args
    genome = pyfasta.Fasta(fastafile)
    edits = []
    with open(refbed) as fp:
        for i, row in enumerate(fp):
            s = STRLine(row)
            seq = genome[s.seqid][s.start - 1: s.end].upper()
            s.motif = get_motif(seq, len(s.motif))
            s.fix_counts(seq)
            if opts.checkvalid and not s.is_valid():
                continue
            edits.append(s)
            # progress note; only reached for lines that were kept
            if i % 10000 == 0:
                sys.stderr.write("{0} lines read\n".format(i))

    edits = natsorted(edits, key=lambda x: (x.seqid, x.start))
    for e in edits:
        # single-argument print() behaves the same on Python 2 and 3
        print(str(e))
Example #30
0
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    # args: command-line tokens; three positionals (chain file, old FASTA,
    # new FASTA). Reports to stderr the chain count, the ungapped/dt/dq
    # totals, and how much of each FASTA the ungapped total covers.
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    def report(line):
        # write() instead of the Python 2-only `print >>` statement
        sys.stderr.write(line + "\n")

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    report("File `{0}` contains {1} chains.".format(chainfile, len(chain)))
    report("ungapped={0} dt={1} dq={2}".
           format(human_size(ungapped), human_size(dt), human_size(dq)))

    # Only the first value returned by fsummary is used as the denominator
    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    report("Old fasta (`{0}`) mapped: {1}".
           format(oldfasta, percentage(ungapped, oldreal)))

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    report("New fasta (`{0}`) mapped: {1}".
           format(newfasta, percentage(ungapped, newreal)))
Example #31
0
def run(args):
    """
    %prog run command ::: file1 file2

    Parallelize a set of commands on grid. The syntax is modeled after GNU
    parallel <http://www.gnu.org/s/parallel/man.html#options>

    {}   - input line
    {.}  - input line without extension
    {_}  - input line first part
    {/}  - basename of input line
    {/.} - basename of input line without extension
    {/_} - basename of input line first part
    {#}  - sequence number of job to run
    :::  - Use arguments from the command line as input source instead of stdin
    (standard input).

    If file name is `t/example.tar.gz`, then,
    {} is "t/example.tar.gz", {.} is "t/example.tar", {_} is "t/example"
    {/} is "example.tar.gz", {/.} is "example.tar", {/_} is "example"

    A few examples:
    ls -1 *.fastq | %prog run process {} {.}.pdf  # use stdin
    %prog run process {} {.}.pdf ::: *fastq  # use :::
    %prog run "zcat {} > {.}" ::: *.gz  # quote redirection
    %prog run < commands.list  # run a list of commands
    """
    p = OptionParser(run.__doc__)
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    # NOTE(review): the last usage example passes no positional arguments,
    # which takes this exit before stdin is ever read -- confirm intent.
    if len(args) == 0:
        sys.exit(not p.print_help())

    # Input source: everything after ":::" when present, otherwise piped
    # stdin, otherwise a single empty "input line"
    sep = ":::"
    if sep in args:
        sepidx = args.index(sep)
        filenames = args[sepidx + 1:] or [""]
        args = args[:sepidx]
    elif sys.stdin.isatty():
        filenames = [""]
    else:
        filenames = sys.stdin

    cmd = " ".join(args)

    cmds = [] if filenames else [(cmd, None)]
    for i, filename in enumerate(filenames):
        filename = filename.strip()
        prefix, basename = op.split(filename)
        basefirstname = basename.split(".")[0]

        # "{}" is substituted only when the template has a brace at all;
        # otherwise the input line is appended to the command
        if "{" in cmd:
            ncmd = cmd.replace("{}", filename)
        else:
            ncmd = cmd + " " + filename

        # Remaining placeholders, applied in this fixed order
        substitutions = (
            ("{.}", filename.rsplit(".", 1)[0]),
            ("{_}", op.join(prefix, basefirstname)),
            ("{/}", basename),
            ("{/.}", basename.rsplit(".", 1)[0]),
            ("{/_}", basefirstname),
            ("{#}", str(i)),
        )
        for placeholder, value in substitutions:
            ncmd = ncmd.replace(placeholder, value)

        # Text after the first ">" becomes the job's output file
        outfile = None
        if ">" in ncmd:
            ncmd, outfile = [x.strip() for x in ncmd.split(">", 1)]

        cmds.append((ncmd.strip(), outfile))

    for ncmd, outfile in cmds:
        GridProcess(ncmd, outfile=outfile, grid_opts=opts).start()
Example #32
0
def optimize(args):
    """
    %prog optimize test.clm

    Optimize the contig order and orientation, based on CLM file.
    """
    # args: command-line tokens; exactly one positional (the CLM file).
    # Progress is appended to the tour file (opts.outfile, or the CLM name
    # with its last extension replaced by `.tour`): the initial tour, then
    # whatever the ordering and orientation steps write.
    p = OptionParser(optimize.__doc__)
    p.add_option("--skiprecover",
                 default=False,
                 action="store_true",
                 help="Do not import 'recover' contigs")
    p.add_option("--startover",
                 default=False,
                 action="store_true",
                 help="Do not resume from existing tour file")
    p.add_option("--skipGA",
                 default=False,
                 action="store_true",
                 help="Skip GA step")
    p.set_outfile(outfile=None)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clmfile, = args
    startover = opts.startover
    runGA = not opts.skipGA
    cpus = opts.cpus

    # Load contact map
    clm = CLMFile(clmfile, skiprecover=opts.skiprecover)

    tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour"
    # --startover only means "do not resume": pass None to activate() but
    # keep the real path, since the tour file is still opened for writing.
    tour = clm.activate(tourfile=None if startover else tourfile)

    with open(tourfile, "w") as fwtour:
        # Store INIT tour
        print_tour(fwtour,
                   clm.tour,
                   "INIT",
                   clm.active_contigs,
                   clm.oo,
                   signs=clm.signs)

        if runGA:
            for phase in range(1, 3):
                tour = optimize_ordering(fwtour, clm, phase, cpus)
                tour = clm.prune_tour(tour, cpus)

        # Flip orientations until a round reports REJECT for both tags
        phase = 1
        while True:
            tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus)
            if tag1 == REJECT and tag2 == REJECT:
                logging.debug("Terminating ... no more {}".format(ACCEPT))
                break
            phase += 1
Example #33
0
def simulate(args):
    """
    %prog simulate test

    Simulate CLM and IDS files with given names.

    The simulator assumes several distributions:
    - Links are distributed uniformly across genome
    - Link sizes are 1/u, with u uniform in [1e-7, 1e-3] (sizes 1e3 to 1e7)
    - Genes are distributed uniformly
    """
    p = OptionParser(simulate.__doc__)
    p.add_option("--genomesize",
                 default=10000000,
                 type="int",
                 help="Genome size")
    p.add_option("--genes", default=1000, type="int", help="Number of genes")
    p.add_option("--contigs",
                 default=100,
                 type="int",
                 help="Number of contigs")
    p.add_option("--coverage", default=10, type="int", help="Link coverage")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    pf, = args
    GenomeSize = opts.genomesize
    Genes = opts.genes
    Contigs = opts.contigs
    Coverage = opts.coverage
    PE = 500
    Links = int(GenomeSize * Coverage / PE)

    # Simulate the contig sizes that sum to GenomeSize
    # See also:
    # <https://en.wikipedia.org/wiki/User:Skinnerd/Simplex_Point_Picking>
    ContigSizes, = np.random.dirichlet([1] * Contigs, 1) * GenomeSize
    # np.round replaces np.round_, which was removed in NumPy 2.0
    ContigSizes = np.round(ContigSizes).astype(int)
    ContigStarts = np.zeros(Contigs, dtype=int)
    ContigStarts[1:] = np.cumsum(ContigSizes)[:-1]
    # Rounding may make the simulated total differ slightly from GenomeSize
    GenomeEnd = ContigStarts[-1] + ContigSizes[-1]

    # Write IDS file (file.write works on both Python 2 and Python 3)
    idsfile = pf + ".ids"
    with open(idsfile, "w") as fw:
        for i, s in enumerate(ContigSizes):
            fw.write("tig{:04d}\t{}\n".format(i, s))

    # Simulate the gene positions; randint's upper bound is exclusive, so
    # this draws from [0, GenomeSize - 1] like the deprecated
    # random_integers(0, GenomeSize - 1) did.
    GenePositions = np.sort(np.random.randint(0, GenomeSize, size=Genes))
    write_last_and_beds(pf, GenePositions, ContigStarts)

    # Simulate links, uniform start, with link distances following 1/x, where x
    # is the distance between the links. As an approximation, we have link sizes
    # between [1e3, 1e7], so we map from uniform [1e-7, 1e-3]
    LinkStarts = np.sort(np.random.randint(0, GenomeSize, size=Links))
    a, b = 1e-7, 1e-3
    LinkSizes = np.round(1 / ((b - a) * np.random.rand(Links) + a)).astype(int)
    LinkEnds = LinkStarts + LinkSizes

    # Find link to contig membership. side="right" counts the contig starts
    # that are <= position, so a position sitting exactly on a contig start
    # belongs to that contig (side="left" put it in the previous one and
    # mapped position 0 to contig -1).
    LinkStartContigs = np.searchsorted(ContigStarts, LinkStarts,
                                       side="right") - 1
    LinkEndContigs = np.searchsorted(ContigStarts, LinkEnds,
                                     side="right") - 1

    # Extract inter-contig links. Links running past the end of the last
    # contig are dropped explicitly: searchsorted(...) - 1 never reaches
    # `Contigs`, so comparing the contig index against it filtered nothing.
    InterContigLinks = (LinkStartContigs != LinkEndContigs) & \
                       (LinkEnds < GenomeEnd)
    ICLinkStartContigs = LinkStartContigs[InterContigLinks]
    ICLinkEndContigs = LinkEndContigs[InterContigLinks]
    ICLinkStarts = LinkStarts[InterContigLinks]
    ICLinkEnds = LinkEnds[InterContigLinks]

    # Write CLM file
    write_clm(pf, ICLinkStartContigs, ICLinkEndContigs, ICLinkStarts,
              ICLinkEnds, ContigStarts, ContigSizes)
Example #34
0
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix from <cdir>/all.GLM; '#' lines and the row whose
    # first field is 'X' carry no counts and are skipped.
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            if x == 'X':
                continue
            M[int(x), int(y)] = int(z)

    # The tour log is closed by the context manager even if the GA fails.
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Log one GA generation, labelled GA-<gen>[-<first fitness term>]
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            for x in co:
                contig_id = contig_ids[x.contig_name]
                oo.append(contig_id)
            pf = op.basename(ofile).split(".")[0]
            # Single-argument print() calls behave the same on Python 2 and 3
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate",
                             score_evaluate_M,
                             tour_sizes=tour_sizes,
                             tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox,
                                        npop=100,
                                        cpus=opts.cpus,
                                        callback=callbacki)
            print("{0} {1}".format(tour, tour.fitness))
            # Only the first ordering file is scored
            break
Example #35
0
def main():
    # Entry point: parse the command line, draw the synteny figure on a
    # full-canvas axes and save it next to the data file.
    parser = OptionParser(__doc__)
    parser.add_option(
        "--switch",
        help="Rename the seqid with two-column file [default: %default]")
    parser.add_option(
        "--tree",
        help="Display trees on the bottom of the figure [default: %default]")
    parser.add_option("--extra", help="Extra features in BED format")
    parser.add_option("--scalebar", action="store_true", default=False,
                      help="Add scale bar to the plot")
    opts, args, iopts = parser.set_image_options(figsize="8x7")

    # Exactly three positional arguments are required
    if len(args) != 3:
        sys.exit(not parser.print_help())

    datafile, bedfile, layoutfile = args

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Optional behaviour is forwarded to Synteny as keyword arguments
    synteny_options = dict(
        switch=opts.switch,
        tree=opts.tree,
        extra_features=opts.extra,
        scalebar=opts.scalebar,
    )
    Synteny(fig, root, datafile, bedfile, layoutfile, **synteny_options)

    # The root axes is a unit square with no visible frame
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    # Output name: data file minus its last extension, plus the image format
    prefix = datafile.rsplit(".", 1)[0]
    image_name = ".".join((prefix, iopts.format))
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #36
0
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG
    If coord-sorted BAM is provided, then it will use it as starting point

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired",
                 default=False,
                 action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single"
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    def mate_files(paths, mate):
        # Select files whose *name* carries the mate tag; directory
        # components (e.g. a folder called "lib_1_") must not count.
        tags = ("_{0}_".format(mate), ".{0}.".format(mate),
                "_{0}.".format(mate))
        return [x for x in paths
                if any(tag in op.basename(x) for tag in tags)]

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    # Everything below runs inside tfolder; the finally clause restores the
    # caller's working directory even when a step fails.
    try:
        flist = iglob("../" + inparam, "*.fq", "*.fastq", "*.fq.gz",
                      "*.fastq.gz")
        if paired:
            f1 = mate_files(flist, 1)
            f2 = mate_files(flist, 2)
            assert len(f1) == len(f2), \
                "Unbalanced read pairs: {0} left vs {1} right files".format(
                    len(f1), len(f2))
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1],
                                                        reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        os.chdir(cwd)
Example #37
0
def embed(args):
    """
    %prog embed evidencefile scaffolds.fasta contigs.fasta

    Use SSPACE evidencefile to scaffold contigs into existing scaffold
    structure, as in `scaffolds.fasta`. Contigs.fasta were used by SSPACE
    directly to scaffold.

    Rules:
    1. Only update existing structure by embedding contigs small enough to fit.
    2. Promote singleton contigs only if they are big (>= min_length).
    """
    p = OptionParser(embed.__doc__)
    p.set_mingap(default=10)
    p.add_option("--min_length",
                 default=200,
                 type="int",
                 help="Minimum length to consider [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    evidencefile, scaffolds, contigs = args
    min_length = opts.min_length
    splitfasta, oagp, cagp = gaps(
        [scaffolds, "--split", "--mingap={0}".format(opts.mingap)])

    agp = AGP(cagp)
    ref_graph = agp.graph

    ef = EvidenceFile(evidencefile, contigs)
    sizes = ef.sz
    patch_graph = ef.graph

    logging.debug("Reference graph: {0}".format(ref_graph))
    logging.debug("Patch graph: {0}".format(patch_graph))

    def report(*fields):
        # Space-separated status line on stderr; replaces the Python 2-only
        # `print >> sys.stderr, ...` and works on Python 2 and 3 alike.
        sys.stderr.write(" ".join(str(f) for f in fields) + "\n")

    newagp = deepcopy(agp)

    seen = set()
    deleted = set()
    for a in agp:
        if a.is_gap:
            continue

        name = a.component_id
        obj = a.object
        if name in deleted:
            report("* Skip {0}, already embedded".format(name))
            continue

        seen.add(name)

        target_name, tag = get_target(ref_graph, name)
        path = patch_graph.get_path(name, target_name, tag=tag)
        path_size = sum(sizes[x.v] for x, t in path) if path else None
        status = NO_UPDATE

        # Heuristic, the patch must not be too long
        if path and path_size > min_length and len(path) > 3:
            path = None

        if not path:
            report(name, target_name, path, path_size, status)
            continue

        # Refuse patches that would reach back to an already visited contig
        backward = False
        for x, t in path:
            if x.v in seen:
                report("* Does not allow backward"
                       " patch on {0}".format(x.v))
                backward = True
                break

        if backward:
            continue

        # Build the path plus the ends
        vv = patch_graph.get_node(name)
        path.appendleft((vv, tag))
        if tag == ">":
            path.reverse()
            status = INSERT_BEFORE
        elif target_name is None:
            status = INSERT_AFTER
        else:
            target = patch_graph.get_node(target_name)
            path.append((target, tag))
            status = INSERT_BETWEEN

        report(name, target_name, path, path_size, status)

        # Trim the ends off from the constructed AGPLines
        lines = path_to_agp(patch_graph, path, obj, sizes, status)
        if status == INSERT_BEFORE:
            lines = lines[:-1]
            td = newagp.insert_lines(name, lines,
                                     delete=True, verbose=True)
        elif status == INSERT_AFTER:
            lines = lines[1:]
            td = newagp.insert_lines(name, lines, after=True,
                                     delete=True, verbose=True)
        else:
            lines = lines[1:-1]
            td = newagp.update_between(name, target_name, lines,
                                       delete=True, verbose=True)
        deleted |= td
        seen |= td

    # Recruit big singleton contigs that were never placed
    for ctg, size in sizes.items():
        if ctg in seen:
            continue
        if size < min_length:
            continue
        newagp.append(AGPLine.cline(ctg, ctg, sizes, "?"))

    # Write a new AGP file
    newagpfile = "embedded.agp"
    newagp.print_to_file(newagpfile, index=True)
    tidy([newagpfile, contigs])
Example #38
0
def histogram(args):
    """
    %prog histogram meryl.histogram species K

    Plot the histogram based on meryl K-mer distribution, species and K are
    only used to annotate the graphic.
    """
    p = OptionParser(histogram.__doc__)
    p.add_option("--vmin",
                 dest="vmin",
                 default=1,
                 type="int",
                 help="minimum value, inclusive [default: %default]")
    p.add_option("--vmax",
                 dest="vmax",
                 default=100,
                 type="int",
                 help="maximum value, inclusive [default: %default]")
    p.add_option("--pdf",
                 default=False,
                 action="store_true",
                 help="Print PDF instead of ASCII plot [default: %default]")
    p.add_option("--coverage",
                 default=0,
                 type="int",
                 help="Kmer coverage [default: auto]")
    p.add_option("--nopeaks",
                 default=False,
                 action="store_true",
                 help="Do not annotate K-mer peaks")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    histfile, species, N = args
    use_ascii = not opts.pdf  # renamed: `ascii` would shadow the builtin
    peaks = not opts.nopeaks
    N = int(N)

    if histfile.rsplit(".", 1)[-1] in ("mcdat", "mcidx"):
        logging.debug("CA kmer index found")
        histfile = merylhistogram(histfile)

    ks = KmerSpectrum(histfile)
    ks.analyze(K=N)

    Total_Kmers = int(ks.totalKmers)
    coverage = opts.coverage
    # --coverage 0 (the default) means "use the coverage found by analyze()"
    Kmer_coverage = ks.max2 if not coverage else coverage
    Genome_size = int(round(Total_Kmers * 1. / Kmer_coverage))

    Total_Kmers_msg = "Total {0}-mers: {1}".format(N, thousands(Total_Kmers))
    Kmer_coverage_msg = "{0}-mer coverage: {1}".format(N, Kmer_coverage)
    Genome_size_msg = "Estimated genome size: {0:.1f}Mb".\
                        format(Genome_size / 1e6)
    Repetitive_msg = ks.repetitive
    SNPrate_msg = ks.snprate

    for msg in (Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg):
        print(msg, file=sys.stderr)

    x, y = ks.get_xy(opts.vmin, opts.vmax)
    title = "{0} {1}-mer histogram".format(species, N)

    if use_ascii:
        asciiplot(x, y, title=title)
        return Genome_size

    plt.figure(1, (6, 6))
    plt.plot(x, y, 'g-', lw=2, alpha=.5)
    ax = plt.gca()

    if peaks:
        t = (ks.min1, ks.max1, ks.min2, ks.max2, ks.min3)
        tcounts = [(cov, cnt) for cov, cnt in ks.counts if cov in t]
        if tcounts:
            px, py = zip(*tcounts)
            tcounts = dict(tcounts)
            plt.plot(px, py, 'ko', lw=2, mec='k', mfc='w')
            # Label a peak only when it is among the collected points; a
            # peak missing from ks.counts used to raise KeyError here.
            if ks.max1 in tcounts:
                ax.text(ks.max1, tcounts[ks.max1], "SNP peak", va="top")
            if ks.max2 in tcounts:
                ax.text(ks.max2, tcounts[ks.max2], "Main peak")

    messages = [
        Total_Kmers_msg, Kmer_coverage_msg, Genome_size_msg, Repetitive_msg,
        SNPrate_msg
    ]
    write_messages(ax, messages)

    # Leave headroom above the curve for the message box
    ymin, ymax = ax.get_ylim()
    ymax = ymax * 7 / 6

    ax.set_title(markup(title))
    ax.set_ylim((ymin, ymax))
    xlabel, ylabel = "Coverage (X)", "Counts"
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)
    set_human_axis(ax)

    imagename = histfile.split(".")[0] + ".pdf"
    savefig(imagename, dpi=100)

    return Genome_size
Example #39
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K",
                 default=23,
                 type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage",
                 default=40,
                 type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix",
                 default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist",
                 default=False,
                 action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is decided from the first input file only
    gzip = fq.endswith(".gz")

    # Floor division keeps -s an integer on Python 3, where `/` would yield
    # a float such as 1234.5.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".\
                    format(human_size(totalfilesize,
                           a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    # Keep the list for the freshness check; the joined form is only for
    # the shell command line.
    fastqargs = " ".join(fastqfiles)

    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        cmd = "gzip -dc {0} | ".format(fastqargs) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqargs

    # Pass the individual files: the space-joined string is not a real path
    # whenever more than one input is given.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Example #40
0
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf", help="BAMBUS configuration file [default: %default]")
    p.add_option("--prefix", action="store_true", default=False,
            help="Only keep links between IDs with same prefix [default: %default]")
    opts, args = p.parse_args(args)

    # One contig FASTA followed by (reads, mapping) pairs: an odd count >= 3
    nargs = len(args)
    if nargs < 3 or nargs % 2 == 0:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    read_sets = list(grouper(args[1:], 2))

    # Per library: (reads FASTA, mates BED, mates file)
    trios = []
    for fastafile, bedfile in read_sets:
        stem = bedfile.rsplit(".", 1)[0]
        matefile = stem + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt.append("--prefix")
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile column by column
    merged_names = ("bambus.reads.fasta", "bambus.bed", "bambus.mates")
    bbfasta, bbbed, bbmate = merged_names
    for column, target in zip(zip(*trios), merged_names):
        merger = FileMerger(column, outfile=target)
        merger.merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    # Split the contigs into those listed in the ids file and the rest
    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    select = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(select + inputfasta)
    sh(select + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    bambus = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        bambus += " -C {0}".format(opts.conf)
    sh(bambus)

    sh("untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".
       format(prefix))

    final = "final"
    sh("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
       "-merge -detail -oo -sum -o {1}".format(prefix, final))

    # Join the contigs following the order/orientation file
    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
Example #41
0
def ld(args):
    """
    %prog ld map

    Calculate pairwise linkage disequilibrium given MSTmap.
    """
    from random import sample
    from jcvi.algorithms.matrix import symmetrize

    p = OptionParser(ld.__doc__)
    p.add_option("--subsample", default=1000, type="int",
                 help="Subsample markers to speed up")
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 1:
        sys.exit(not p.print_help())

    mstmap = args[0]
    markers = MSTMap(mstmap)

    markerbedfile = mstmap + ".subsample.bed"
    ldmatrix = mstmap + ".subsample.matrix"
    # Take random subsample while keeping marker order
    if opts.subsample < markers.nmarkers:
        picked = sorted(sample(range(len(markers)), opts.subsample))
        markers = [markers[k] for k in picked]
    else:
        logging.debug("Use all markers, --subsample ignored")

    nmarkers = len(markers)
    if need_update(mstmap, (ldmatrix, markerbedfile)):
        # (Re)build the marker BED and the LD matrix
        with open(markerbedfile, "w") as fw:
            fw.write("\n".join(m.bedline for m in markers) + "\n")
            logging.debug("Write marker set of size {0} to file `{1}`.".format(
                nmarkers, markerbedfile))

        # Fill one triangle pair by pair, then mirror it with symmetrize()
        M = np.zeros((nmarkers, nmarkers), dtype=float)
        for i, j in combinations(range(nmarkers), 2):
            M[i, j] = calc_ldscore(markers[i].genotype, markers[j].genotype)

        M = symmetrize(M)

        logging.debug("Write LD matrix to file `{0}`.".format(ldmatrix))
        M.tofile(ldmatrix)
    else:
        # Reuse the cached matrix; its dimension comes from the cached BED
        nmarkers = len(Bed(markerbedfile))
        M = np.fromfile(ldmatrix, dtype="float").reshape(nmarkers, nmarkers)
        logging.debug("LD matrix `{0}` exists ({1}x{1}).".format(
            ldmatrix, nmarkers))

    from jcvi.graphics.base import plt, savefig, Rectangle, draw_cmap

    plt.rcParams["axes.linewidth"] = 0

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the heatmap

    ax.matshow(M, cmap=iopts.cmap)

    # Plot chromosomes breaks
    bed = Bed(markerbedfile)
    xsize = len(bed)
    extent = (0, nmarkers)
    ignore_size = 20
    chr_labels = []

    for seqid, beg, end in bed.get_breaks():
        too_small = abs(end - beg) < ignore_size
        chr_labels.append((seqid, (beg + end) / 2, too_small))
        if not too_small:
            ax.plot((end, end), extent, "w-", lw=1)
            ax.plot(extent, (end, end), "w-", lw=1)

    # Plot chromosome labels along the top and the left of the heatmap
    for label, pos, too_small in chr_labels:
        if too_small:
            continue
        pos = 0.1 + pos * 0.8 / xsize
        root.text(pos, 0.91, label,
                  ha="center", va="bottom", rotation=45, color="grey")
        root.text(0.09, pos, label, ha="right", va="center", color="grey")

    ax.set_xlim(extent)
    ax.set_ylim(extent)
    ax.set_axis_off()

    draw_cmap(root, "Pairwise LD (r2)", 0, 1, cmap=iopts.cmap)

    root.add_patch(Rectangle((0.1, 0.1), 0.8, 0.8, fill=False, ec="k", lw=2))
    m = mstmap.split(".")[0]
    caption = "Linkage Disequilibrium between {0} markers".format(m)
    root.text(0.5, 0.06, caption, ha="center")

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = m + ".subsample." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #42
0
def multihistogram(args):
    """
    %prog multihistogram *.histogram species

    Plot the histogram based on a set of K-mer hisotograms. The method is based
    on Star et al.'s method (Atlantic Cod genome paper).
    """
    p = OptionParser(multihistogram.__doc__)
    p.add_option("--kmin",
                 default=15,
                 type="int",
                 help="Minimum K-mer size, inclusive")
    p.add_option("--kmax",
                 default=30,
                 type="int",
                 help="Maximum K-mer size, inclusive")
    p.add_option("--vmin",
                 default=2,
                 type="int",
                 help="Minimum value, inclusive")
    p.add_option("--vmax",
                 default=100,
                 type="int",
                 help="Maximum value, inclusive")
    opts, args, iopts = p.set_image_options(args, figsize="10x5", dpi=300)

    # At least one histogram plus the species name; a lone argument would
    # leave nothing to plot.
    if len(args) < 2:
        sys.exit(not p.print_help())

    histfiles = args[:-1]
    species = args[-1]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])
    A = fig.add_axes([.08, .12, .38, .76])
    B = fig.add_axes([.58, .12, .38, .76])

    lines = []
    legends = []
    genomesizes = []
    for histfile in histfiles:
        # K is parsed from the file name (text after the last "-" of the
        # first dot-separated field); filter on it before loading the file.
        K = get_number(op.basename(histfile).split(".")[0].split("-")[-1])
        if not opts.kmin <= K <= opts.kmax:
            continue

        ks = KmerSpectrum(histfile)
        x, y = ks.get_xy(opts.vmin, opts.vmax)
        line, = A.plot(x, y, '-', lw=1)
        lines.append(line)
        legends.append("K = {0}".format(K))
        ks.analyze(K=K)
        genomesizes.append((K, ks.genomesize / 1e6))

    if not genomesizes:
        # zip(*genomesizes) below cannot unpack an empty list
        sys.exit("No histogram with K between {0} and {1}".format(
            opts.kmin, opts.kmax))

    leg = A.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(.5)

    title = "{0} genome K-mer histogram".format(species)
    A.set_title(markup(title))
    xlabel, ylabel = "Coverage (X)", "Counts"
    A.set_xlabel(xlabel)
    A.set_ylabel(ylabel)
    set_human_axis(A)

    title = "{0} genome size estimate".format(species)
    B.set_title(markup(title))
    x, y = zip(*genomesizes)
    B.plot(x, y, "ko", mfc='w')
    # Quadratic trend of the genome size estimate against K
    t = np.linspace(opts.kmin - .5, opts.kmax + .5, 100)
    fit = np.poly1d(np.polyfit(x, y, 2))
    B.plot(t, fit(t), "r:")

    xlabel, ylabel = "K-mer size", "Estimated genome size (Mb)"
    B.set_xlabel(xlabel)
    B.set_ylabel(ylabel)
    set_ticklabels_helvetica(B)

    labels = ((.04, .96, 'A'), (.54, .96, 'B'))
    panel_labels(root, labels)

    normalize_axes(root)
    imagename = species + ".multiK.pdf"
    savefig(imagename, dpi=iopts.dpi, iopts=iopts)
Example #43
0
def demo(args):
    """
    %prog demo

    Draw sample gene features to illustrate the various fates of duplicate
    genes - to be used in a book chapter.
    """
    p = OptionParser(demo.__doc__)
    opts, args = p.parse_args(args)

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    panel_space = .23
    dup_space = .025

    def level(panel, side):
        # y of a copy: `panel` rows below the top, nudged up (+1) or down (-1)
        return .9 - panel * panel_space + side * dup_space

    # (x, panel) of each duplicate pair; every pair yields an upper and a
    # lower copy, in that order
    pairs = (
        (.5, 1),  # identical copies
        (.5, 2),  # degenerate copies
        (.2, 3),  # sub-functionalization
        (.5, 3),  # neo-functionalization
        (.8, 3),  # non-functionalization
    )
    # Draw a gene and two regulatory elements at these arbitrary locations
    locs = [(.5, .9)]  # ancestral gene
    for px, panel in pairs:
        for side in (1, -1):
            locs.append((px, level(panel, side)))

    # Two letters per gene, one per regulatory element: the colour code to
    # draw, or 'w' to leave the element out. Unlisted genes use the default.
    default_regulator = "gm"
    altered = {3: "wm", 5: "wm", 6: "gw", 7: "wb", 9: "ww"}

    width = .24
    rotation = 30
    tip = .02
    for i, (cx, cy) in enumerate(locs):
        # gene backbone
        Glyph(root, cx - .5 * width, cx + .5 * width, cy)
        if i == 9:  # upper copy for non-functionalization
            continue

        # coding region
        Glyph(root, cx - .16 * width, cx + .45 * width, cy, fc="k")

        # two regulatory elements
        x1, x2 = cx - .4 * width, cx - .28 * width
        for ex, fc in zip((x1, x2), altered.get(i, default_regulator)):
            if fc != 'w':
                DoubleCircle(root, ex, cy, fc=fc)

        # name the elements on the ancestral gene and on the neo-functionalized copy
        ya = cy + tip
        if i == 0:
            root.text(x1, ya, "Flower", rotation=rotation, va="bottom")
            root.text(x2, ya, "Root", rotation=rotation, va="bottom")
        elif i == 7:
            root.text(x2, ya, "Leaf", rotation=rotation, va="bottom")

    # Draw arrows between panels (center)
    arrow_dist = .08
    for ar_ypos in (.3, .53, .76):
        root.annotate(" ", (.5, ar_ypos), (.5, ar_ypos + arrow_dist),
                      arrowprops=arrowprops)

    # Arrows fanning out from the centre to the left and right fates
    for ar_xpos in (.2, .8):
        root.annotate(" ", (ar_xpos, .3), (.5, .3 + arrow_dist),
                      arrowprops=arrowprops)

    # Duplication, Degeneration
    for ypos, process in ((.76, "Duplication"), (.53, "Degeneration")):
        root.text(.6, ypos + .02, process, fontweight="bold")

    # Label of fates
    fates = (
        (.2, "Subfunctionalization"),
        (.5, "Neofunctionalization"),
        (.8, "Nonfunctionalization"),
    )
    for xpos, fate in fates:
        RoundLabel(root, xpos, .05, fate)

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    savefig("demo.pdf", dpi=300)
Example #44
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is runtime data and intentionally left unchanged.
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci",
                 default=2,
                 type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs",
                 default=2,
                 type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx",
                 default=None,
                 type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    # Number of read files grouped per sample: 1 for single-end, 2 for pairs
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    # The per-sample file list is called `reads` rather than `p` so that the
    # option parser bound above is not shadowed inside the loop.
    for reads, pf in iter_project(folder, pattern=pattern, n=n,
                                  commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)

        # kmc reads its inputs from an @listfile, one path per line. The
        # context manager guarantees the list is flushed and closed even if
        # writing fails.
        infiles = pf + ".infiles"
        with open(infiles, "w") as fw:
            print("\n".join(reads), file=fw)

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Example #45
0
File: cdt.py Project: zjwang6/jcvi
            grouper.join(g.parent, g.left_child, g.right_child)

        parents = {}
        for i, group in enumerate(grouper):
            for g in group:
                parents[g] = i

        partitions = [[parents.get(a, x), x] for a, x in names]
        for key, parts in groupby(partitions, key=lambda x: x[0]):
            yield list(x[1] for x in parts)


def main(args):
    """Build the gene tree from a CDT file and write it out as Newick.

    `args` must unpack into exactly two items: the input CDT path and the
    output Newick path.
    """
    cdt_path, newick_path = args
    tree_source = CDT(cdt_path)
    tree_source.get_gtr_tree()
    tree_source.print_newick(newick_path)


if __name__ == '__main__':

    # Script entry point: exactly two positional arguments are required,
    # otherwise the usage text is printed and the process exits.
    p = OptionParser(__doc__)
    opts, args = p.parse_args()

    if len(args) == 2:
        main(args)
    else:
        sys.exit(not p.print_help())
Example #46
0
def dotplot(args):
    """
    %prog dotplot map.csv ref.fasta

    Make dotplot between chromosomes and linkage maps.
    The input map is csv formatted, for example:

    ScaffoldID,ScaffoldPosition,LinkageGroup,GeneticPosition
    scaffold_2707,11508,1,0
    scaffold_2707,11525,1,1.2
    """
    from natsort import natsorted
    from jcvi.assembly.allmaps import CSVMapLine
    from jcvi.formats.sizes import Sizes
    from jcvi.graphics.base import shorten
    from jcvi.graphics.dotplot import (
        plt,
        savefig,
        markup,
        normalize_axes,
        downsample,
        plot_breaks_and_labels,
        thousands,
    )

    p = OptionParser(dotplot.__doc__)
    p.set_outfile(outfile=None)
    opts, args, iopts = p.set_image_options(
        args, figsize="8x8", style="dark", dpi=90, cmap="copper"
    )

    if len(args) != 2:
        sys.exit(not p.print_help())

    csvfile, fastafile = args
    contig_sizes = natsorted(Sizes(fastafile).mapping.items())
    seen = set()
    markers = []

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])  # the whole canvas
    ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])  # the dot plot

    # One CSVMapLine per input row
    for row in must_open(csvfile):
        marker = CSVMapLine(row)
        seen.add(marker.seqid)
        markers.append(marker)

    # X-axis is the genome assembly: contigs laid end to end in natural order
    ctgs, ctg_sizes = zip(*contig_sizes)
    xsize = sum(ctg_sizes)
    ctg_ends = list(np.cumsum(ctg_sizes))
    ctg_starts = [0] + ctg_ends
    qbreaks = list(zip(ctgs, ctg_starts, ctg_ends))
    qstarts = dict(zip(ctgs, ctg_starts))

    # Y-axis is the map: each linkage group spans up to its largest cM value
    def by_lg(entry):
        return entry.lg

    markers.sort(key=by_lg)
    lg_extent = {
        lg: max([member.cm for member in members])
        for lg, members in groupby(markers, key=by_lg)
    }
    lgs, lg_sizes = zip(*natsorted(lg_extent.items()))
    ysize = sum(lg_sizes)
    lg_ends = list(np.cumsum(lg_sizes))
    lg_starts = [0] + lg_ends
    sbreaks = list(zip([("LG" + name) for name in lgs], lg_starts, lg_ends))
    sstarts = dict(zip(lgs, lg_starts))

    # Re-code all the scatter dots into the concatenated coordinate systems;
    # markers on sequences absent from the FASTA are dropped.
    data = []
    for marker in markers:
        if marker.seqid not in qstarts:
            continue
        data.append((qstarts[marker.seqid] + marker.pos,
                     sstarts[marker.lg] + marker.cm,
                     "g"))
    npairs = downsample(data)

    xs, ys, colors = zip(*data)
    ax.scatter(xs, ys, c=colors, edgecolors="none", s=2, lw=0)

    # Flip X-Y label
    gy, gx = op.basename(csvfile).split(".")[:2]
    gx = shorten(gx, maxchar=30)
    gy = shorten(gy, maxchar=30)
    xlim, ylim = plot_breaks_and_labels(
        fig, root, ax, gx, gy, xsize, ysize, qbreaks, sbreaks
    )
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)

    title = "Alignment: {} vs {}".format(gx, gy)
    title += " ({} markers)".format(thousands(npairs))
    root.set_title(markup(title), x=0.5, y=0.96, color="k")
    logging.debug(title)
    normalize_axes(root)

    # Default image name is the csv file name with its extension swapped
    image_name = opts.outfile
    if not image_name:
        image_name = csvfile.rsplit(".", 1)[0] + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
    fig.clear()
Example #47
0
def rename(args):
    """
    %prog rename genes.bed [gaps.bed]

    Rename genes for annotation release.

    For genes on chromosomes (e.g. the 12th gene on C1):
    Bo1g00120

    For genes on scaffolds (e.g. the 12th gene on unplaced Scaffold00285):
    Bo00285s120

    The genes identifiers will increment by 10. So assuming no gap, these are
    the consecutive genes:
    Bo1g00120, Bo1g00130, Bo1g00140...
    Bo00285s120, Bo00285s130, Bo00285s140...

    When we encounter gaps, we would like the increment to be larger. For example,
    Bo1g00120, <gap>, Bo1g01120...

    Gaps bed file is optional.
    """
    # NOTE: the docstring above is the usage text shown by OptionParser and is
    # therefore kept verbatim.
    import string

    p = OptionParser(rename.__doc__)
    p.add_option(
        "-a",
        dest="gene_increment",
        default=10,
        type="int",
        help="Increment for continuous genes",
    )
    p.add_option(
        "-b",
        dest="gap_increment",
        default=1000,
        type="int",
        help="Increment for gaps",
    )
    p.add_option(
        "--pad0",
        default=6,
        type="int",
        help="Pad gene identifiers with 0",
    )
    p.add_option(
        "--spad0",
        default=4,
        type="int",
        help="Pad gene identifiers on small scaffolds",
    )
    p.add_option("--prefix", default="Bo", help="Genome prefix")
    p.add_option(
        "--jgi",
        default=False,
        action="store_true",
        help="Create JGI style identifier PREFIX.NN[G|TE]NNNNN.1",
    )
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    genebed = args[0]
    gapbed = args[1] if len(args) == 2 else None
    prefix = opts.prefix
    gene_increment = opts.gene_increment
    gap_increment = opts.gap_increment

    # Gaps are merged into the gene list so that, once sorted, they sit
    # between the genes they separate.
    genes = Bed(genebed)
    if gapbed:
        with open(gapbed) as fp:
            for row in fp:
                genes.append(BedLine(row))

    genes.sort(key=genes.key)
    idsfile = prefix + ".ids"
    newbedfile = prefix + ".bed"
    # The gene following a gap still receives `gene_increment`, so the gap
    # itself only contributes the difference.
    gap_increment -= gene_increment
    assert gap_increment >= 0

    if opts.jgi:
        prefix += "."

    # The ids file is written inside a context manager so it is flushed and
    # closed before the function reports it as written.
    with open(idsfile, "w") as fw:
        for seqid, lines in groupby(genes, key=lambda x: x.seqid):
            lines = list(lines)
            # Sequences with few features get the shorter zero padding
            pad0 = opts.pad0 if len(lines) > 1000 else opts.spad0
            isChr = seqid[0].upper() == "C"
            digits = "".join(x for x in seqid if x in string.digits)
            gs = "g" if isChr else "s"
            pp = prefix + digits + gs
            idx = 0
            if isChr:
                idx += gap_increment

            for r in lines:
                # Records without a +/- strand are the gap features
                isGap = r.strand not in ("+", "-")
                if isGap:
                    idx += gap_increment
                    continue

                idx += gene_increment
                accn = pp + "{0:0{1}d}".format(idx, pad0)
                oldaccn = r.accn
                print("\t".join((oldaccn, accn)), file=fw)
                r.accn = accn

    genes.print_to_file(newbedfile)
    logging.debug("Converted IDs written to `{0}`.".format(idsfile))
    logging.debug("Converted bed written to `{0}`.".format(newbedfile))
Example #48
0
def main(args):
    """Run the `quota mapping` screen on an anchors file.

    Parses the command line, selects the blocks that satisfy the requested
    quota via `solve_lp`, writes the selected block ids to
    `<prefix>.<qa>x<qb>` and, with `--screen`, also produces a filtered
    anchors file through `jcvi.compara.synteny.screen`.
    """
    p = OptionParser(__doc__)

    p.set_beds()
    p.add_option(
        "--quota",
        default="1:1",
        help="`quota mapping` procedure -- screen blocks to constrain mapping"
        " (useful for orthology), "
        "put in the format like (#subgenomes expected for genome X):"
        "(#subgenomes expected for genome Y)",
    )
    p.add_option(
        "--Nm",
        dest="Nmax",
        type="int",
        default=10,
        help="distance cutoff to tolerate two blocks that are "
        "slightly overlapping (cutoff for `quota mapping`) "
        "[default: %default units (gene or bp dist)]",
    )

    p.add_option(
        "--self",
        dest="self_match",
        action="store_true",
        default=False,
        help="you might turn this on when screening paralogous blocks, "
        "esp. if you have reduced mirrored blocks into non-redundant set",
    )
    p.set_verbose(help="Show verbose solver output")

    p.add_option(
        "--screen",
        default=False,
        action="store_true",
        help="generate new anchors file",
    )

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (qa_file, ) = args
    _, _, qorder, sorder, _ = check_beds(qa_file, p, opts)

    # Sanity check for the quota. The quota is always parsed: previously an
    # empty `--quota` skipped this step and left `quota`, `qa` and `qb`
    # unbound, which surfaced later as a NameError instead of this message.
    try:
        qa, qb = (opts.quota or "").split(":")
        qa, qb = int(qa), int(qb)
    except ValueError:
        logging.error(
            "quota string should be the form x:x (2:4, 1:3, etc.)")
        sys.exit(1)

    if opts.self_match and qa != qb:
        raise Exception("when comparing genome to itself, "
                        "quota must be the same number "
                        "(like 1:1, 2:2) you have %s" % opts.quota)
    quota = (qa, qb)

    self_match = opts.self_match

    clusters = read_clusters(qa_file, qorder, sorder)
    for cluster in clusters:
        assert len(cluster) > 0

    # below runs `quota mapping`; solver scratch files go to a `work`
    # directory next to the anchors file
    work_dir = op.join(op.dirname(op.abspath(qa_file)), "work")

    selected_ids = solve_lp(
        clusters,
        quota,
        work_dir=work_dir,
        Nmax=opts.Nmax,
        self_match=self_match,
        verbose=opts.verbose,
    )

    logging.debug("Selected %d blocks", len(selected_ids))
    prefix = qa_file.rsplit(".", 1)[0]
    suffix = "{}x{}".format(qa, qb)
    outfile = ".".join((prefix, suffix))
    fw = must_open(outfile, "w")
    print(",".join(str(x) for x in selected_ids), file=fw)
    fw.close()
    logging.debug("Screened blocks ids written to `%s`", outfile)

    if opts.screen:
        from jcvi.compara.synteny import screen

        new_qa_file = ".".join((prefix, suffix, "anchors"))
        largs = [qa_file, new_qa_file, "--ids", outfile]
        # Forward the bed files only when both were given explicitly
        if opts.qbed and opts.sbed:
            largs += ["--qbed={0}".format(opts.qbed)]
            largs += ["--sbed={0}".format(opts.sbed)]
        screen(largs)
Example #49
0
def renumber(args):
    """
    %prog renumber Mt35.consolidated.bed > tagged.bed

    Renumber genes for annotation updates.
    """
    from jcvi.algorithms.lis import longest_increasing_subsequence
    from jcvi.utils.grouper import Grouper

    p = OptionParser(renumber.__doc__)
    p.set_annot_reformat_opts()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bedfile, ) = args

    stem = bedfile.rsplit(".", 1)[0]
    abedfile, bbedfile = stem + ".a.bed", stem + ".b.bed"
    if need_update(bedfile, (abedfile, bbedfile)):
        prepare(bedfile)

    # Accessions joined by ";" in the b.bed belong to one overlap group
    overlaps = Grouper()
    for feature in Bed(bbedfile):
        overlaps.join(*feature.accn.split(";"))

    bed = Bed(abedfile)
    for seqid, sbed in bed.sub_beds():
        current_chr = chr_number(seqid)
        if not current_chr:
            continue

        # Ranks of the accessions whose name already points at this chromosome
        ranks, on_chr = [], set()
        for s in sbed:
            achr, arank = atg_name(s.accn)
            if achr == current_chr:
                ranks.append(arank)
                on_chr.add(s.accn)

        lranks = longest_increasing_subsequence(ranks)
        print(current_chr, len(sbed), "==>", len(ranks), "==>", len(lranks),
              file=sys.stderr)

        # Names (gene and "te" flavours) of every rank on the increasing
        # backbone
        naming = dict(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
        gene_frames = set(
            gene_name(current_chr, x, **naming) for x in lranks)
        te_frames = set(
            gene_name(current_chr, x, sep="te", **naming) for x in lranks)
        granks = gene_frames | te_frames

        tagstore = {}
        for s in sbed:
            accn = s.accn
            # Return value is unused; the call is retained from the original.
            _achr, _arank = atg_name(accn)
            if accn in granks:
                tagstore[accn] = (accn, FRAME)
            elif accn in on_chr:
                tagstore[accn] = (accn, RETAIN)
            else:
                tagstore[accn] = (".", NEW)

        # Find cases where genes overlap: within each overlap group the
        # member with the best PRIORITY position wins, everything else is
        # tagged as overlapping the winner.
        for s in sbed:
            accn = s.accn
            candidates = []
            for member in overlaps[accn]:
                member_tag = tagstore[member][-1] if member in tagstore \
                    else NEW
                candidates.append((PRIORITY.index(member_tag), member))
            best = min(candidates)[-1]

            tag = tagstore[accn] if accn == best else (best, OVERLAP)
            print("\t".join((str(s), "|".join(tag))))
Example #50
0
def main():
    """
    %prog database.fa query.fa [options]

    Wrapper for NCBI BLAST+.
    """
    p = OptionParser(main.__doc__)

    p.add_option("--format", default=" \'6 qseqid sseqid pident length " \
            "mismatch gapopen qstart qend sstart send evalue bitscore\' ",
            help="0-11, learn more with \"blastp -help\". [default: %default]")
    p.add_option("--path", dest="blast_path", default=None,
            help="specify BLAST+ path including the program name")
    p.add_option("--prog", dest="blast_program", default="blastp",
            help="specify BLAST+ program to use. See complete list here: " \
            "http://www.ncbi.nlm.nih.gov/books/NBK52640/#chapter1.Installation"
            " [default: %default]")
    p.set_align(evalue=.01)
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")

    p.set_cpus()
    p.set_params()
    p.set_outfile()
    opts, args = p.parse_args()

    if len(args) != 2 or opts.blast_program is None:
        sys.exit(not p.print_help())

    # Positional order is database first, query second
    bfasta_fn, afasta_fn = args
    assert op.exists(afasta_fn)
    assert op.exists(bfasta_fn)

    afasta_fn = op.abspath(afasta_fn)
    bfasta_fn = op.abspath(bfasta_fn)
    out_fh = must_open(opts.outfile, "w")

    extra = opts.extra
    blast_program = opts.blast_program

    # --path may name either the executable itself or its directory
    blast_bin = opts.blast_path or blast_program
    if op.basename(blast_bin) != blast_program:
        blast_bin = blast_bin + "/" + blast_program

    cpus = opts.cpus
    if cpus > 1:
        logging.debug("Dispatch job to %d cpus", cpus)
        queries = split([afasta_fn, "outdir", str(cpus)]).names
    else:
        queries = [afasta_fn]

    if op.basename(blast_bin) in ["blastp", "blastx"]:
        dbtype = "prot"
    else:
        dbtype = "nucl"

    # Index file passed to run_formatdb as `outfile`; for nucleotide
    # databases the `.00.nin` name is used when that file already exists.
    db = bfasta_fn
    if dbtype == "prot":
        nin = db + ".pin"
    else:
        nin00 = db + ".00.nin"
        nin = nin00 if op.exists(nin00) else (db + ".nin")

    run_formatdb(infile=db, outfile=nin, dbtype=dbtype)

    lock = Lock()

    blast_cmd = "{0} -db {1} -outfmt {2}".format(
        blast_bin, bfasta_fn, opts.format)
    blast_cmd += " -evalue {0} -max_target_seqs {1}".format(
        opts.evalue, opts.best)
    if extra:
        blast_cmd += " " + extra.strip()

    # One job per query chunk, numbered from 1
    job_args = [(job_no, cpus, out_fh, blast_cmd, query, lock)
                for job_no, query in zip(range(1, cpus + 1), queries)]
    g = Jobs(target=blastplus, args=job_args)
    g.run()
Example #51
0
def plot(args):
    """
    %prog plot tagged.new.bed chr1

    Plot gene identifiers along a particular chromosome, often to illustrate the
    gene id assignment procedure.
    """
    # NOTE: the docstring above is the usage text shown by OptionParser and is
    # therefore kept verbatim.
    from jcvi.graphics.base import plt, savefig
    from jcvi.graphics.chromosome import ChromosomeMap

    p = OptionParser(plot.__doc__)
    p.add_option("--firstn", type="int", help="Only plot the first N genes")
    p.add_option("--ymax", type="int", help="Y-axis max value")
    p.add_option("--log", action="store_true", help="Write plotting data")
    opts, args, iopts = p.set_image_options(args, figsize="6x4")

    if len(args) != 2:
        sys.exit(not p.print_help())

    taggedbed, seqid = args
    bed = Bed(taggedbed)
    beds = list(bed.sub_bed(seqid))

    # (gene index along the chromosome, rank encoded in the identifier),
    # split by whether the identifier was newly instantiated.
    old, new = [], []
    i = 0
    for b in beds:
        accn = b.extra[0]
        if "te" in accn:
            continue

        accn, tag = accn.split("|")
        if tag == "OVERLAP":
            continue

        _, rank = atg_name(accn)
        if tag == "NEW":
            new.append((i, rank))
        else:
            old.append((i, rank))
        i += 1

    ngenes = i
    assert ngenes == len(new) + len(old)

    logging.debug("Imported {0} ranks on {1}.".format(ngenes, seqid))
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    xstart, xend = 0.2, 0.8
    ystart, yend = 0.2, 0.8
    pad = 0.02

    ngenes = opts.firstn or ngenes
    ymax = opts.ymax or 500000

    title = "Assignment of Medtr identifiers"
    # The "first N genes" wording describes the --firstn truncation, so it is
    # keyed on that option (it was previously keyed on --ymax).
    if opts.firstn:
        subtitle = "{0}, first {1} genes".format(seqid, ngenes)
    else:
        subtitle = "{0}, {1} genes ({2} new)".format(seqid, ngenes, len(new))

    chr_map = ChromosomeMap(fig, root, xstart, xend, ystart, yend, pad, 0,
                            ymax, 5, title, subtitle)

    ax = chr_map.axes

    if opts.log:
        from jcvi.utils.table import write_csv

        header = ["x", "y"]
        write_csv(header, new, filename=seqid + ".new")
        write_csv(header, old, filename=seqid + ".old")

    # zip(*[]) cannot be unpacked into x, y, so an empty series is skipped
    # instead of raising ValueError.
    if new:
        x, y = zip(*new)
        ax.plot(x, y, "b,")
    if old:
        x, y = zip(*old)
        ax.plot(x, y, "r,")

    # Legends
    ymid = (ystart + yend) / 2
    y = ymid + pad
    root.plot([0.2], [y], "r.", lw=2)
    root.text(0.2 + pad, y, "Existing Medtr ids", va="center", size=10)
    y = ymid - pad
    root.plot([0.2], [y], "b.", lw=2)
    root.text(0.2 + pad, y, "Newly instantiated ids", va="center", size=10)

    ax.set_xlim(0, ngenes)
    ax.set_ylim(0, ymax)
    ax.set_axis_off()

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = seqid + ".identifiers." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #52
0
def annotate(args):
    r"""
    %prog annotate new.bed old.bed 2> log

    Annotate the `new.bed` with features from `old.bed` for the purpose of
    gene numbering.

    Ambiguity in ID assignment can be resolved by either of the following 2 methods:
    - `alignment`: make use of global sequence alignment score (calculated by `needle`)
    - `overlap`: make use of overlap length (calculated by `intersectBed`)

    Transfer over as many identifiers as possible while following guidelines:
    http://www.arabidopsis.org/portals/nomenclature/guidelines.jsp#editing

    Note: Following RegExp pattern describes the structure of the identifier
    assigned to features in the `new.bed` file.

    new_id_pat = re.compile(r"^\d+\.[cemtx]+\S+")

    Examples: 23231.m312389, 23231.t004898, 23231.tRNA.144
    Adjust the value of `new_id_pat` manually as per your ID naming conventions.
    """
    # NOTE: the docstring is a raw string because it contains the backslash
    # sequences \d, \. and \S, which are invalid escapes in a normal literal.
    # Its runtime value (used as the usage text) is unchanged.
    from jcvi.utils.grouper import Grouper

    valid_resolve_choices = ["alignment", "overlap"]

    p = OptionParser(annotate.__doc__)
    p.add_option(
        "--resolve",
        default="alignment",
        choices=valid_resolve_choices,
        help="Resolve ID assignment based on a certain metric",
    )
    p.add_option(
        "--atg_name",
        default=False,
        action="store_true",
        help="Specify is locus IDs in `new.bed` file follow ATG nomenclature",
    )

    g1 = OptionGroup(
        p,
        "Optional parameters (alignment):\n" +
        "Use if resolving ambiguities based on sequence `alignment`",
    )
    g1.add_option(
        "--pid",
        dest="pid",
        default=35.0,
        type="float",
        help="Percent identity cutoff",
    )
    g1.add_option(
        "--score",
        dest="score",
        default=250.0,
        type="float",
        help="Alignment score cutoff",
    )
    p.add_option_group(g1)

    g2 = OptionGroup(
        p,
        "Optional parameters (overlap):\n" +
        "Use if resolving ambiguities based on `overlap` length\n" +
        "Parameters equivalent to `intersectBed`",
    )
    g2.add_option(
        "-f",
        dest="f",
        default=0.5,
        type="float",
        help="Minimum overlap fraction (0.0 - 1.0)",
    )
    g2.add_option(
        "-r",
        dest="r",
        default=False,
        action="store_true",
        help="Require fraction overlap to be reciprocal",
    )
    g2.add_option(
        "-s",
        dest="s",
        default=True,
        action="store_true",
        help="Require same strandedness",
    )
    p.add_option_group(g2)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    nbedfile, obedfile = args
    npf = nbedfile.rsplit(".", 1)[0]

    # Make consolidated.bed (reused if it is already present)
    cbedfile = "consolidated.bed"
    if not os.path.isfile(cbedfile):
        consolidate(nbedfile, obedfile, cbedfile)
    else:
        logging.warning("`{0}` already exists. Skipping step".format(cbedfile))

    logging.warning("Resolving ID assignment ambiguity based on `{0}`".format(
        opts.resolve))

    if opts.resolve == "alignment":
        # Get pairs and prompt to run needle
        pairsfile = "nw.pairs"
        scoresfile = "nw.scores"
        if not os.path.isfile(pairsfile):
            get_pairs(cbedfile, pairsfile)
        else:
            logging.warning(
                "`{0}` already exists. Checking for needle output".format(
                    pairsfile))

        # If needle scores do not exist, prompt user to run needle. This is
        # reported as an error, so the exit status is non-zero to let calling
        # scripts notice that no annotated bed was produced.
        if not os.path.isfile(scoresfile):
            logging.error(
                "`{0}` does not exist. Please process {1} using `needle`".
                format(scoresfile, pairsfile))
            sys.exit(1)
    else:
        scoresfile = "ovl.scores"
        # Calculate overlap length using intersectBed
        calculate_ovl(nbedfile, obedfile, opts, scoresfile)

    logging.warning(
        "`{0}' exists. Storing scores in memory".format(scoresfile))
    scores = read_scores(scoresfile, opts)

    # Iterate through consolidated bed and
    # filter piles based on score
    abedline = {}

    # Accessions joined by ";" in the consolidated bed form one pile
    cbed = Bed(cbedfile)
    g = Grouper()
    for c in cbed:
        accn = c.accn
        g.join(*accn.split(";"))

    # Lookup from accession to its line in the new bed
    nbedline = {}
    nbed = Bed(nbedfile)
    for line in nbed:
        nbedline[line.accn] = line

    splits = set()
    for chr, chrbed in nbed.sub_beds():
        abedline, splits = annotate_chr(chr, chrbed, g, scores, nbedline,
                                        abedline, opts, splits)

    if splits is not None:
        abedline = process_splits(splits, scores, nbedline, abedline)

    # The context manager ensures the annotated bed is flushed and closed
    # before it is sorted in place below, even if writing fails midway.
    abedfile = npf + ".annotated.bed"
    with open(abedfile, "w") as afh:
        for accn in abedline:
            print(abedline[accn], file=afh)

    sort([abedfile, "-i"])
Example #53
0
def reindex(args):
    """
    %prog reindex gffile pep.fasta ref.pep.fasta

    Reindex the splice isoforms (mRNA) in input GFF file, preferably
    generated after PASA annotation update

    In the input GFF file, there can be several types of mRNA within a locus:
    * CDS matches reference, UTR extended, inherits reference mRNA ID
    * CDS (slightly) different from reference, inherits reference mRNA ID
    * Novel isoform added by PASA, have IDs like "LOCUS.1.1", "LOCUS.1.2"
    * Multiple mRNA collapsed due to shared structure, have IDs like "LOCUS.1-LOCUS.1.1"

    In the case of multiple mRNA which have inherited the same reference mRNA ID,
    break ties by comparing the new protein with the reference protein using
    EMBOSS `needle` to decide which mRNA retains ID and which is assigned a new ID.

    All mRNA identifiers should follow the AGI naming conventions.

    When reindexing the isoform identifiers, order mRNA based on:
    * decreasing transcript length
    * decreasing support from multiple input datasets used to run pasa.consolidate()
    """
    # NOTE: the docstring above is the usage text shown by OptionParser and is
    # therefore kept verbatim.
    import os

    from jcvi.formats.gff import make_index
    from jcvi.formats.fasta import Fasta
    from jcvi.apps.emboss import needle
    from jcvi.formats.base import FileShredder
    from tempfile import mkstemp

    p = OptionParser(reindex.__doc__)
    p.add_option("--scores",
                 type="str",
                 help="read from existing EMBOSS `needle` scores file")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    gffile, pep, refpep, = args
    gffdb = make_index(gffile)
    reffasta = Fasta(refpep)

    if not opts.scores:
        # mkstemp is used only to reserve a unique file name. It also returns
        # an open OS-level descriptor, which is closed right away because the
        # file is reopened by name below (it was previously leaked).
        fh, pairsfile = mkstemp(prefix="pairs", suffix=".txt", dir=".")
        os.close(fh)
        fw = must_open(pairsfile, "w")

    # conflict[geneid][iso]: mRNAs that all carry reference isoform id `iso`
    # novel[geneid]: mRNAs that will be given a fresh isoform number
    # Entries are (mRNA id, iso, newiso, start, length, len(prefix)) tuples.
    conflict, novel = AutoVivification(), {}
    for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")):
        geneid = atg_name(gene.id, retval="locus")
        novel[geneid] = []
        updated_mrna, hybrid_mrna = [], []
        for mrna in gffdb.children(gene,
                                   featuretype="mRNA",
                                   order_by=("seqid", "start")):
            if re.match(atg_name_pat,
                        mrna.id) is not None and "_" not in mrna.id:
                pf, mrnaid = parse_prefix(mrna.id)
                mlen = gffdb.children_bp(mrna, child_featuretype="exon")
                # Ids containing "-" are the collapsed (hybrid) mRNAs
                if "-" in mrna.id:
                    hybrid_mrna.append((mrna.id, mrna.start, mlen, len(pf)))
                else:
                    updated_mrna.append((mrna.id, mrna.start, mlen, len(pf)))

        # Order: ascending start, then longer transcripts and longer prefixes
        # first
        for mrna in sorted(updated_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            iso = atg_name(mrnaid, retval="iso")
            newiso = "{0}{1}".format(iso, re.sub(atg_name_pat, "", mrnaid))
            # No residue beyond the ATG-style name: the id is a plain
            # reference isoform id
            if iso == newiso:
                if iso not in conflict[geneid]:
                    conflict[geneid][iso] = []
                conflict[geneid][iso].append(
                    (mrna[0], iso, newiso, mstart, mlen, len(pf)))
            else:
                novel[geneid].append(
                    (mrna[0], None, newiso, mstart, mlen, len(pf)))

        for mrna in sorted(hybrid_mrna, key=lambda k: (k[1], -k[2], -k[3])):
            pf, mrnaid = parse_prefix(mrna[0])
            mstart, mlen = mrna[1], mrna[2]

            _iso, _newiso = [], []
            for id in sorted(mrnaid.split("-")):
                a = atg_name(id, retval="iso")
                b = "{0}{1}".format(a, re.sub(atg_name_pat, "", id))
                _iso.append(a)
                _newiso.append(b)

            # A hybrid claims the first of its reference isoform ids that is
            # still unclaimed; otherwise it is treated as novel.
            _novel = None
            newiso = "-".join(str(x) for x in set(_newiso))
            for iso, niso in zip(_iso, _newiso):
                if iso == niso:
                    if iso not in conflict[geneid]:
                        conflict[geneid][iso] = [(mrna[0], iso, newiso, mstart,
                                                  mlen, len(pf))]
                        _novel = None
                        break

                _novel = True

            if _novel is not None:
                novel[geneid].append(
                    (mrna[0], None, newiso, mstart, mlen, len(pf)))

        if not opts.scores:
            # Emit (reference mRNA, candidate mRNA) pairs for needle
            for isoform in sorted(conflict[geneid]):
                mrnaid = "{0}.{1}".format(geneid, isoform)
                if mrnaid in reffasta.keys():
                    for mrna in conflict[geneid][isoform]:
                        print("\t".join(str(x) for x in (mrnaid, mrna[0])),
                              file=fw)

    scoresfile = None
    if not opts.scores:
        fw.close()
        needle([pairsfile, refpep, pep])
        FileShredder([pairsfile], verbose=False)
        # NOTE(review): rsplit(".") without a maxsplit cuts at the FIRST dot
        # of the path; left as-is because it presumably has to match the name
        # `needle` gives its output -- confirm against jcvi.apps.emboss.
        scoresfile = "{0}.scores".format(pairsfile.rsplit(".")[0])
    else:
        scoresfile = opts.scores

    scores = read_scores(scoresfile, sort=True, trimsuffix=False)

    # For each contested isoform id, the candidate named by the top score
    # keeps the id (primary); the remaining candidates become novel.
    primary = {}
    for geneid in conflict:
        primary[geneid] = []
        for iso in sorted(conflict[geneid]):
            conflict[geneid][iso].sort(key=lambda k: (k[3], -k[4], -k[5]))
            _iso = "{0}.{1}".format(geneid, iso)
            if _iso not in scores:
                novel[geneid].extend(conflict[geneid][iso])
                continue
            top_score = scores[_iso][0][1]
            result = next(
                (i for i, v in enumerate(conflict[geneid][iso])
                 if v[0] == top_score),
                None,
            )
            if result is not None:
                primary[geneid].append(conflict[geneid][iso][result])
                del conflict[geneid][iso][result]
                if geneid not in novel:
                    novel[geneid] = []
                novel[geneid].extend(conflict[geneid][iso])
        novel[geneid].sort(key=lambda k: (k[3], -k[4], -k[5]))

    fw = must_open(opts.outfile, "w")
    for gene in gffdb.features_of_type("gene", order_by=("seqid", "start")):
        geneid = gene.id
        print(gene, file=fw)
        seen = []
        if geneid in primary:
            all_mrna = primary[geneid]
            all_mrna.extend(novel[geneid])
            for iso, mrna in enumerate(all_mrna):
                _mrna = gffdb[mrna[0]]
                _iso = mrna[1]
                if mrna not in novel[geneid]:
                    # Primary mRNA keeps its isoform number
                    seen.append(int(mrna[1]))
                else:
                    # Novel mRNA is numbered after the largest retained
                    # isoform number
                    mseen = 0 if len(seen) == 0 else max(seen)
                    _iso = (mseen + iso + 1) - len(seen)

                _mrnaid = "{0}.{1}".format(geneid, _iso)
                _mrna["ID"], _mrna["_old_ID"] = [_mrnaid], [_mrna.id]

                print(_mrna, file=fw)
                for c in gffdb.children(_mrna, order_by=("start")):
                    c["Parent"] = [_mrnaid]
                    print(c, file=fw)
        else:
            # Genes without any primary mRNA are written out unchanged
            for feat in gffdb.children(gene, order_by=("seqid", "start")):
                print(feat, file=fw)

    fw.close()
Example #54
0
def instantiate(args):
    """
    %prog instantiate tagged.bed blacklist.ids big_gaps.bed

    instantiate NEW genes tagged by renumber.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is intentionally left untouched.
    p = OptionParser(instantiate.__doc__)
    p.set_annot_reformat_opts()
    p.add_option(
        "--extended_stride",
        default=False,
        action="store_true",
        help="Toggle extended strides for gene numbering",
    )
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    taggedbed, blacklist, gapsbed = args
    r = NameRegister(prefix=opts.prefix, pad0=opts.pad0, uc=opts.uc)
    r.get_blacklist(blacklist)
    r.get_gaps(gapsbed)

    # Run through the bed, identify stretch of NEW ids to instantiate,
    # identify the flanking FRAMEs, interpolate!
    bed = Bed(taggedbed)
    outputbed = taggedbed.rsplit(".", 1)[0] + ".new.bed"

    def tagkey(nametag):
        # The tag is whatever follows the last "|" of a "name|tag" string.
        return nametag.rsplit("|", 1)[-1]

    # The context manager guarantees the output is flushed and closed even if
    # allocation or the sanity assertion below raises part-way through.
    with open(outputbed, "w") as fw:
        for seqid, sbed in bed.sub_beds():
            if not chr_number(seqid):
                continue

            sbed = list(sbed)

            # (index into sbed, nametag) for every NEW or FRAME feature
            ranks = [
                (i, s.extra[0])
                for i, s in enumerate(sbed)
                if tagkey(s.extra[0]) in (NEW, FRAME)
            ]

            # Collapse consecutive features sharing a tag into blocks: a NEW
            # block keeps its bed lines, any other block keeps the ranks of
            # its first and last member.
            blocks = []
            for tag, names in groupby(ranks, key=lambda x: tagkey(x[-1])):
                names = list(names)
                if tag == NEW:
                    blocks.append((tag, [sbed[x[0]] for x in names]))
                else:
                    first, last = names[0][-1], names[-1][-1]
                    blocks.append(
                        (
                            tag,
                            [
                                atg_name(first, retval="rank"),
                                atg_name(last, retval="rank"),
                            ],
                        )
                    )

            id_table = {}  # old to new name conversion
            for i, (tag, info) in enumerate(blocks):
                if tag != NEW:
                    continue

                # Bound the NEW block by its neighbours: 0 when it opens the
                # sequence, start + 10000 when it closes it.
                start_id = 0 if i == 0 else blocks[i - 1][1][-1]
                if i == len(blocks) - 1:
                    end_id = start_id + 10000
                else:
                    end_id = blocks[i + 1][1][0]

                r.allocate(
                    info,
                    seqid,
                    start_id,
                    end_id,
                    id_table,
                    extended_stride=opts.extended_stride,
                )

            # Output new names
            for s in sbed:
                # Split on the last "|" only, consistent with tagkey().
                name, tag = s.extra[0].rsplit("|", 1)

                if tag == NEW:
                    assert name == "."
                    name = id_table[s.accn]
                elif tag == OVERLAP:
                    name = id_table.get(name, name)

                s.extra[0] = "|".join((name, tag))
                print(s, file=fw)
Example #55
0
File: gaps.py Project: zjwang6/jcvi
def sizes(args):
    """
    %prog sizes gaps.bed a.fasta b.fasta

    Take the flanks of gaps within a.fasta, map them onto b.fasta. Compile the
    results to the gap size estimates in b. The output is detailed below:

    Columns are:
    1.  A scaffold
    2.  Start position
    3.  End position
    4.  Gap identifier
    5.  Gap size in A (= End - Start)
    6.  Gap size in B (based on BLAST, see below)

    For each gap, I extracted the left and right sequence (mostly 2Kb, but can be shorter
    if it runs into another gap) flanking the gap. The flanker names look like gap.00003L
    and gap.00003R means the left and right flanker of this particular gap, respectively.

    The BLAST output is used to calculate the gap size. For each flanker sequence, I took
    the best hit, and calculate the inner distance between the L match range and R range.
    The two flankers must map with at least 98% identity, and in the same orientation.

    NOTE the sixth column in the list file is not always a valid number. Other values are:
    -   na: both flankers are missing in B
    -   Singleton: one flanker is missing
    -   Different chr: flankers map to different scaffolds
    -   Strand +|-: flankers map in different orientations
    -   Negative value: the R flanker map before L flanker
    """
    from jcvi.formats.base import DictFile
    from jcvi.apps.align import blast

    # The docstring is reused verbatim as the command-line usage text.
    parser = OptionParser(sizes.__doc__)
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    gapsbed, afasta, bfasta = args
    prefix = gapsbed.rsplit(".", 1)[0]
    extfasta = prefix + ".ext.fasta"

    # Regenerate the flanker sequences only when need_update() says so.
    if need_update(gapsbed, extfasta):
        _, extfasta = flanks([gapsbed, afasta])

    # The BLAST output is named after the first dot-separated token of each
    # FASTA basename.
    query_tag = op.basename(extfasta).split(".")[0]
    ref_tag = op.basename(bfasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(query_tag, ref_tag)

    if need_update([extfasta, bfasta], blastfile):
        blastfile = blast([bfasta, extfasta, "--wordsize=50", "--pctid=98"])

    labels = DictFile(blast_to_twobeds(blastfile), delimiter="\t")

    # One tab-separated line per gap; gaps without a label are reported "na".
    for gap in Bed(gapsbed):
        gap.score = gap.span
        gap_id = gap.accn
        columns = (
            gap.seqid,
            gap.start - 1,
            gap.end,
            gap_id,
            gap.score,
            labels.get(gap_id, "na"),
        )
        print("\t".join(str(col) for col in columns))
Example #56
0
def tRNAscan(args):
    """
    %prog tRNAscan all.trna > all.trna.gff3

    Convert tRNAscan-SE output into gff3 format.

    Sequence                tRNA            Bounds          tRNA    Anti    Intron Bounds   Cove
    Name            tRNA #  Begin           End             Type    Codon   Begin   End     Score
    --------        ------  ----            ------          ----    -----   -----   ----    ------
    23231           1       335355          335440          Tyr     GTA     335392  335404  69.21
    23231           2       1076190         1076270         Leu     AAG     0       0       66.33

    Conversion based on PERL one-liner in:
    <https://github.com/sujaikumar/assemblage/blob/master/README-annotation.md>
    """
    from jcvi.formats.gff import sort

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(tRNAscan.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (trnaout,) = args
    gffout = trnaout + ".gff3"

    source = "tRNAscan"
    feature_type = "tRNA"

    # Both handles are managed by `with`, so they are closed even when a
    # malformed row raises, and the output is fully flushed before sort() runs.
    with open(trnaout) as fp, open(gffout, "w") as fw:
        # Skip the three header lines; the third must be the dashed ruler.
        next(fp)
        next(fp)
        row = next(fp)
        assert row.startswith("--------")

        for row in fp:
            if not row.strip():
                # Tolerate blank lines instead of failing the 9-way unpack.
                continue

            atoms = [x.strip() for x in row.split("\t")]
            (
                contig,
                trnanum,
                start,
                end,
                aa,
                codon,
                intron_start,
                intron_end,
                score,
            ) = atoms

            # A begin coordinate larger than the end is written as a
            # minus-strand feature with the coordinates swapped.
            start, end = int(start), int(end)
            orientation = "+"
            if start > end:
                start, end = end, start
                orientation = "-"

            if codon == "???":
                codon = "XXX"

            comment = "ID={0}.tRNA.{1};Name=tRNA-{2} (anticodon: {3})".format(
                contig, trnanum, aa, codon
            )

            fields = (
                contig,
                source,
                feature_type,
                start,
                end,
                score,
                orientation,
                ".",
                comment,
            )
            print("\t".join(str(x) for x in fields), file=fw)

    sort([gffout, "-i"])
Example #57
0
def prepare(args):
    """
    %prog prepare mcscanfile cdsfile [options]

    Pick sequences from cdsfile to form fasta files, according to multiple
    alignment in the mcscanfile.
    The fasta sequences can then be used to construct phylogenetic tree.

    Use --addtandem=tandemfile to collapse tandems of anchors into single row.
    The tandemfile must be provided with *ALL* genomes involved, otherwise
    result will be incomplete and redundant.
    """
    from jcvi.graphics.base import discrete_rainbow

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(prepare.__doc__)
    p.add_option("--addtandem", help="path to tandemfile")
    p.add_option(
        "--writecolors",
        default=False,
        action="store_true",
        help="generate a gene_name to color mapping file which will be taken "
        "by jcvi.apps.phylo.draw",
    )
    p.set_outdir(outdir="sequences")

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    mcscanfile, cdsfile = args

    if opts.addtandem:
        mcscanfile = add_tandems(mcscanfile, opts.addtandem)

    seqdir = opts.outdir
    mkdir(seqdir)
    f = Fasta(cdsfile)
    fp = must_open(mcscanfile)
    # The color mapping is optional. `fc` stays None when it was not requested
    # so the writes below can be skipped instead of hitting an undefined name.
    fc = must_open("leafcolors.txt", "w") if opts.writecolors else None

    n = 0  # number of groups written out
    for i, row in enumerate(fp):
        row = row.strip().split("\t")
        if i == 0:
            # The first row fixes the column count and the per-column colors.
            ncols = len(row)
            if ncols <= 20:
                colors = discrete_rainbow(ncols, shuffle=False)[1]
            else:
                colors = discrete_rainbow(
                    ncols, usepreset=False, shuffle=False
                )[1]
                warnings.warn(
                    "*** WARNING ***\n"
                    "Too many columns. Colors may not be all distinctive."
                )

        assert len(row) == ncols, "All rows should have same number of fields."

        anchors = set()
        for j, atom in enumerate(row):
            color = "%s,%s,%s" % colors[j]
            if atom == ".":
                continue
            # A cell holds either one gene or a comma-separated list of genes;
            # split(",") covers both cases.
            for a in atom.split(","):
                if fc is not None:
                    fc.write("{0}\t{1}\n".format(a, color))
                anchors.add(a)

        if len(anchors) <= 3:
            print(
                "Not enough seqs to build trees for {0}".format(anchors),
                file=sys.stderr,
            )
            continue

        pivot = row[0]
        fw = must_open("%s/%s.cds" % (seqdir, pivot), "w")
        try:
            for a in anchors:
                if a not in f:
                    # Name not found verbatim: report it and fall back to
                    # whatever find_first_isoform() resolves it to.
                    print(a)
                    a = find_first_isoform(a, f)
                    assert a, a
                arec = f[a]
                SeqIO.write((arec), fw, "fasta")
        finally:
            fw.close()
        n += 1

    if fc is not None:
        fc.close()
        logging.debug("leaf colors written to `{0}`".format(fc.name))

    logging.debug("cds of {0} syntelog groups written to {1}/".format(
        n, seqdir))

    return seqdir
Example #58
0
File: gaps.py Project: zjwang6/jcvi
def annotate(args):
    """
    %prog annotate agpfile gaps.linkage.bed assembly.fasta

    Annotate AGP file with linkage info of `paired-end` or `map`.
    File `gaps.linkage.bed` is generated by assembly.gaps.estimate().
    """
    from jcvi.formats.agp import AGP, bed, tidy

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(annotate.__doc__)
    # type="int" makes a user-supplied --minsize comparable with the integer
    # component span below; without it the value arrives as a string.
    p.add_option("--minsize",
                 default=200,
                 type="int",
                 help="Smallest component size [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, linkagebed, assemblyfasta = args
    linkagebed = Bed(linkagebed)
    # Gaps whose linkage bed score is 0, keyed by (name, start, end)
    spannedgaps = set()
    for b in linkagebed:
        if int(b.score) == 0:
            spannedgaps.add((b.accn, b.start, b.end))

    agp = AGP(agpfile)
    newagpfile = agpfile.rsplit(".", 1)[0] + ".linkage.agp"
    contig_id = 0
    minsize = opts.minsize
    # `with` closes (and flushes) the new AGP even if a record fails, and
    # always before the downstream tools read it.
    with open(newagpfile, "w") as newagp:
        for a in agp:
            if not a.is_gap:
                cs = a.component_span
                if cs < minsize:
                    # Components below the size cutoff are rewritten as gaps.
                    a.is_gap = True
                    a.component_type = "N"
                    a.gap_length = cs
                    a.gap_type = "scaffold"
                    a.linkage = "yes"
                    a.linkage_evidence = []
                else:
                    # Surviving components are renumbered sequentially.
                    contig_id += 1
                    a.component_id = "contig{0:04d}".format(contig_id)
                    a.component_beg = 1
                    a.component_end = cs
                    a.component_type = "W"

                print(a, file=newagp)
                continue

            gapinfo = (a.object, a.object_beg, a.object_end)
            gaplen = a.gap_length

            # A 100-length gap that is not in spannedgaps is marked "U" with
            # `map` evidence; every other gap is tagged `paired-ends`.
            if gaplen == 100 and gapinfo not in spannedgaps:
                a.component_type = "U"
                tag = "map"
            else:
                tag = "paired-ends"

            a.linkage_evidence.append(tag)
            print(a, file=newagp)

    logging.debug("Annotated AGP written to `{0}`.".format(newagpfile))

    contigbed = assemblyfasta.rsplit(".", 1)[0] + ".contigs.bed"
    bedfile = bed([newagpfile, "--nogaps", "--outfile=" + contigbed])

    contigfasta = fastaFromBed(bedfile,
                               assemblyfasta,
                               name=True,
                               stranded=True)

    tidy([newagpfile, contigfasta])
Example #59
0
def draw(args):
    """
    %prog draw --input newicktrees [options]

    Draw phylogenetic trees into single or combined plots.
    Input trees should be one of the following:
    1.  single Newick format tree file
    2.  a dir containing *ONLY* the tree files to be drawn

    Newick format:
    http://evolution.genetics.washington.edu/phylip/newicktree.html

    This function wraps on jcvi.graphics.tree
    This function is better used for trees generated by jcvi.apps.phylo (rooted
    if possible). For drawing general Newick trees from external sources invoke
    jcvi.graphics.tree directly, which also gives more drawing options.
    """
    # The docstring above doubles as the command-line usage text.
    trunc_name_options = ["headn", "oheadn", "tailn", "otailn"]
    p = OptionParser(draw.__doc__)
    p.add_option(
        "--input",
        help="path to single input tree file or a dir "
        "containing ONLY the input tree files",
    )
    p.add_option(
        "--combine",
        type="string",
        default="1x1",
        help="combine multiple trees into one plot in nrowxncol",
    )
    p.add_option(
        "--trunc_name",
        default=None,
        help="Options are: {0}. "
        "truncate first n chars, retains only first n chars, "
        "truncate last n chars, retain only last chars. "
        "n=1~99.".format(trunc_name_options),
    )
    p.add_option(
        "--SH",
        default=None,
        help="path to a file containing SH test p-values in format:"
        "tree_file_name<tab>p-values "
        "This file can be generated with jcvi.apps.phylo build",
    )
    p.add_option(
        "--scutoff",
        default=50,
        type="int",
        help="cutoff for displaying node support, 0-100",
    )
    p.add_option(
        "--barcode",
        default=None,
        help="path to seq/taxon name barcode mapping file: "
        "barcode<tab>new_name "
        "This option is downstream of `--trunc_name`",
    )
    p.add_option(
        "--leafcolorfile",
        default=None,
        help="path to a mapping file containing font colors "
        "for the OTUs: leafname<tab>color",
    )
    p.set_outdir()
    opts, args, iopts = p.set_image_options(figsize="8x6")
    tree_input = opts.input
    outdir = opts.outdir
    combine = opts.combine.split("x")
    trunc_name = opts.trunc_name
    SH = opts.SH

    mkdir(outdir)
    # Path of the temporary concatenation, set only when this call creates it,
    # so that the cleanup below never targets a file that was not written here.
    combined_file = None
    if not tree_input:
        sys.exit(not p.print_help())
    elif op.isfile(tree_input):
        trees_file = tree_input
        treenames = [op.basename(tree_input)]
    elif op.isdir(tree_input):
        trees_file = combined_file = op.join(outdir, "alltrees.dnd")
        treenames = []
        # Concatenate every file of the directory, in sorted order, so that
        # the n-th tree read back corresponds to the n-th file name.
        for f in sorted(os.listdir(tree_input)):
            sh("cat {0}/{1} >> {2}".format(tree_input, f, trees_file),
               log=False)
            treenames.append(f)
    else:
        sys.exit(not p.print_help())

    trees = OrderedDict()
    tree = ""  # text of the tree currently being assembled
    i = 0  # index of the next tree name to assign
    for row in LineFile(trees_file, comment="#", load=True).lines:
        if i == len(treenames):
            break
        if not len(row):
            continue

        # Every ";" terminates a tree. A row may hold several complete trees
        # and/or a fragment of one that continues on the following row.
        pieces = row.split(";")
        for piece in pieces[:-1]:
            if i == len(treenames):
                # More trees on this row than names available: stop instead
                # of indexing past the end of treenames.
                break
            trees[treenames[i]] = tree + piece + ";"
            tree = ""
            i += 1
        # Whatever follows the last ";" (possibly nothing) is carried over.
        tree += pieces[-1]

    logging.debug("A total of {0} trees imported.".format(len(trees)))
    if combined_file:
        sh("rm {0}".format(combined_file))

    _draw_trees(
        trees,
        nrow=int(combine[0]),
        ncol=int(combine[1]),
        rmargin=0.3,
        iopts=iopts,
        outdir=outdir,
        shfile=SH,
        trunc_name=trunc_name,
        scutoff=opts.scutoff,
        barcodefile=opts.barcode,
        leafcolorfile=opts.leafcolorfile,
    )
Example #60
0
def build(args):
    """
    %prog build [prot.fasta] cds.fasta [options] --outdir=outdir

    This function wraps on the following steps:
    1. msa using ClustalW2 or MUSCLE(default)
    2. (optional) alignment editing using Gblocks
    3. build NJ tree using PHYLIP in EMBOSS package
       seq names should be unique by first 10 chars (restriction of PHYLIP)
    4. build ML tree using RAxML(default) or PHYML, use keywords raxml or phyml,
       *WARNING* maybe slow with large dataset

    If an outgroup file is provided, the result tree will be rooted on the
    outgroup according to order in the file, i.e. the name in row1 will be
    tried first. If not found, row2 will be used, etc.
    Tail truncated names can be provided so long as it is unique among the seqs.
    If not uniq, the first occurrence will be used. For example, if you have
    two moss sequences in your input, then the tree will be rooted on the
    first moss sequence encountered by the program, unless they are monophylic,
     in which case the root will be their common ancestor.

    --stree and --smap are required if --treefix is set.

    Trees can be edited again using an editor such as Dendroscope. This
    is the recommended way to get highly customized trees.

    Newick format trees will be deposited into outdir (. by default).
    """
    from jcvi.formats.fasta import translate

    # The docstring above doubles as the command-line usage text.
    p = OptionParser(build.__doc__)
    p.add_option(
        "--longest",
        action="store_true",
        help="Get longest ORF, only works if no pep file, e.g. ESTs",
    )
    p.add_option(
        "--nogblocks",
        action="store_true",
        help="don't use Gblocks to edit alignment",
    )
    p.add_option(
        "--synonymous",
        action="store_true",
        help="extract synonymous sites of the alignment",
    )
    p.add_option(
        "--fourfold",
        action="store_true",
        help="extract fourfold degenerate sites of the alignment",
    )
    p.add_option(
        "--msa",
        default="muscle",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins",
    )
    p.add_option(
        "--noneighbor",
        action="store_true",
        help="don't build NJ tree",
    )
    p.add_option(
        "--ml",
        default=None,
        choices=("raxml", "phyml"),
        help="software used to build ML tree",
    )
    p.add_option("--outgroup", help="path to file containing outgroup orders")
    p.add_option("--SH", help="path to reference Newick tree")
    p.add_option("--shout", default="SH_out.txt", help="SH output file name")
    p.add_option(
        "--treefix",
        action="store_true",
        help="use TreeFix to rearrange ML tree",
    )
    p.add_option("--stree", help="path to species Newick tree")
    p.add_option(
        "--smap",
        help="path to smap file: gene_name_pattern<tab>species_name",
    )
    p.set_outdir()

    opts, args = p.parse_args(args)
    gblocks = not opts.nogblocks
    synonymous = opts.synonymous
    fourfold = opts.fourfold
    neighbor = not opts.noneighbor
    outgroup = opts.outgroup
    outdir = opts.outdir

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    if opts.treefix:
        stree = opts.stree
        smap = opts.smap
        assert stree and smap, "TreeFix requires stree and smap files."
        # TreeFix runs on the ML tree, so force the RAxML build.
        opts.ml = "raxml"

    treedir = op.join(outdir, "tree")
    mkdir(treedir)

    if not protein_file:
        # No protein file given: derive one by translating the CDS.
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    work_dir = op.join(outdir, "alignment")
    mkdir(work_dir)
    # Read through context managers so the FASTA handles are closed promptly.
    with open(protein_file) as handle:
        p_recs = list(SeqIO.parse(handle, "fasta"))
    if opts.msa == "clustalw":
        align_fasta = clustal_align_protein(p_recs, work_dir)
    elif opts.msa == "muscle":
        align_fasta = muscle_align_protein(p_recs, work_dir)

    with open(dna_file) as handle:
        n_recs = list(SeqIO.parse(handle, "fasta"))
    mrtrans_fasta = run_mrtrans(align_fasta, n_recs, work_dir, outfmt="fasta")

    if not mrtrans_fasta:
        logging.debug("pal2nal aborted. Cannot reliably build tree for %s",
                      dna_file)
        return

    codon_aln_fasta = mrtrans_fasta
    if gblocks:
        gb_fasta = run_gblocks(mrtrans_fasta)
        codon_aln_fasta = gb_fasta if gb_fasta else codon_aln_fasta

    else:
        # Site subsetting is only applied when Gblocks is switched off;
        # --fourfold takes precedence when both flags are given.
        if synonymous:
            codon_aln_fasta = subalignment(mrtrans_fasta, "synonymous")

        if fourfold:
            codon_aln_fasta = subalignment(mrtrans_fasta, "fourfold")

    if not neighbor and not opts.ml:
        return codon_aln_fasta

    alignment = AlignIO.read(codon_aln_fasta, "fasta")
    if len(alignment) <= 3:
        raise ValueError("Too few seqs to build tree.")

    mkdir(op.join(treedir, "work"))
    stem = op.basename(dna_file).rsplit(".", 1)[0]

    # Path of the most recently built tree. It stays None until a builder
    # succeeds, so a failed build can no longer surface as an unbound-name
    # error further down.
    outfile = None

    if neighbor:
        out_file = op.join(treedir, stem + ".NJ.unrooted.dnd")
        try:
            outfile, phy_file = build_nj_phylip(alignment,
                                                outfile=out_file,
                                                outgroup=outgroup,
                                                work_dir=treedir)
        except Exception:
            # Best effort: report and carry on (an ML tree may still be
            # built). KeyboardInterrupt/SystemExit are deliberately not
            # caught here.
            print("NJ tree cannot be built for {0}".format(dna_file))
        else:
            # The SH test needs the tree that was just built.
            if opts.SH:
                SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

    if opts.ml:
        out_file = op.join(treedir, stem + ".ML.unrooted.dnd")

        # --ml is restricted by the parser to "phyml" or "raxml".
        ml_builder = build_ml_phyml if opts.ml == "phyml" else build_ml_raxml
        try:
            outfile, phy_file = ml_builder(alignment,
                                           outfile=out_file,
                                           work_dir=treedir)
        except Exception:
            print("ML tree cannot be built for {0}".format(dna_file))
            # Re-rooting, the SH test and TreeFix all need the ML tree, so
            # stop here and hand back whatever was built before (the NJ tree
            # or None).
            return outfile

        if outgroup:
            new_out_file = out_file.replace(".unrooted", "")
            t = smart_reroot(treefile=out_file,
                             outgroupfile=outgroup,
                             outfile=new_out_file)
            # Drop the unrooted tree only once the rooted one was produced.
            if t == new_out_file:
                sh("rm %s" % out_file)
                outfile = new_out_file

        if opts.SH:
            SH_raxml(opts.SH, outfile, phy_file, shout=opts.shout)

        if opts.treefix:
            treefix_dir = op.join(treedir, "treefix")
            assert mkdir(treefix_dir, overwrite=True)

            # TreeFix is given a copy of the tree plus an alignment file
            # sharing its basename with a ".fasta" extension.
            sh("cp {0} {1}/".format(outfile, treefix_dir))
            treefix_input = op.join(treefix_dir, op.basename(outfile))
            aln_file = treefix_input.rsplit(".", 1)[0] + ".fasta"
            SeqIO.write(alignment, aln_file, "fasta")

            outfile = run_treefix(
                input=treefix_input,
                stree_file=stree,
                smap_file=smap,
                a_ext=".fasta",
                o_ext=".dnd",
                n_ext=".treefix.dnd",
            )

    return outfile