Example #1
def genestatus(args):
    """
    %prog genestatus diploid.gff3.exon.ids

    Tag genes based on translation from GMAP models, using fasta.translate()
    --ids.
    """
    p = OptionParser(genestatus.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    idsfile, = args
    data = get_tags(idsfile)
    key = lambda x: x[0].split(".")[0]
    for gene, cc in groupby(data, key=key):
        cc = list(cc)
        tags = [x[-1] for x in cc]
        if "complete" in tags:
            tag = "complete"
        elif "partial" in tags:
            tag = "partial"
        else:
            tag = "pseudogene"
        print("\t".join((gene, tag)))
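
Note that itertools.groupby only merges adjacent rows, so the ids file must already be sorted by gene. A toy check of the complete > partial > pseudogene priority (the ids and tags below are made up):

from itertools import groupby

data = [("g1.1", "complete"), ("g1.2", "partial"),
        ("g2.1", "partial"), ("g3.1", "other")]
for gene, cc in groupby(data, key=lambda x: x[0].split(".")[0]):
    tags = [x[-1] for x in cc]
    if "complete" in tags:
        tag = "complete"
    elif "partial" in tags:
        tag = "partial"
    else:
        tag = "pseudogene"
    print("\t".join((gene, tag)))  # g1 complete, g2 partial, g3 pseudogene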
Example #2
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from a web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page, "html.parser")

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.find_all(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
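
The urljoin() call is what turns relative hrefs into absolute URLs; a quick stdlib check (the URLs are illustrative only):

from urllib.parse import urljoin

base = "http://example.com/a/page.html"
print(urljoin(base, "../img/logo.png"))     # http://example.com/img/logo.png
print(urljoin(base, "http://other.org/x"))  # absolute links pass through unchanged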
Example #3
File: age.py Project: xuanblo/jcvi
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        js = json.load(open(filename))
        js["skin_rgb"] = make_rgb(
            js["traits"]["skin-color"]["L"],
            js["traits"]["skin-color"]["A"],
            js["traits"]["skin-color"]["B"])
        js["eye_rgb"] = make_rgb(
            js["traits"]["eye-color"]["L"],
            js["traits"]["eye-color"]["A"],
            js["traits"]["eye-color"]["B"])
        samples.append(js)

    template = Template(traits_template)
    fw = open("report.html", "w")
    print(template.render(samples=samples), file=fw)
    logging.debug("Report written to `{}`".format(fw.name))
    fw.close()
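
The Template(traits_template).render(samples=...) call suggests Jinja2-style templating; a minimal sketch under that assumption (the template string and sample dict are stand-ins, not the project's traits_template):

from jinja2 import Template

traits_template = "{% for s in samples %}{{ s['eye_rgb'] }}\n{% endfor %}"
t = Template(traits_template)
print(t.render(samples=[{"eye_rgb": "rgb(100, 80, 60)"}]))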
Example #4
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    fp = open(simplefile)
    data = [x.split() for x in fp]
    spans = []
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        aspan, bspan = a[4], b[4]
        aspan, bspan = int(aspan), int(bspan)
        spans.append((aspan, bspan))
    aspans, bspans = zip(*spans)
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    print("For a total of {0} blocks:".format(len(dspans)), file=sys.stderr)
    print("Sum of A: {0}".format(sum(aspans)), file=sys.stderr)
    print("Sum of B: {0}".format(sum(bspans)), file=sys.stderr)
    print("Sum of Delta: {0} ({1})".format(sum(dspans), s), file=sys.stderr)
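
Each block contributes one (A span, B span) pair, and Delta is simply B minus A; with toy spans:

spans = [(120, 150), (80, 60)]
aspans, bspans = zip(*spans)
dspans = [b - a for a, b in spans]
print(sum(aspans), sum(bspans), sum(dspans))  # 200 210 10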
Example #5
File: age.py Project: xuanblo/jcvi
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Example #6
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against the GenBank file, and determine
    whether or not to flip the sequence. This is useful before sequence updates
    to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
Example #7
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    fp = open(pairsfile)
    cmds = []
    mkdir("overlaps")
    for row in fp:
        a, b = row.split()[:2]
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)

    print("\n".join(cmds))
Example #8
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for x in gff_files:
            pf = op.basename(x).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            ar = [int(x.strip()) for x in open(numberfile)]
            stats = SummaryStats(ar).todict().items()
            keys, vals = zip(*stats)
            keys = [(pf, x) for x in keys]
            table.update(dict(zip(keys, vals)))

        print(tabulate(table), file=sys.stderr)
Example #9
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (i.e. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count")
    colors = ("red", "green", "blue", "black")
    vmaxes = (1000, 1000, 4000, 20)
    xlabels = ("bp", "bp", "bp", "number")
    for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels):
        logging.debug("Parsing files in `{0}`..".format(metric))
        numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt") \
                        for x in gff_files]

        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                       bins=opts.bins, facet=True, fill=color,
                       prefix=metric + ".")
Example #10
File: ca.py Project: arvin580/jcvi
def unitigs(args):
    """
    %prog unitigs best.edges

    Read Celera Assembler's "best.edges" and extract all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    for k, v in G.items():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    for h in (H.subgraph(c) for c in nx.connected_components(H)):
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print("|".join(str(x) for x in path))
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
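
A unitig here is a linear component: exactly two tips of degree 1 joined by a single simple path. A toy check of that extraction (assuming networkx 2.x):

import networkx as nx

H = nx.Graph()
H.add_edges_from([(1, 2), (2, 3), (3, 4)])  # one linear component
st = [x for x in H if H.degree(x) == 1]     # the two tips
src, target = st
path, = list(nx.all_simple_paths(H, src, target))
print("|".join(str(x) for x in path))       # 1|2|3|4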
Example #11
File: ca.py Project: arvin580/jcvi
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    p = OptionParser(tracedb.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    xmls = glob("xml*")

    if action == "xml":
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
Example #12
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    prefix = opts.prefix
    if prefix:
        reads = list(cf.iter_reps_prefix(prefix=prefix))
    else:
        reads = list(cf.iter_reps())

    nreads = len(reads)
    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for i, name in reads:
        print("\t".join(str(x) for x in (i, name)), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".\
            format(nreads, idsfile))
    fw.close()

    return idsfile
Example #13
def csv(args):
    """
    %prog csv excelfile

    Convert an Excel file to CSV.
    """
    from xlrd import open_workbook

    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    fw = open(csvfile, "w")
    for s in wb.sheets():
        print('Sheet:', s.name, file=sys.stderr)
        for row in range(s.nrows):
            values = []
            for col in range(s.ncols):
                values.append(s.cell(row, col).value)
            print(sep.join(str(x) for x in values), file=fw)
Example #14
def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is given, the histograms are combined into the
    same plot.
    """
    allowed_format = ("emf", "eps", "pdf", "png", "ps", \
                      "raw", "rgba", "svg", "svgz")
    p = OptionParser(main.__doc__)
    p.add_option("--skip", default=0, type="int",
            help="skip the first several lines [default: %default]")
    p.set_histogram()
    p.add_option("--tags", dest="tags", default=None,
            help="tags for data if multiple input files, comma sep")
    p.add_option("--ascii", default=False, action="store_true",
            help="print ASCII text stem-leaf plot [default: %default]")
    p.add_option("--base", default="0", choices=("0", "2", "10"),
            help="use logarithm axis with base, 0 to disable [default: %default]")
    p.add_option("--facet", default=False, action="store_true",
            help="place multiple histograms side-by-side [default: %default]")
    p.add_option("--fill", default="white",
            help="color of the bin [default: %default]")
    p.add_option("--format", default="pdf", choices=allowed_format,
            help="Generate image of format [default: %default]")
    p.add_option("--quick", default=False, action="store_true",
            help="Use quick plot, assuming bins are already counted")
    p.add_option("--noprintstats", default=False, action="store_true",
            help="Write basic stats when using --quick")
    opts, args = p.parse_args()

    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    if opts.quick:
        assert fileno == 1, "Single input file expected using --quick"
        filename = args[0]
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(data, vmin, vmax, xlabel, title, figname=figname,
                  print_stats=(not opts.noprintstats))
        return

    if fileno == 1:
        histogram(args[0], vmin, vmax, xlabel, title, outfmt=opts.format,
                bins=bins, skip=skip, ascii=opts.ascii,
                base=base, fill=opts.fill)
    else:
        histogram_multiple(args, vmin, vmax, xlabel, title, outfmt=opts.format,
                tags=opts.tags, bins=bins, skip=skip, ascii=opts.ascii,
                facet=opts.facet, fill=opts.fill)
Example #15
File: align.py Project: rrane/jcvi
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Call BLAST and then filter the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    q = op.basename(queryfasta).split(".")[0]
    r = op.basename(reffasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(q, r)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
Example #16
def passthrough(args):
    """
    %prog passthrough chrY.vcf chrY.new.vcf

    Pass through Y and MT vcf.
    """
    p = OptionParser(passthrough.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, newvcffile = args
    fp = open(vcffile)
    fw = open(newvcffile, "w")
    gg = ["0/0", "0/1", "1/1"]
    for row in fp:
        if row[0] == "#":
            print(row.strip(), file=fw)
            continue

        v = VcfLine(row)
        v.filter = "PASS"
        v.format = "GT:GP"
        probs = [0] * 3
        probs[gg.index(v.genotype)] = 1
        v.genotype = v.genotype.replace("/", "|") + \
                ":{0}".format(",".join("{0:.3f}".format(x) for x in probs))
        print(v, file=fw)
    fw.close()
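
The GP field written above is a one-hot probability vector over the three genotypes, in the order given by gg:

gg = ["0/0", "0/1", "1/1"]
probs = [0] * 3
probs[gg.index("0/1")] = 1
print(",".join("{0:.3f}".format(x) for x in probs))  # 0.000,1.000,0.000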
Example #17
def agp(args):
    """
    %prog agp <fastafile|sizesfile>

    Convert the sizes file to a trivial AGP file.
    """
    from jcvi.formats.agp import OO

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sizesfile, = args
    sizes = Sizes(sizesfile)
    agpfile = sizes.filename.rsplit(".", 1)[0] + ".agp"
    fw = open(agpfile, "w")
    o = OO()  # Without a filename
    for ctg, size in sizes.iter_sizes():
        o.add(ctg, ctg, size)

    o.write_AGP(fw)
    fw.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))

    return agpfile
Example #18
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, chr, idx = args
    idx = int(idx)
    m1 = 1000000
    bedfile = "sample.bed"
    bed = Bed()
    bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1)))
    bed.print_to_file(bedfile)

    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile)
    idsfile = "query.ids"
    sh(cmd, outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(cmd)

    cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(cmd)

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
Example #19
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    cmd += " nthreads={0} gprobs=true".format(opts.cpus)
    mm.add(vcffile, outfile, cmd)

    mm.write()
Example #20
File: tgbs.py Project: fw1121/jcvi
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
Example #21
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Multiple anchor files may be given.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print(",".join(sorted(g)), file=fw)
    fw.close()

    return outfile
Example #22
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Example #23
File: vcf.py Project: Hensonmw/jcvi
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert IMPUTE2 output to a VCF file. The imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    impute2file, fastafile, chr = args
    fasta = Fasta(fastafile)
    print(get_vcfstanza(fastafile, fasta))
    fp = open(impute2file)
    seen = set()
    for row in fp:
        snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
        pos = int(pos)
        if pos in seen:
            continue
        seen.add(pos)
        code = max((float(aa), "0/0"), (float(ab), "0/1"), (float(bb), "1/1"))[-1]
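        # max() compares the probability (the float element) first, so the
        # tuple with the highest genotype likelihood wins; for the docstring's
        # sample line (0.451, 0.547, 0.002) this picks "0/1"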
        tag = "PR" if snp_id == chr else "IM"
        print("\t".join(str(x) for x in
                (chr, pos, rsid, ref, alt, ".", ".", tag,
                 "GT:GP", code + ":" + ",".join((aa, ab, bb)))))
Example #24
File: vcf.py Project: Hensonmw/jcvi
def uniq(args):
    """
    %prog uniq vcffile

    Retain a single entry per position in the vcf file (highest R2 wins).
    """
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print(bestv)
Example #25
File: vcf.py Project: Hensonmw/jcvi
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample subset of vcf file.
    """
    from random import random

    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    fp = open(vcffile)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    fwk = open(kept, "w")
    fww = open(withheld, "w")
    nkept = nwithheld = 0
    for row in fp:
        if row[0] == '#':
            print(row.strip(), file=fwk)
            continue
        if random() < ratio:
            nkept += 1
            print(row.strip(), file=fwk)
        else:
            nwithheld += 1
            print(row.strip(), file=fww)
    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
Example #26
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Example #27
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    print("File `{0}` contains {1} chains.".format(chainfile, len(chain)),
          file=sys.stderr)
    print("ungapped={0} dt={1} dq={2}".format(
        human_size(ungapped), human_size(dt), human_size(dq)), file=sys.stderr)

    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print("Old fasta (`{0}`) mapped: {1}".format(
        oldfasta, percentage(ungapped, oldreal)), file=sys.stderr)

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print("New fasta (`{0}`) mapped: {1}".format(
        newfasta, percentage(ungapped, newreal)), file=sys.stderr)
Example #28
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
Example #29
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent the new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    id = 0
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        id += 1
        size = a.object_span
        headerline = "\t".join(str(x) for x in (
             chain, score, tName, tSize, tStrand, tStart,
             tEnd, qName, qSize, qStrand, qStart, qEnd, id
        ))
        alignmentline = size
        print(headerline, file=fw)
        print(alignmentline, file=fw)
        print(file=fw)

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
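
For a minus-strand component, the query interval is mirrored onto the reverse strand before the 0-based shift; the flip used above, as a standalone check (1-based, inclusive coordinates):

def flip_interval(start, end, size):
    # mirror a 1-based, inclusive interval onto the reverse strand
    return size - end + 1, size - start + 1

print(flip_interval(1, 10, 100))  # (91, 100)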
Example #30
File: tgbs.py Project: fw1121/jcvi
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
Example #31
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch",
                 default=31,
                 type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore",
                 default=MINSCORE,
                 type="int",
                 help="Minimum score to report")
    p.add_option("--period",
                 default=6,
                 type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr",
                 default=False,
                 action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres",
                 default=False,
                 action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres",
                 default=False,
                 action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minscore // 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    if opts.centromeres:
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                    format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Example #32
def from23andme(args):
    """
    %prog from23andme txtfile 1

    Convert a 23andMe genotype file to a VCF file.

    --ref points to the folder that contains chr1.rsids

    $ zcat 1000GP_Phase3/1000GP_Phase3_chr1.legend.gz \\
            | cut -d" " -f1 | grep ":" > chr1.rsids
    """
    p = OptionParser(from23andme.__doc__)
    p.set_ref()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    txtfile, seqid = args
    ref_dir = opts.ref
    fastafile = op.join(ref_dir, "hs37d5.fa")
    fasta = Fasta(fastafile)

    pf = txtfile.rsplit(".", 1)[0]
    px = CM[seqid]
    chrvcf = pf + ".{0}.vcf".format(px)
    legend = op.join(ref_dir, "1000GP_Phase3/{0}.rsids".format(px))
    register = read_rsid(seqid, legend)

    fw = open(chrvcf, "w")
    print(get_vcfstanza(fastafile, txtfile), file=fw)

    fp = open(txtfile)
    seen = set()
    duplicates = skipped = missing = 0
    for row in fp:
        if row[0] == "#":
            continue
        rsid, chr, pos, genotype = row.split()
        if chr != seqid:
            continue
        pos = int(pos)
        if (chr, pos) in seen:
            duplicates += 1
            continue
        seen.add((chr, pos))
        genotype = list(genotype)
        if "-" in genotype:  # missing data
            missing += 1
            continue

        # Y or MT
        if not register:
            assert len(genotype) == 1
            ref = fasta[chr][pos - 1].seq.upper()
            if "D" in genotype or "I" in genotype:
                skipped += 1
                continue
            genotype = genotype[0]
            code = "0/0" if ref == genotype else "1/1"
            alt = "." if ref == genotype else genotype
            print(
                "\t".join(
                    str(x) for x in (chr, pos, rsid, ref, alt, ".", ".", "PR",
                                     "GT", code)),
                file=fw,
            )
            continue

        # If rsid is seen in the db, use that
        if rsid in register:
            pos, ref, alt = register[rsid]
        elif pos in register:
            pos, ref, alt = register[pos]
        else:
            skipped += 1  # Not in reference panel
            continue

        assert fasta[chr][pos - 1:pos + len(ref) - 1].seq.upper() == ref
        # Keep it bi-allelic
        not_seen = [x for x in alt if x not in genotype]
        while len(alt) > 1 and not_seen:
            alt.remove(not_seen.pop())
        if len(alt) > 1:
            alt = [alt[0]]
        alleles = [ref] + alt

        if len(genotype) == 1:
            genotype = [genotype[0]] * 2

        alt = ",".join(alt) or "."
        if "D" in genotype or "I" in genotype:
            max_allele = max((len(x), x) for x in alleles)[1]
            alleles = [("I" if x == max_allele else "D") for x in alleles]
            assert "I" in alleles and "D" in alleles
        a, b = genotype
        try:
            ia, ib = alleles.index(a), alleles.index(b)
        except ValueError:  # alleles not seen
            logging.error("{0}: alleles={1}, genotype={2}".format(
                rsid, alleles, genotype))
            skipped += 1
            continue
        code = "/".join(str(x) for x in sorted((ia, ib)))

        print(
            "\t".join(
                str(x) for x in (chr, pos, rsid, ref, alt, ".", ".", "PR",
                                 "GT", code)),
            file=fw,
        )

    logging.debug("duplicates={0} skipped={1} missing={2}".format(
        duplicates, skipped, missing))
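
Once ref and alt are settled, the GT code is each observed allele's index in [ref] + alt, sorted; a toy example with made-up alleles:

alleles = ["A", "AC"]  # [ref] + alt
genotype = ["AC", "A"]
ia, ib = alleles.index(genotype[0]), alleles.index(genotype[1])
print("/".join(str(x) for x in sorted((ia, ib))))  # 0/1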
Example #33
            logging.error("Term `{0}` does not exist".format(term))
            sys.exit(1)

    if oterm != term:
        logging.debug("Resolved term `{0}` to `{1}`".format(oterm, term))
    return term


if __name__ == "__main__":
    p = OptionParser(__doc__)
    p.add_option(
        "--term",
        help="Write the parents and children of this query term",
    )

    opts, args = p.parse_args()

    if len(args) != 1:
        sys.exit(p.print_help())

    (obo_file, ) = args

    def description(record):
        level = "level-{:>02}".format(record.level)
        desc = "{} [{}]".format(record.name, record.namespace)
        if record.is_obsolete:
            desc += " obsolete"
        alt_ids = ",".join(record.alt_ids)
        return "\t".join((record.item_id, level, desc, alt_ids))

    g = GODag(obo_file, prt=None)
Example #34
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly",
                 default=False,
                 action="store_true",
                 help="Realign only")
    p.add_option(
        "--svonly",
        default=False,
        action="store_true",
        help="Run Realign => SV calls only",
    )
    p.add_option("--support",
                 default=1,
                 type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    if not bamfile.endswith(".bam"):
        bamfiles = [x.strip() for x in open(bamfile)]
    else:
        bamfiles = [bamfile]

    if store:
        computed = ls_s3(store)
        computed = [
            op.basename(x).split(".")[0] for x in computed
            if x.endswith(".depth")
        ]
        remaining_samples = [
            x for x in bamfiles if op.basename(x).split(".")[0] not in computed
        ]

        logging.debug("Already computed on `{}`: {}".format(
            store,
            len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(
            chrMfa,
            bamfile,
            opts,
            realignonly=opts.realignonly,
            svonly=opts.svonly,
            store=store,
            cleanup=cleanup,
        )
Example #35
def ystr(args):
    """
    %prog ystr chrY.vcf

    Print out Y-STR info given VCF. Marker name extracted from tabfile.
    """
    from jcvi.utils.table import write_csv

    p = OptionParser(ystr.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    si = STRFile(opts.lobstr_home, db="hg38-named")
    register = si.register

    header = "Marker|Reads|Ref|Genotype|Motif".split("|")
    contents = []
    fp = must_open(vcffile)
    reader = vcf.Reader(fp)
    simple_register = {}
    for record in reader:
        name = register[(record.CHROM, record.POS)]
        info = record.INFO
        ref = int(float(info["REF"]))
        rpa = info.get("RPA", ref)
        if isinstance(rpa, list):
            rpa = "|".join(str(int(float(x))) for x in rpa)
        ru = info["RU"]
        simple_register[name] = rpa
        for sample in record.samples:
            contents.append((name, sample["ALLREADS"], ref, rpa, ru))

    # Multi-part markers
    a, b, c = "DYS389I", "DYS389B.1", "DYS389B"
    if a in simple_register and b in simple_register:
        simple_register[c] = int(simple_register[a]) + int(simple_register[b])

    # Multi-copy markers
    mm = ["DYS385", "DYS413", "YCAII"]
    for m in mm:
        ma, mb = m + 'a', m + 'b'
        if ma not in simple_register or mb not in simple_register:
            simple_register[ma] = simple_register[mb] = None
            del simple_register[ma]
            del simple_register[mb]
            continue
        if simple_register[ma] > simple_register[mb]:
            simple_register[ma], simple_register[mb] = \
                    simple_register[mb], simple_register[ma]

    write_csv(header, contents, sep=" ")
    print("[YSEARCH]")
    build_ysearch_link(simple_register)
    print("[YFILER]")
    build_yhrd_link(simple_register, panel=YHRD_YFILER)
    print("[YFILERPLUS]")
    build_yhrd_link(simple_register, panel=YHRD_YFILERPLUS)
    print("[YSTR-ALL]")
    build_yhrd_link(simple_register, panel=USYSTR_ALL)
Example #36
def meta(args):
    """
    %prog meta data.bin samples STR.ids STR-exons.wo.bed

    Compute allele frequencies and prune sites based on missingness.

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 694105 /mnt/software/lobSTR/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(meta.__doc__)
    p.add_option("--cutoff",
                 default=.5,
                 type="float",
                 help="Percent observed required (chrY half cutoff)")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 4:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, wobed = args
    cutoff = opts.cutoff

    af_file = "allele_freq"
    if need_update(binfile, af_file):
        df, m, samples, loci = read_binfile(binfile, sampleids, strids)
        nalleles = len(samples)
        fw = must_open(af_file, "w")
        for i, locus in enumerate(loci):
            a = m[:, i]
            counts = alleles_to_counts(a)
            af = counts_to_af(counts)
            seqid = locus.split("_")[0]
            remove = counts_filter(counts, nalleles, seqid, cutoff=cutoff)
            print("\t".join((locus, af, remove)), file=fw)
        fw.close()

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        #enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst)

    tredsfile = op.join(datadir, "TREDs.meta.hg38.csv")
    TREDS = read_treds(tredsfile)

    metafile = "STRs_{}_SEARCH.meta.tsv".format(timestamp())
    write_meta(af_file, gene_map, TREDS, filename=metafile)
    logging.debug("File `{}` written.".format(metafile))
Example #37
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid",
                 default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if lbindices[0] == "TOY":  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile,
                                          "CHR4",
                                          "/mnt/software/lobSTR/",
                                          "TOY",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile,
                                             chr,
                                             lhome,
                                             lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Example #38
def filterloci(args):
    """
    %prog filterloci allele_freq STR-exons.wo.bed SAMPLES

    Filter subset of loci that satisfy:
    1. no redundancy (unique chr:pos)
    2. variable (n_alleles > 1)
    3. low level of missing data (>= 50% autosomal + X, > 25% for Y)

    Write meta file with the following info:
    1. id
    2. title
    3. gene_name
    4. variant_type
    5. motif
    6. allele_frequency

    `STR-exons.wo.bed` can be generated like this:
    $ tail -n 854476 /mnt/software/lobSTR-4.0.0/hg38/index.tab | cut -f1-3 > all-STR.bed
    $ intersectBed -a all-STR.bed -b all-exons.bed -wo > STR-exons.wo.bed
    """
    p = OptionParser(filterloci.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    af, wobed, samples = args
    nsamples = len([x.strip() for x in open(samples)])
    nalleles = nsamples * 2
    logging.debug("Load {} samples ({} alleles) from `{}`".\
                    format(nsamples, nalleles, samples))

    logging.debug("Load gene intersections from `{}`".format(wobed))
    fp = open(wobed)
    gene_map = defaultdict(set)
    for row in fp:
        chr1, start1, end1, chr2, start2, end2, name, ov = row.split()
        gene_map[(chr1, start1)] |= set(name.split(","))
    for k, v in gene_map.items():
        non_enst = sorted(x for x in v if not x.startswith("ENST"))
        enst = sorted(x.rsplit(".", 1)[0] for x in v if x.startswith("ENST"))
        gene_map[k] = ",".join(non_enst + enst)

    logging.debug("Filtering loci from `{}`".format(af))
    fp = open(af)
    treds = """chr19_45770205_CAG
    chr6_170561926_CAG
    chrX_67545318_CAG
    chr9_69037287_GAA
    chrX_147912051_CGG
    chr4_3074877_CAG
    chrX_148500639_CCG
    chr12_6936729_CAG
    chr13_70139384_CTG
    chr6_16327636_CTG
    chr14_92071011_CTG
    chr12_111598951_CTG
    chr3_63912686_CAG
    chr19_13207859_CTG""".split()
    seen = set(treds)
    remove = []
    fw = open("meta.tsv", "w")
    header = "id title gene_name variant_type motif allele_frequency".\
                replace(" ",  "\t")
    print(header, file=fw)
    variant_type = "short tandem repeats"
    title = "Short tandem repeats ({})n"
    for row in fp:
        sname, counts = row.split()
        name = sname.rsplit("_", 1)[0]
        seqid, pos, motif = name.split("_")
        countst = [x for x in counts.strip("{}").split(",") if x]
        countsd = {}
        for x in countst:
            a, b = x.split(":")
            countsd[int(a)] = int(b)

        if counts_filter(countsd, nalleles, seqid):
            remove.append(sname)
            continue

        if name in seen:
            remove.append(sname)
            continue
        seen.add(name)

        gene_name = gene_map.get((seqid, pos), "")
        print("\t".join((name, title.format(motif), gene_name,
                         variant_type, motif, counts)), file=fw)
    fw.close()

    removeidsfile = "remove.ids"
    fw = open(removeidsfile, "w")
    print("\n".join(remove), file=fw)
    fw.close()
    logging.debug("A total of {} filtered loci written to `{}`".\
                    format(len(remove), removeidsfile))
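
The counts column in allele_freq is a brace-wrapped allele:count list; the parsing loop above, condensed into a helper (a sketch, not a project function):

def parse_counts(counts):
    return {int(a): int(b)
            for a, b in (x.split(":")
                         for x in counts.strip("{}").split(",") if x)}

print(parse_counts("{12:34,13:2}"))  # {12: 34, 13: 2}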
Example #39
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plot A50 graphics; see blog post (http://blog.malde.org/index.php/a50/).
    """
    p = OptionParser(A50.__doc__)
    p.add_option(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite .rplot file if exists",
    )
    p.add_option(
        "--cutoff",
        default=0,
        type="int",
        dest="cutoff",
        help="use contigs above certain size",
    )
    p.add_option(
        "--stepsize",
        default=10,
        type="int",
        dest="stepsize",
        help="stepsize for the distribution",
    )
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append(
                (fastafile, l50, n50, cmin, cmax, cmean, csum, counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(
                op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.0), tag)), file=fw)
        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
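
The A50 curve plotted here is the running sum of contig sizes, largest first; a numpy sketch (calculate_A50 in the project also derives L50 and N50, which this toy skips):

import numpy as np

ctgsizes = np.array([50, 300, 100, 150])
a50 = np.cumsum(np.sort(ctgsizes)[::-1])
print(a50)  # [300 450 550 600]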
Example #40
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        sh("rm -f *")
Example #41
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from more_itertools import grouper

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf",
                 help="BAMBUS configuration file [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    duos = list(grouper(args[1:], 2))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"

    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".format(
        prefix)
    sh(cmd)

    final = "final"
    cmd = ("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
           "-merge -detail -oo -sum -o {1}".format(prefix, final))
    sh(cmd)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
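The `grouper` call above chunks the flat argument list into (readsfasta, bedfile) duos. A quick illustration of the pattern, assuming a recent more_itertools:

from more_itertools import grouper

args = ["reads1.fasta", "mapping1.bed", "reads2.fasta", "mapping2.bed"]
assert list(grouper(args, 2)) == [
    ("reads1.fasta", "mapping1.bed"),
    ("reads2.fasta", "mapping2.bed"),
]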
Example #42
0
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter a subset of the data after dropping the loci listed in remove.ids.
    """
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    remove = [x.strip() for x in open(remove)]
    removes = set(remove)
    final = [x.strip() for x in open(final)]
    assert len(loci) == len(remove) + len(final)

    fp = open(af)
    percentiles = {}
    for row in fp:
        sname, counts = row.split()
        countst = [x for x in counts.strip("{}").split(",") if x]
        countsd = {}
        for x in countst:
            a, b = x.split(":")
            countsd[int(a)] = int(b)
        percentile = counts_to_percentile(countsd)
        percentiles[sname] = percentile

    p = Pool(processes=opts.cpus)
    run_args = []
    for i, sname in enumerate(loci):
        if sname in removes:
            continue
        a = m[:, i]
        percentile = percentiles[sname]
        run_args.append((i, a, percentile))

    res = []
    for r in p.map_async(convert_to_percentile, run_args).get():
        res.append(r)
    res.sort()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    m = df.values  # df.as_matrix() was removed in newer pandas
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Example #43
0
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset pre-calculated Ks/Ka values (in ksfile) according to the pairs
    in the tab-delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option(
        "--noheader", action="store_true", help="don't write ksfile header line"
    )
    p.add_option(
        "--block", action="store_true", help="preserve block structure in input"
    )
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(
            dict(
                (line.name, line)
                for line in KsFile(ksfile, strip_names=opts.strip_names)
            )
        )

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print(fields, file=fw)

    i = j = 0
    for row in fp:
        if row[0] == "#":
            if block:
                print(row.strip(), file=fw)
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print("\t".join((a, b, ".", ".")), file=fw)
                continue
        ksline = ksvals[name]
        if block:
            print("\t".join(str(x) for x in (a, b, ksline.ks)), file=fw)
        else:
            ksline.name = ";".join((a, b))
            print(ksline, file=fw)
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
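The lookup above tries both orientations of a pair, since a Ks record may be keyed as either "a;b" or "b;a". A tiny illustration of the fallback:

ksvals = {"geneA;geneB": "ks_record"}
a, b = "geneB", "geneA"          # order as given in the pairsfile
name = ";".join((a, b))
if name not in ksvals:
    name = ";".join((b, a))      # try the reversed key
assert ksvals[name] == "ks_record"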
Example #44
0
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (prefix, ) = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((0.1, 0.15), 0.8, 0.03, fc="k"))

    # basecoverage and matecoverage
    ax = fig.add_axes([0.1, 0.45, 0.8, 0.45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    (baseline, ) = ax.plot(x, y, "g-")
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    (mateline, ) = ax.plot(x, y, "r-")
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = 0.8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this converts from base position => x-coordinate
    pos = lambda x: (0.1 + x * bpratio)
    ypos = 0.15 + 0.03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = 0.25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, 0.15), xend - xstart, 0.03, fc="w"))

    root.text(0.5, 0.1, scf, color="b", ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(0.5, 0.05, warn_msg, color="gray", ha="center")
    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
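A worked example of the coordinate math above, with hypothetical numbers: the scaffold spans x = 0.1 to 0.9 on the canvas, and a 10 kb (or larger) insert arcs up a quarter of the canvas height.

size = 2000000                    # hypothetical scaffold length in bp
bpratio = 0.8 / size
pos = lambda x: 0.1 + x * bpratio
start, end = 500000, 508000       # an 8 kb insert
height = 0.25 * min(end - start, 10000) / 10000
assert abs(pos(start) - 0.3) < 1e-9   # base 500000 -> x = 0.3
assert abs(height - 0.2) < 1e-9       # 8 kb -> 0.2 canvas height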
Example #45
0
File: pasa.py Project: BrokeW/jcvi
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity de novo and genome-guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta, = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctcov, bpsplice = opts.compreh_pctcov, opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(
            dnfasta, accn_extract, tdn)
        if prepare:
            write_file(runfile, accn_extract_cmd, append=True)
        else:
            sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
          pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf,
                                                transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(
            compreh_pctid, compreh_pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
Example #46
0
def last(args, dbtype=None):
    """
    %prog last database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option(
        "--mask", default=False, action="store_true", help="Invoke -c in lastdb"
    )
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    if not dbtype:
        dbtype = opts.dbtype
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    mm = minid // (100 - minid)  # integer ratio used for lastal scores

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)
    return lastfile
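A worked example of the identity-to-score conversion above:

minid = 95
mm = minid // (100 - minid)
assert mm == 19                  # lastal gets -r1 -q19 -a19 -b19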
Example #47
0
File: pasa.py Project: BrokeW/jcvi
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also removes
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus are found in
    `output.subclusters.out`, in lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix",
                 default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    fp = open(subclusters)
    nrecs = 0
    for row in fp:
        if not row.startswith("sub-cluster:"):
            continue
        asmbls = row.split()[1:]
        longest_asmbl = max(asmbls, key=lambda x: sizes[x])
        longest_size = sizes[longest_asmbl]
        print(name_convert(longest_asmbl), file=fw)
        nrecs += 1
        cutoff = max(longest_size // 2, 200)
        keep.update(set(x for x in asmbls if sizes[x] >= cutoff))

    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(
        nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1

    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(
        nrecs, newfastafile))
Example #48
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Converts the output to Fasta format.
        4. Uses this alignment to align the gene sequences with PAL2NAL.
        5. Runs PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option(
        "--longest",
        action="store_true",
        help="Get longest ORF, only works if no pep file, e.g. ESTs",
    )
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins",
    )
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(
        prot_iterator, prot_iterator, dna_iterator, dna_iterator
    ):

        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                mrtrans_fasta, work_dir
            )
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write(
                    "%s\n"
                    % (
                        ",".join(
                            str(x)
                            for x in (
                                pair_name,
                                ds_subs_yn,
                                dn_subs_yn,
                                ds_subs_ng,
                                dn_subs_ng,
                            )
                        )
                    )
                )
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example #49
0
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    p = OptionParser(gss.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    fw = open("MetaData.txt", "w")
    print(PublicationTemplate.format(**vars), file=fw)
    print(LibraryTemplate.format(**vars), file=fw)
    print(ContactTemplate.format(**vars), file=fw)
    logging.debug("Meta data written to `{0}`".format(fw.name))

    fw = open("GSS.txt", "w")
    fw_log = open("GSS.log", "w")
    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            gssID = "{0}{1}".format(gssID, seen[gssID])

        seen[gssID] += 1
        clone[cloneID].add(gssID)

    seen = defaultdict(int)
    for rec in SeqIO.parse(fastafile, "fasta"):
        # need to populate gssID, mateID, cloneID, seq, plate, row, column
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        w384 = sequencer_plate_well_coordinates

        plate = sequencer_plate_barcode
        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            logging.error("duplicate key {0} found".format(gssID))
            gssID = "{0}{1}".format(gssID, seen[gssID])

        othergss = clone[cloneID] - set([gssID])
        othergss = ", ".join(sorted(othergss))
        vars.update(locals())

        print(GSSTemplate.format(**vars), file=fw)

        # Write conversion logs to log file
        print("{0}\t{1}".format(gssID, description), file=fw_log)
        print("=" * 60, file=fw_log)

    logging.debug("A total of {0} seqs written to `{1}`".\
            format(len(seen), fw.name))
    fw.close()
    fw_log.close()
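`convert_96_to_384` checks the well bookkeeping in the first pass. A hypothetical version, assuming the common interleaved-quadrant layout (quadrant q of a 384-well plate takes every other row and column, with offsets (0,0), (0,1), (1,0), (1,1) for q = 1..4); it reproduces the defline example in the docstring above, where B02 in quadrant 1 maps to C3:

def convert_96_to_384(w96, quadrant):
    row_offset, col_offset = [(0, 0), (0, 1), (1, 0), (1, 1)][quadrant - 1]
    row96 = ord(w96[0]) - ord("A")   # "B02" -> row 1
    col96 = int(w96[1:]) - 1         # "B02" -> col 1
    row384 = chr(ord("A") + 2 * row96 + row_offset)
    col384 = 2 * col96 + col_offset + 1
    return "{0}{1}".format(row384, col384)

assert convert_96_to_384("B02", 1) == "C3"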
Example #50
0
File: pasa.py Project: BrokeW/jcvi
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    gene_coords = {}
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene,
                                          featuretype='mRNA',
                                          order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print("##gff-version 3", file=fw)
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print("\t".join(str(x) for x in summary), file=sys.stderr)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))

                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                      gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='five_prime_UTR', slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='three_prime_UTR', slop=slop)

                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        dbn = list(mrna[gene])[0]
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print(_gene, file=fw)

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]
            if slop:
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbid = "".join(str(x) for x in set(dbs))
            _mrnaid = []
            for x in mrnas:  # dedup mRNA IDs while preserving order
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print("\t".join(str(x) for x in summary), file=sys.stderr)

    fw.close()
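`Grouper` from jcvi.utils.grouper is a disjoint-set that clusters mRNAs judged identical. A minimal stand-in capturing the semantics used above (join() with one or two items, iteration over groups); the real class is more efficient:

class Grouper:
    def __init__(self):
        self.groups = []

    def join(self, a, b=None):
        items = {a} if b is None else {a, b}
        merged, rest = [], []
        for g in self.groups:
            (merged if g & items else rest).append(g)
        for g in merged:            # union all overlapping groups
            items |= g
        self.groups = rest + [items]

    def __iter__(self):
        return iter(self.groups)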
Example #51
0
def htgnew(args):
    """
    %prog htgnew fastafile phasefile template.sbt

    Prepare sqnfiles for submitting new Genbank HTG records.

    `fastafile` contains the sequences.
    `phasefile` contains the phase information, it is a two column file:

    mth2-45h12    3

    `template.sbt` is the Genbank submission template.

    This function is simpler than htg, since the record names have not been
    assigned yet (so there is less bookkeeping).
    """
    from jcvi.formats.fasta import sequin

    p = OptionParser(htgnew.__doc__)
    p.add_option("--comment",
                 default="",
                 help="Comments for this submission [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    fastafile, phasefile, sbtfile = args
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"'
    acmd += ' -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    nupdated = 0
    for row in open(phasefile):
        name, phase = row.split()[:2]
        fafile = op.join(fastadir, name + ".fa")
        cloneopt = "--clone={0}".format(name)
        splitfile, gaps = sequin([fafile, cloneopt])
        splitfile = op.basename(splitfile)
        accession = accession_nv = name

        phase = int(phase)
        assert phase in (1, 2, 3)

        cmd = acmd.format(accession_nv=accession_nv,
                          sqndir=sqndir,
                          sbtfile=sbtfile,
                          splitfile=splitfile,
                          phase=phase,
                          comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
Example #52
0
File: pasa.py Project: BrokeW/jcvi
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands without
    executing the pipeline
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts, = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db),
          opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
          opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
          opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Example #53
0
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names of adjacent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    p.set_rclip()
    p.set_tag()
    p.add_option("--base",
                 help="Base name for the output files [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = opts.base or op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")

    N = opts.rclip
    tag = opts.tag
    strip_name = (lambda x: x[:-N]) if N else None

    fh_iter = iter_fastq(fastqfile, key=strip_name)
    skipflag = False  # controls the iterator skip
    for a, b in pairwise(fh_iter):
        if b is None:  # hit the eof
            break

        if skipflag:
            skipflag = False
            continue

        if a.name == b.name:
            if tag:
                a.name += "/1"
                b.name += "/2"
            print(a, file=pairsfw)
            print(b, file=pairsfw)
            skipflag = True
        else:
            print(a, file=fragsfw)

    # don't forget the last one, when b is None
    if not skipflag:
        print(a, file=fragsfw)

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
    return pairs
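The `pairwise` helper yields overlapping record pairs, padded with None so the final record still shows up once as `a`. A plausible implementation matching that usage (the actual jcvi.utils.iter code may differ):

from itertools import tee, zip_longest

def pairwise(iterable):
    a, b = tee(iterable)
    next(b, None)                 # offset the second copy by one
    return zip_longest(a, b)      # last pair is (final_record, None)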
Example #54
0
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and downloads records from Genbank. Below are the implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Record with only a single contig will be
    labeled as Phase-3 regardless of the info in the `phasefile`. Template file
    is the Genbank sbt template. See jcvi.formats.sbt for generation of such
    files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads the genbank and asn.1 formats,
    and generates the phase file and the names file (using formats.agp.phase() and
    apps.gbsubmit.asn(), respectively). These steps are run automatically.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.entrez import fetch

    p = OptionParser(htg.__doc__)
    p.add_option("--phases",
                 default=None,
                 help="Use another phasefile to override [default: %default]")
    p.add_option("--comment",
                 default="",
                 help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    asndir = "asn.1"
    mkdir(asndir)
    fetch([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + \
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        fetch([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + \
                ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print("{0}\t{1}\t{2}".format(accession_nv, oldphase, phase),
              file=newphasefw)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession,
                          accession_nv=accession_nv,
                          sqndir=sqndir,
                          sbtfile=sbtfile,
                          splitfile=splitfile,
                          qualifiers=qualifiers,
                          comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print("A total of {0} records updated.".format(nupdated), file=sys.stderr)
Example #55
0
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = p.parse_args(args)

    cookies = get_cookies()
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version)
    d = download(dlist, filename=directory_listing, cookies=cookies)
    g = GlobusXMLParser(directory_listing)
    genomes = g.get_genomes()
    valid_species = list(genomes.keys())
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(genomes,
                                         s,
                                         valid_species,
                                         base_url,
                                         cookies,
                                         assembly=opts.assembly)
        if not res:
            logging.error("No files downloaded")
            continue  # avoid calling .get() on an empty result
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
Example #56
0
def names(args):
    """
    %prog names namelist templatefile

    Generate name blocks from the `namelist` file. The `namelist` file is
    tab-delimited and contains >=4 columns of data. Three columns are mandatory:
    first name, middle initial, and last name. The first row is the table header. For the
    extra columns, the first column will go in the `$N0` field in the template
    file, second to the `$N1` field, etc.

    In the alternative mode, the namelist just contains several sections. First
    row will go in the `$N0` in the template file, second to the `$N1` field.

    The namelist may look like:
    [Sequence]
    Bruce A. Roe,  Frederic Debelle, Giles Oldroyd, Rene Geurts
    [Manuscript]
    Haibao Tang1, Vivek Krishnakumar1, Shelby Bidwell1, Benjamin Rosen1

    Then in this example Sequence section goes into N0, Manuscript goes into N1.

    Useful hints for constructing the template file can be found in:
    <http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/seq.asn.html>

    Often the template file can be retrieved from web form:
    <http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>
    """
    p = OptionParser(names.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    namelist, templatefile = args

    # First check the alternative format
    if open(namelist).read()[0] == '[':
        out = parse_names(namelist)
        make_template(templatefile, out)
        return

    reader = csv.reader(open(namelist), delimiter="\t")
    header = next(reader)
    ncols = len(header)
    assert ncols > 3
    nextras = ncols - 3

    blocks = []
    bools = []
    for row in reader:
        first, middle, last = row[:3]
        extras = row[3:]
        bools.append([(x.upper() == 'Y') for x in extras])
        middle = middle.strip()
        if middle != "":
            middle = middle.rstrip('.') + '.'
        initials = "{0}.{1}".format(first[0], middle)
        suffix = ""
        nameblock = NameTemplate.format(last=last,
                                        first=first,
                                        initials=initials,
                                        suffix=suffix)
        blocks.append(nameblock)

    selected_idx = zip(*bools)
    out = []  # one entry will be appended per extra column
    for i, sbools in enumerate(selected_idx):
        selected = []
        for b, ss in zip(blocks, sbools):
            if ss:
                selected.append(b)
        bigblock = ",\n".join(selected)
        out.append(bigblock)
        logging.debug("List N{0} contains a total of {1} names.".format(
            i, len(selected)))

    make_template(templatefile, out)
Example #57
0
def astat(args):
    """
    %prog astat coverage.log

    Create coverage-rho scatter plot.
    """
    p = OptionParser(astat.__doc__)
    p.add_option("--cutoff", default=1000, type="int",
                 help="Length cutoff [default: %default]")
    p.add_option("--genome", default="",
                 help="Genome name [default: %default]")
    p.add_option("--arrDist", default=False, action="store_true",
                 help="Use arrDist instead [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    covfile, = args
    cutoff = opts.cutoff
    genome = opts.genome
    plot_arrDist = opts.arrDist

    suffix = ".{0}".format(cutoff)
    small_covfile = covfile + suffix
    update_covfile = need_update(covfile, small_covfile)
    if update_covfile:
        fw = open(small_covfile, "w")
    else:
        logging.debug("Found `{0}`, will use this one".format(small_covfile))
        covfile = small_covfile

    fp = open(covfile)
    header = next(fp)
    if update_covfile:
        fw.write(header)

    data = []
    msg = "{0} tigs scanned ..."
    for row in fp:
        tigID, rho, covStat, arrDist = row.split()
        tigID = int(tigID)
        if tigID % 1000000 == 0:
            sys.stderr.write(msg.format(tigID) + "\r")

        rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)]
        if rho < cutoff:
            continue

        if update_covfile:
            fw.write(row)
        data.append((tigID, rho, covStat, arrDist))

    print(msg.format(tigID), file=sys.stderr)

    from jcvi.graphics.base import plt, savefig

    logging.debug("Plotting {0} data points.".format(len(data)))
    tigID, rho, covStat, arrDist = zip(*data)

    y = arrDist if plot_arrDist else covStat
    ytag = "arrDist" if plot_arrDist else "covStat"

    fig = plt.figure(1, (7, 7))
    ax = fig.add_axes([.12, .1, .8, .8])
    ax.plot(rho, y, ".", color="lightslategrey")

    xtag = "rho"
    info = (genome, xtag, ytag)
    title = "{0} {1} vs. {2}".format(*info)
    ax.set_title(title)
    ax.set_xlabel(xtag)
    ax.set_ylabel(ytag)

    if plot_arrDist:
        ax.set_yscale('log')

    imagename = "{0}.png".format(".".join(info))
    savefig(imagename, dpi=150)
Example #58
0
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search, or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein",
                       "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format [default: %default]",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database [default: %default]",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return [default: %default]",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix",
                 default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (database in allowed_databases[fmt]
            ), "For output format '{0}', allowed databases are: {1}".format(
                fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # If noprompt, will not check file existence
    if not outdir:
        fw = must_open(outfile,
                       "w",
                       checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
    ):
        if outdir:
            outfile = op.join(outdir, "{0}.{1}".format(term, fmt))  # filesystem path, not URL
            fw = must_open(outfile,
                           "w",
                           checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        print(
            "A total of {0} {1} records downloaded.".format(
                totalsize, fmt.upper()),
            file=sys.stderr,
        )

    return outfile
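`batch_entrez` is a generator defined elsewhere in the module. A minimal sketch of what it could look like, built on Biopython's Entrez module; the yielded tuple (id, size, term, handle) mirrors the usage above, but everything else here is an assumption, not jcvi's actual code:

from Bio import Entrez

def batch_entrez_sketch(terms, db="nuccore", rettype="fasta",
                        retmax=1000000, batchsize=500, email=None):
    if email:
        Entrez.email = email
    for term in terms:
        handle = Entrez.esearch(db=db, term=term, retmax=retmax)
        ids = Entrez.read(handle)["IdList"]
        for i in range(0, len(ids), batchsize):
            batch = ids[i:i + batchsize]
            fetch = Entrez.efetch(db=db, id=",".join(batch),
                                  rettype=rettype, retmode="text")
            yield batch[0], len(batch), term, fetch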
Example #59
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in FASTA format to a CA frg file. If a .qual file is found,
    use it; otherwise make a fake qual file. Mates are assumed to be adjacent
    sequence records (i.e. /1, /2, /1, /2 ...) unless a matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=0, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option("--readname", default=False, action="store_true",
                 help="Keep read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                                format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate[:2].upper()

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        if mated:
            cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = not opts.readname
    f = Fasta(fastafile, lazy=True)
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".\
                    format(i, frgfile, j))
Example #60
0
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate",
                 default=.02,
                 type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option(
        "--distance",
        default=500,
        type="int",
        help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize",
                 type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen",
                 default=100,
                 type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors",
                 default=False,
                 action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    readnum = size * depth // (2 * readlen)

    distance = opts.distance
    stdev = distance // 5  # integer division, matching the Python 2 behavior

    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
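A worked example of the read-pair arithmetic above: a 5 Mb genome at 10x depth with 100 bp reads needs 5000000 * 10 / (2 * 100) = 250,000 pairs, and a 500 bp outer distance leaves 500 - 2*100 = 300 bp inner distance.

size, depth, readlen, distance = 5000000, 10, 100, 500
readnum = size * depth // (2 * readlen)
assert readnum == 250000
assert distance - 2 * readlen == 300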