Example #1
def subset(args):
    """
    %prog subset blastfile qbedfile sbedfile

    Extract blast hits between given query and subject chrs.

    If --qchrs or --schrs is not given, then all chrs from the q/s genome
    will be included. However, at least one of --qchrs and --schrs must be
    specified; otherwise the script will do nothing.
    """
    p = OptionParser(subset.__doc__)
    p.add_option("--qchrs", default=None,
                help="query chrs to extract, comma sep [default: %default]")
    p.add_option("--schrs", default=None,
                help="subject chrs to extract, comma sep [default: %default]")
    p.add_option("--convert", default=False, action="store_true",
            help="convert accns to chr_rank [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    blastfile, qbedfile, sbedfile = args
    qchrs = opts.qchrs
    schrs = opts.schrs
    assert qchrs or schrs, p.print_help()
    convert = opts.convert

    outfile = blastfile + "."
    if qchrs:
        outfile += qchrs + "."
        qchrs = set(qchrs.split(","))
    else:
        qchrs = set(Bed(qbedfile).seqids)
    if schrs:
        schrs = set(schrs.split(","))
        if qbedfile != sbedfile or qchrs != schrs:
            outfile += ",".join(schrs) + "."
    else:
        schrs = set(Bed(sbedfile).seqids)
    outfile += "blast"

    qo = Bed(qbedfile).order
    so = Bed(sbedfile).order

    fw = must_open(outfile, "w")
    for b in Blast(blastfile):
        q, s = b.query, b.subject
        if qo[q][1].seqid in qchrs and so[s][1].seqid in schrs:
            if convert:
                b.query = qo[q][1].seqid + "_" + "{0:05d}".format(qo[q][0])
                b.subject = so[s][1].seqid + "_" + "{0:05d}".format(so[s][0])
            print(b, file=fw)
    fw.close()
    logging.debug("Subset blastfile written to `{0}`".format(outfile))
Example #2
def links(args):
    """
    %prog links url

    Extract all the links "<a href=''>" from a web page.
    """
    p = OptionParser(links.__doc__)
    p.add_option("--img", default=False, action="store_true",
                 help="Extract <img> tags [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    url, = args
    img = opts.img

    htmlfile = download(url)
    page = open(htmlfile).read()
    soup = BeautifulSoup(page, "html.parser")

    tag = 'img' if img else 'a'
    src = 'src' if img else 'href'
    aa = soup.findAll(tag)
    for a in aa:
        link = a.get(src)
        link = urljoin(url, link)
        print(link)
Example #3
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        js = json.load(open(filename))
        js["skin_rgb"] = make_rgb(
            js["traits"]["skin-color"]["L"],
            js["traits"]["skin-color"]["A"],
            js["traits"]["skin-color"]["B"])
        js["eye_rgb"] = make_rgb(
            js["traits"]["eye-color"]["L"],
            js["traits"]["eye-color"]["A"],
            js["traits"]["eye-color"]["B"])
        samples.append(js)

    template = Template(traits_template)
    fw = open("report.html", "w")
    print(template.render(samples=samples), file=fw)
    logging.debug("Report written to `{}`".format(fw.name))
    fw.close()
Example #4
def diff(args):
    """
    %prog diff simplefile

    Calculate difference of pairwise syntenic regions.
    """
    from jcvi.utils.cbook import SummaryStats

    p = OptionParser(diff.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    simplefile, = args
    fp = open(simplefile)
    data = [x.split() for x in fp]
    spans = []
    for block_id, ab in groupby(data[1:], key=lambda x: x[0]):
        a, b = list(ab)
        aspan, bspan = a[4], b[4]
        aspan, bspan = int(aspan), int(bspan)
        spans.append((aspan, bspan))
    aspans, bspans = zip(*spans)
    dspans = [b - a for a, b in spans]
    s = SummaryStats(dspans)
    print("For a total of {0} blocks:".format(len(dspans)), file=sys.stderr)
    print("Sum of A: {0}".format(sum(aspans)), file=sys.stderr)
    print("Sum of B: {0}".format(sum(bspans)), file=sys.stderr)
    print("Sum of Delta: {0} ({1})".format(sum(dspans), s), file=sys.stderr)
Example #5
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        js = json.load(open(filename))
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Example #6
def flip(args):
    """
    %prog flip fastafile

    Go through each FASTA record, check against the Genbank file, and
    determine whether or not to flip the sequence. This is useful before
    updating the sequences, to make sure the same orientation is used.
    """
    p = OptionParser(flip.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    outfastafile = fastafile.rsplit(".", 1)[0] + ".flipped.fasta"
    fo = open(outfastafile, "w")
    f = Fasta(fastafile, lazy=True)
    for name, rec in f.iteritems_ordered():
        tmpfasta = "a.fasta"
        fw = open(tmpfasta, "w")
        SeqIO.write([rec], fw, "fasta")
        fw.close()

        o = overlap([tmpfasta, name])
        if o.orientation == '-':
            rec.seq = rec.seq.reverse_complement()

        SeqIO.write([rec], fo, "fasta")
        os.remove(tmpfasta)
Example #7
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    fp = open(pairsfile)
    cmds = []
    mkdir("overlaps")
    for row in fp:
        a, b = row.split()[:2]
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)

    print "\n".join(cmds)
Example #8
def summary(args):
    """
    %prog summary *.gff

    Print gene statistics table.
    """
    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    for metric in metrics:
        logging.debug("Parsing files in `{0}`..".format(metric))

        table = {}
        for x in gff_files:
            pf = op.basename(x).split(".")[0]
            numberfile = op.join(metric, pf + ".txt")
            ar = [int(x.strip()) for x in open(numberfile)]
            stats = SummaryStats(ar).todict().items()
            keys, vals = zip(*stats)
            keys = [(pf, x) for x in keys]
            table.update(dict(zip(keys, vals)))

        print(tabulate(table), file=sys.stderr)
Example #9
def histogram(args):
    """
    %prog histogram *.gff

    Plot gene statistics based on output of stats. For each gff file, look to
    see if the metrics folder (e.g. Exon_Length) contains the data and plot
    them.
    """
    from jcvi.graphics.histogram import histogram_multiple

    p = OptionParser(histogram.__doc__)
    p.add_option("--bins", dest="bins", default=40, type="int",
            help="number of bins to plot in the histogram [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    gff_files = args
    # metrics = ("Exon_Length", "Intron_Length", "Gene_Length", "Exon_Count")
    colors = ("red", "green", "blue", "black")
    vmaxes = (1000, 1000, 4000, 20)
    xlabels = ("bp", "bp", "bp", "number")
    for metric, color, vmax, xlabel in zip(metrics, colors, vmaxes, xlabels):
        logging.debug("Parsing files in `{0}`..".format(metric))
        numberfiles = [op.join(metric, op.basename(x).split(".")[0] + ".txt") \
                        for x in gff_files]

        histogram_multiple(numberfiles, 0, vmax, xlabel, metric,
                       bins=opts.bins, facet=True, fill=color,
                       prefix=metric + ".")
Example #10
def unitigs(args):
    """
    %prog unitigs best.edges

    Reads Celera Assembler's "best.edges" and extracts all unitigs.
    """
    p = OptionParser(unitigs.__doc__)
    p.add_option("--maxerr", default=2, type="int", help="Maximum error rate")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bestedges, = args
    G = read_graph(bestedges, maxerr=opts.maxerr, directed=True)
    H = nx.Graph()
    intconv = lambda x: int(x.split("-")[0])
    for k, v in G.items():
        if k == G.get(v, None):
            H.add_edge(intconv(k), intconv(v))

    nunitigs = nreads = 0
    for h in nx.connected_component_subgraphs(H, copy=False):
        st = [x for x in h if h.degree(x) == 1]
        if len(st) != 2:
            continue
        src, target = st
        path = list(nx.all_simple_paths(h, src, target))
        assert len(path) == 1
        path, = path
        print "|".join(str(x) for x in path)
        nunitigs += 1
        nreads += len(path)
    logging.debug("A total of {0} unitigs built from {1} reads.".format(nunitigs, nreads))
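The `k == G.get(v, None)` test above keeps only mutual best edges. A toy
check of that test, with made-up read ids in the same `id-end` format that
intconv expects:

G = {"1-3p": "2-5p", "2-5p": "1-3p", "3-3p": "2-5p"}
# an edge survives only if each read names the other as its best overlap
mutual = sorted((k, v) for k, v in G.items() if G.get(v) == k)
print(mutual)  # [('1-3p', '2-5p'), ('2-5p', '1-3p')]; read 3 is not reciprocated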
Example #11
def tracedb(args):
    """
    %prog tracedb <xml|lib|frg>

    Run `tracedb-to-frg.pl` within current folder.
    """
    p = OptionParser(tracedb.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    action, = args
    assert action in ("xml", "lib", "frg")

    CMD = "tracedb-to-frg.pl"
    xmls = glob("xml*")

    if action == "xml":
        for xml in xmls:
            cmd = CMD + " -xml {0}".format(xml)
            sh(cmd, outfile="/dev/null", errfile="/dev/null", background=True)

    elif action == "lib":
        cmd = CMD + " -lib {0}".format(" ".join(xmls))
        sh(cmd)

    elif action == "frg":
        for xml in xmls:
            cmd = CMD + " -frg {0}".format(xml)
            sh(cmd, background=True)
Example #12
def ids(args):
    """
    %prog ids cdhit.clstr

    Get the representative ids from clstr file.
    """
    p = OptionParser(ids.__doc__)
    p.add_option("--prefix", type="int",
                 help="Find rep id for prefix of len [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clstrfile, = args
    cf = ClstrFile(clstrfile)
    prefix = opts.prefix
    if prefix:
        reads = list(cf.iter_reps_prefix(prefix=prefix))
    else:
        reads = list(cf.iter_reps())

    nreads = len(reads)
    idsfile = clstrfile.replace(".clstr", ".ids")
    fw = open(idsfile, "w")
    for i, name in reads:
        print("\t".join(str(x) for x in (i, name)), file=fw)

    logging.debug("A total of {0} unique reads written to `{1}`.".\
            format(nreads, idsfile))
    fw.close()

    return idsfile
Example #13
def csv(args):
    """
    %prog csv excelfile

    Convert EXCEL to csv file.
    """
    from xlrd import open_workbook

    p = OptionParser(csv.__doc__)
    p.set_sep(sep=',')
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    excelfile, = args
    sep = opts.sep
    csvfile = excelfile.rsplit(".", 1)[0] + ".csv"
    wb = open_workbook(excelfile)
    fw = open(csvfile, "w")
    for s in wb.sheets():
        print('Sheet:', s.name, file=sys.stderr)
        for row in range(s.nrows):
            values = []
            for col in range(s.ncols):
                values.append(s.cell(row, col).value)
            print(sep.join(str(x) for x in values), file=fw)
Example #14
def main():
    """
    %prog numbers1.txt number2.txt ...

    Print histogram of the data files. The data files contain one number per
    line. If more than one file is given, the program will combine the
    histograms into the same plot.
    """
    allowed_format = ("emf", "eps", "pdf", "png", "ps", \
                      "raw", "rgba", "svg", "svgz")
    p = OptionParser(main.__doc__)
    p.add_option("--skip", default=0, type="int",
            help="skip the first several lines [default: %default]")
    p.set_histogram()
    p.add_option("--tags", dest="tags", default=None,
            help="tags for data if multiple input files, comma sep")
    p.add_option("--ascii", default=False, action="store_true",
            help="print ASCII text stem-leaf plot [default: %default]")
    p.add_option("--base", default="0", choices=("0", "2", "10"),
            help="use logarithm axis with base, 0 to disable [default: %default]")
    p.add_option("--facet", default=False, action="store_true",
            help="place multiple histograms side-by-side [default: %default]")
    p.add_option("--fill", default="white",
            help="color of the bin [default: %default]")
    p.add_option("--format", default="pdf", choices=allowed_format,
            help="Generate image of format [default: %default]")
    p.add_option("--quick", default=False, action="store_true",
            help="Use quick plot, assuming bins are already counted")
    p.add_option("--noprintstats", default=False, action="store_true",
            help="Write basic stats when using --quick")
    opts, args = p.parse_args()

    if len(args) < 1:
        sys.exit(not p.print_help())

    skip = opts.skip
    vmin, vmax = opts.vmin, opts.vmax
    bins = opts.bins
    xlabel, title = opts.xlabel, opts.title
    title = title or args[0]
    base = int(opts.base)
    fileno = len(args)

    if opts.quick:
        assert fileno == 1, "Single input file expected using --quick"
        filename = args[0]
        figname = filename.rsplit(".", 1)[0] + ".pdf"
        data = DictFile(filename, keycast=int, cast=int)
        quickplot(data, vmin, vmax, xlabel, title, figname=figname,
                  print_stats=(not opts.noprintstats))
        return

    if fileno == 1:
        histogram(args[0], vmin, vmax, xlabel, title, outfmt=opts.format,
                bins=bins, skip=skip, ascii=opts.ascii,
                base=base, fill=opts.fill)
    else:
        histogram_multiple(args, vmin, vmax, xlabel, title, outfmt=opts.format,
                tags=opts.tags, bins=bins, skip=skip, ascii=opts.ascii,
                facet=opts.facet, fill=opts.fill)
Example #15
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filters the BLAST hits. Default is megablast.
    """
    task_choices = ("blastn", "blastn-short", "dc-megablast", \
                    "megablast", "vecscreen")
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=None, evalue=.01)
    p.add_option("--wordsize", type="int", help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
            help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
            help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    q = op.basename(queryfasta).split(".")[0]
    r = op.basename(reffasta).split(".")[0]
    blastfile = "{0}.{1}.blast".format(q, r)

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid, evalue=opts.evalue,
                  hitlen=None, best=opts.best, task=opts.task, cpus=opts.cpus)

    return blastfile
Example #16
def passthrough(args):
    """
    %prog passthrough chrY.vcf chrY.new.vcf

    Pass through Y and MT vcf.
    """
    p = OptionParser(passthrough.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, newvcffile = args
    fp = open(vcffile)
    fw = open(newvcffile, "w")
    gg = ["0/0", "0/1", "1/1"]
    for row in fp:
        if row[0] == "#":
            print(row.strip(), file=fw)
            continue

        v = VcfLine(row)
        v.filter = "PASS"
        v.format = "GT:GP"
        probs = [0] * 3
        probs[gg.index(v.genotype)] = 1
        v.genotype = v.genotype.replace("/", "|") + \
                ":{0}".format(",".join("{0:.3f}".format(x) for x in probs))
        print(v, file=fw)
    fw.close()
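A standalone sketch of the GT:GP construction above, using an assumed input
genotype: the called genotype receives probability 1 and the other two
genotypes receive 0.

gg = ["0/0", "0/1", "1/1"]
genotype = "0/1"  # assumed call
probs = [0] * 3
probs[gg.index(genotype)] = 1
gt_gp = genotype.replace("/", "|") + \
        ":{0}".format(",".join("{0:.3f}".format(x) for x in probs))
print(gt_gp)  # 0|1:0.000,1.000,0.000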
Example #17
def agp(args):
    """
    %prog agp <fastafile|sizesfile>

    Convert the sizes file to a trivial AGP file.
    """
    from jcvi.formats.agp import OO

    p = OptionParser(agp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    sizesfile, = args
    sizes = Sizes(sizesfile)
    agpfile = sizes.filename.rsplit(".", 1)[0] + ".agp"
    fw = open(agpfile, "w")
    o = OO()  # Without a filename
    for ctg, size in sizes.iter_sizes():
        o.add(ctg, ctg, size)

    o.write_AGP(fw)
    fw.close()
    logging.debug("AGP file written to `{0}`.".format(agpfile))

    return agpfile
Example #18
def nucmer(args):
    """
    %prog nucmer mappings.bed MTR.fasta assembly.fasta chr1 3

    Select specific chromosome region based on MTR mapping. The above command
    will extract chr1:2,000,001-3,000,000.
    """
    p = OptionParser(nucmer.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 5:
        sys.exit(not p.print_help())

    mapbed, mtrfasta, asmfasta, chr, idx = args
    idx = int(idx)
    m1 = 1000000
    bedfile = "sample.bed"
    bed = Bed()
    bed.add("\t".join(str(x) for x in (chr, (idx - 1) * m1, idx * m1)))
    bed.print_to_file(bedfile)

    cmd = "intersectBed -a {0} -b {1} -nonamecheck -sorted | cut -f4".format(mapbed, bedfile)
    idsfile = "query.ids"
    sh(cmd, outfile=idsfile)

    sfasta = fastaFromBed(bedfile, mtrfasta)
    qfasta = "query.fasta"
    cmd = "faSomeRecords {0} {1} {2}".format(asmfasta, idsfile, qfasta)
    sh(cmd)

    cmd = "nucmer {0} {1}".format(sfasta, qfasta)
    sh(cmd)

    mummerplot_main(["out.delta", "--refcov=0"])
    sh("mv out.pdf {0}.{1}.pdf".format(chr, idx))
Example #19
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    cmd += " nthreads=16 gprobs=true"
    mm.add(vcffile, outfile, cmd)

    mm.write()
Example #20
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile, = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    nativefile = pf + ".native"
    if need_update(gsnapfile, nativefile):
        cmd = op.join(EYHOME, "convert2native.pl")
        cmd += " --gsnap {0} -o {1}".format(gsnapfile, nativefile)
        cmd += " -proc {0}".format(opts.cpus)
        sh(cmd)

    snpfile = pf + ".snp"
    if need_update(nativefile, snpfile):
        cmd = op.join(EYHOME, "SNPs/SNP_Discovery-short.pl")
        cmd += " --native {0} -o {1}".format(nativefile, snpfile)
        cmd += " -a 2 -ac 0.3 -c 0.8"
        sh(cmd)
Example #21
def group(args):
    """
    %prog group anchorfiles

    Group the anchors into ortho-groups. Can input multiple anchor files.
    """
    p = OptionParser(group.__doc__)
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    anchorfiles = args
    groups = Grouper()
    for anchorfile in anchorfiles:
        ac = AnchorFile(anchorfile)
        for a, b, idx in ac.iter_pairs():
            groups.join(a, b)

    logging.debug("Created {0} groups with {1} members.".\
                  format(len(groups), groups.num_members))

    outfile = opts.outfile
    fw = must_open(outfile, "w")
    for g in groups:
        print >> fw, ",".join(sorted(g))
    fw.close()

    return outfile
Example #22
def filter(args):
    """
    %prog filter consensus.fasta

    Filter consensus sequence with min cluster size.
    """
    from jcvi.formats.fasta import Fasta, SeqIO

    p = OptionParser(filter.__doc__)
    p.add_option("--minsize", default=10, type="int",
                 help="Minimum cluster size")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    minsize = opts.minsize
    f = Fasta(fastafile, lazy=True)
    fw = must_open(opts.outfile, "w")
    for desc, rec in f.iterdescriptions_ordered():
        if desc.startswith("singleton"):
            continue
        # consensus_for_cluster_0 with 63 sequences
        name, w, size, seqs = desc.split()
        assert w == "with"
        size = int(size)
        if size < minsize:
            continue
        SeqIO.write(rec, fw, "fasta")
Example #23
def fromimpute2(args):
    """
    %prog fromimpute2 impute2file fastafile 1

    Convert impute2 output to vcf file. Imputed file looks like:

    --- 1:10177:A:AC 10177 A AC 0.451 0.547 0.002
    """
    p = OptionParser(fromimpute2.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    impute2file, fastafile, chr = args
    fasta = Fasta(fastafile)
    print(get_vcfstanza(fastafile, fasta))
    fp = open(impute2file)
    seen = set()
    for row in fp:
        snp_id, rsid, pos, ref, alt, aa, ab, bb = row.split()
        pos = int(pos)
        if pos in seen:
            continue
        seen.add(pos)
        code = max((float(aa), "0/0"), (float(ab), "0/1"), (float(bb), "1/1"))[-1]
        tag = "PR" if snp_id == chr else "IM"
        print "\t".join(str(x) for x in \
                (chr, pos, rsid, ref, alt, ".", ".", tag, \
                "GT:GP", code + ":" + ",".join((aa, ab, bb))))
Example #24
def uniq(args):
    """
    %prog uniq vcffile

    Retain only the first entry in vcf file.
    """
    from urllib.parse import parse_qs

    p = OptionParser(uniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    vcffile, = args
    fp = must_open(vcffile)
    data = []
    for row in fp:
        if row[0] == '#':
            print(row.strip())
            continue
        v = VcfLine(row)
        data.append(v)

    for pos, vv in groupby(data, lambda x: x.pos):
        vv = list(vv)
        if len(vv) == 1:
            print(vv[0])
            continue
        bestv = max(vv, key=lambda x: float(parse_qs(x.info)["R2"][0]))
        print(bestv)
Example #25
def sample(args):
    """
    %prog sample vcffile 0.9

    Sample subset of vcf file.
    """
    from random import random

    p = OptionParser(sample.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, ratio = args
    ratio = float(ratio)
    fp = open(vcffile)
    pf = vcffile.rsplit(".", 1)[0]
    kept = pf + ".kept.vcf"
    withheld = pf + ".withheld.vcf"
    fwk = open(kept, "w")
    fww = open(withheld, "w")
    nkept = nwithheld = 0
    for row in fp:
        if row[0] == '#':
            print(row.strip(), file=fwk)
            continue
        if random() < ratio:
            nkept += 1
            print(row.strip(), file=fwk)
        else:
            nwithheld += 1
            print(row.strip(), file=fww)
    logging.debug("{0} records kept to `{1}`".format(nkept, kept))
    logging.debug("{0} records withheld to `{1}`".format(nwithheld, withheld))
Example #26
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty [default: %default]")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    twobitfiles = []
    for fastafile in args:
        tbfile = faToTwoBit(fastafile)
        twobitfiles.append(tbfile)

    oldtwobit, newtwobit = twobitfiles
    cmd = "pblat -threads={0}".format(opts.cpus) if which("pblat") else "blat"
    cmd += " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".\
                format(opts.minscore, opts.minid)
    pslfile = "{0}.{1}.psl".format(*(op.basename(x).split('.')[0] \
                for x in (newfasta, oldfasta)))
    cmd += pslfile
    sh(cmd)
Example #27
def summary(args):
    """
    %prog summary old.new.chain old.fasta new.fasta

    Provide stats of the chain file.
    """
    from jcvi.formats.fasta import summary as fsummary
    from jcvi.utils.cbook import percentage, human_size

    p = OptionParser(summary.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    chainfile, oldfasta, newfasta = args
    chain = Chain(chainfile)
    ungapped, dt, dq = chain.ungapped, chain.dt, chain.dq
    print >> sys.stderr, "File `{0}` contains {1} chains.".\
                format(chainfile, len(chain))
    print >> sys.stderr, "ungapped={0} dt={1} dq={2}".\
                format(human_size(ungapped), human_size(dt), human_size(dq))

    oldreal, oldnn, oldlen = fsummary([oldfasta, "--outfile=/dev/null"])
    print >> sys.stderr, "Old fasta (`{0}`) mapped: {1}".\
                format(oldfasta, percentage(ungapped, oldreal))

    newreal, newnn, newlen = fsummary([newfasta, "--outfile=/dev/null"])
    print >> sys.stderr, "New fasta (`{0}`) mapped: {1}".\
                format(newfasta, percentage(ungapped, newreal))
Example #28
def uclust(args):
    """
    %prog uclust fastafile

    Use `usearch` to remove duplicate reads.
    """
    p = OptionParser(uclust.__doc__)
    p.set_align(pctid=98)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.

    pf, sf = fastafile.rsplit(".", 1)
    sortedfastafile = pf + ".sorted.fasta"
    if need_update(fastafile, sortedfastafile):
        cmd = "usearch -sortbylength {0} -fastaout {1}".\
                    format(fastafile, sortedfastafile)
        sh(cmd)

    pf = fastafile + ".P{0}.uclust".format(opts.pctid)
    clstrfile = pf + ".clstr"
    centroidsfastafile = pf + ".centroids.fasta"
    if need_update(sortedfastafile, centroidsfastafile):
        cmd = "usearch -cluster_smallmem {0}".format(sortedfastafile)
        cmd += " -id {0}".format(identity)
        cmd += " -uc {0} -centroids {1}".format(clstrfile, centroidsfastafile)
        sh(cmd)
Example #29
def fromagp(args):
    """
    %prog fromagp agpfile componentfasta objectfasta

    Generate chain file from AGP format. The components represent the old
    genome (target) and the objects represent new genome (query).
    """
    from jcvi.formats.agp import AGP
    from jcvi.formats.sizes import Sizes

    p = OptionParser(fromagp.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    agpfile, componentfasta, objectfasta = args
    chainfile = agpfile.rsplit(".", 1)[0] + ".chain"
    fw = open(chainfile, "w")
    agp = AGP(agpfile)
    componentsizes = Sizes(componentfasta).mapping
    objectsizes = Sizes(objectfasta).mapping
    chain = "chain"
    score = 1000
    tStrand = "+"
    id = 0
    for a in agp:
        if a.is_gap:
            continue

        tName = a.component_id
        tSize = componentsizes[tName]
        tStart = a.component_beg
        tEnd = a.component_end
        tStart -= 1

        qName = a.object
        qSize = objectsizes[qName]
        qStrand = "-" if a.orientation == "-" else "+"
        qStart = a.object_beg
        qEnd = a.object_end
        if qStrand == '-':
            _qStart = qSize - qEnd + 1
            _qEnd = qSize - qStart + 1
            qStart, qEnd = _qStart, _qEnd
        qStart -= 1

        id += 1
        size = a.object_span
        headerline = "\t".join(str(x) for x in (
             chain, score, tName, tSize, tStrand, tStart,
             tEnd, qName, qSize, qStrand, qStart, qEnd, id
        ))
        alignmentline = size
        print(headerline, file=fw)
        print(alignmentline, file=fw)
        print("", file=fw)

    fw.close()
    logging.debug("File written to `{0}`.".format(chainfile))
Example #30
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    index([uniqsam])
Example #31
def scaffold(args):
    """
    %prog scaffold ctgfasta reads1.fasta mapping1.bed
                            reads2.fasta mapping2.bed ...

    Run BAMBUS on set of contigs, reads and read mappings.
    """
    from more_itertools import grouper

    from jcvi.formats.base import FileMerger
    from jcvi.formats.bed import mates
    from jcvi.formats.contig import frombed
    from jcvi.formats.fasta import join

    p = OptionParser(scaffold.__doc__)
    p.set_rclip(rclip=1)
    p.add_option("--conf",
                 help="BAMBUS configuration file [default: %default]")
    p.add_option(
        "--prefix",
        default=False,
        action="store_true",
        help="Only keep links between IDs with same prefix [default: %default]",
    )
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs < 3 or nargs % 2 != 1:
        sys.exit(not p.print_help())

    rclip = opts.rclip
    ctgfasta = args[0]
    duos = list(grouper(args[1:], 2))
    trios = []
    for fastafile, bedfile in duos:
        prefix = bedfile.rsplit(".", 1)[0]
        matefile = prefix + ".mates"
        matebedfile = matefile + ".bed"
        if need_update(bedfile, [matefile, matebedfile]):
            matesopt = [
                bedfile,
                "--lib",
                "--nointra",
                "--rclip={0}".format(rclip),
                "--cutoff={0}".format(opts.cutoff),
            ]
            if opts.prefix:
                matesopt += ["--prefix"]
            matefile, matebedfile = mates(matesopt)
        trios.append((fastafile, matebedfile, matefile))

    # Merge the readfasta, bedfile and matefile
    bbfasta, bbbed, bbmate = "bambus.reads.fasta", "bambus.bed", "bambus.mates"

    for files, outfile in zip(zip(*trios), (bbfasta, bbbed, bbmate)):
        FileMerger(files, outfile=outfile).merge(checkexists=True)

    ctgfile = "bambus.contig"
    idsfile = "bambus.ids"
    frombedInputs = [bbbed, ctgfasta, bbfasta]
    if need_update(frombedInputs, ctgfile):
        frombed(frombedInputs)

    inputfasta = "bambus.contigs.fasta"
    singletonfasta = "bambus.singletons.fasta"
    cmd = "faSomeRecords {0} {1} ".format(ctgfasta, idsfile)
    sh(cmd + inputfasta)
    sh(cmd + singletonfasta + " -exclude")

    # Run bambus
    prefix = "bambus"
    cmd = "goBambus -c {0} -m {1} -o {2}".format(ctgfile, bbmate, prefix)
    if opts.conf:
        cmd += " -C {0}".format(opts.conf)
    sh(cmd)

    cmd = "untangle -e {0}.evidence.xml -s {0}.out.xml -o {0}.untangle.xml".format(
        prefix)
    sh(cmd)

    final = "final"
    cmd = ("printScaff -e {0}.evidence.xml -s {0}.untangle.xml -l {0}.lib "
           "-merge -detail -oo -sum -o {1}".format(prefix, final))
    sh(cmd)

    oofile = final + ".oo"
    join([inputfasta, "--oo={0}".format(oofile)])
Example #32
def calibrate(args):
    """
    %prog calibrate calibrate.JPG boxsize

    Calibrate pixel-inch ratio and color adjustment.
    - `calibrate.JPG` is the photo containing a colorchecker
    - `boxsize` is the measured size of the boxes on the printed colorchecker,
      in square centimeters (cm2)
    """
    xargs = args[2:]
    p = OptionParser(calibrate.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    imagefile, boxsize = args
    boxsize = float(boxsize)

    # Read in color checker
    colorcheckerfile = op.join(datadir, "colorchecker.txt")
    colorchecker = []
    expected = 0
    for row in open(colorcheckerfile):
        boxes = row.split()
        colorchecker.append(boxes)
        expected += len(boxes)

    folder = op.split(imagefile)[0]
    objects = seeds([imagefile, "--outdir={0}".format(folder)] + xargs)
    nseeds = len(objects)
    logging.debug("Found {0} boxes (expected={1})".format(nseeds, expected))
    assert expected - 4 <= nseeds <= expected + 4, \
                "Number of boxes drastically different from {0}".format(expected)

    # Calculate pixel-cm ratio
    boxes = [t.area for t in objects]
    reject = reject_outliers(boxes)
    retained_boxes = [b for r, b in zip(reject, boxes) if not r]
    mbox = np.median(retained_boxes)  # in pixels
    pixel_cm_ratio = (mbox / boxsize) ** .5
    logging.debug("Median box size: {0} pixels. Measured box size: {1} cm2".\
                format(mbox, boxsize))
    logging.debug("Pixel-cm ratio: {0}".format(pixel_cm_ratio))

    xs = [t.x for t in objects]
    ys = [t.y for t in objects]
    idx_xs = get_kmeans(xs, 6)
    idx_ys = get_kmeans(ys, 4)
    for xi, yi, s in zip(idx_xs, idx_ys, objects):
        s.rank = (yi, xi)

    objects.sort(key=lambda x: x.rank)

    colormap = []
    for s in objects:
        x, y = s.rank
        observed, expected = s.rgb, rgb_to_triplet(colorchecker[x][y])
        colormap.append((np.array(observed), np.array(expected)))

    # Color transfer
    tr0 = np.eye(3).flatten()
    print >> sys.stderr, "Initial distance:", total_error(tr0, colormap)
    tr = fmin(total_error, tr0, args=(colormap,))
    tr.resize((3, 3))
    print >> sys.stderr, "RGB linear transform:\n", tr
    calib = {"PixelCMratio": pixel_cm_ratio,
             "RGBtransform": tr.tolist()}

    jsonfile = op.join(folder, "calibrate.json")
    fw = must_open(jsonfile, "w")
    print(json.dumps(calib, indent=4), file=fw)
    fw.close()
    logging.debug("Calibration specs written to `{0}`.".format(jsonfile))

    return jsonfile
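Worked numbers (assumed) for the pixel-cm ratio above: box area scales with
the square of the linear scale, hence the square root.

mbox, boxsize = 40000.0, 4.0           # median box: 40000 px^2 over 4 cm^2
pixel_cm_ratio = (mbox / boxsize) ** .5
assert pixel_cm_ratio == 100.0         # 100 pixels per cm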
Example #33
def A50(args):
    """
    %prog A50 contigs_A.fasta contigs_B.fasta ...

    Plots A50 graphics, see blog post (http://blog.malde.org/index.php/a50/)
    """
    p = OptionParser(A50.__doc__)
    p.add_option(
        "--overwrite",
        default=False,
        action="store_true",
        help="overwrite .rplot file if exists",
    )
    p.add_option(
        "--cutoff",
        default=0,
        type="int",
        dest="cutoff",
        help="use contigs above certain size",
    )
    p.add_option(
        "--stepsize",
        default=10,
        type="int",
        dest="stepsize",
        help="stepsize for the distribution",
    )
    opts, args = p.parse_args(args)

    if not args:
        sys.exit(p.print_help())

    import numpy as np
    from jcvi.utils.table import loadtable

    stepsize = opts.stepsize  # use stepsize to speed up drawing
    rplot = "A50.rplot"
    if not op.exists(rplot) or opts.overwrite:
        fw = open(rplot, "w")
        header = "\t".join(("index", "cumsize", "fasta"))
        statsheader = ("Fasta", "L50", "N50", "Min", "Max", "Average", "Sum",
                       "Counts")
        statsrows = []
        print(header, file=fw)
        for fastafile in args:
            f = Fasta(fastafile, index=False)
            ctgsizes = [length for k, length in f.itersizes()]
            ctgsizes = np.array(ctgsizes)

            a50, l50, n50 = calculate_A50(ctgsizes, cutoff=opts.cutoff)
            cmin, cmax, cmean = min(ctgsizes), max(ctgsizes), np.mean(ctgsizes)
            csum, counts = np.sum(ctgsizes), len(ctgsizes)
            cmean = int(round(cmean))
            statsrows.append(
                (fastafile, l50, n50, cmin, cmax, cmean, csum, counts))

            logging.debug("`{0}` ctgsizes: {1}".format(fastafile, ctgsizes))

            tag = "{0} (L50={1})".format(
                op.basename(fastafile).rsplit(".", 1)[0], l50)
            logging.debug(tag)

            for i, s in zip(range(0, len(a50), stepsize), a50[::stepsize]):
                print("\t".join((str(i), str(s / 1000000.0), tag)), file=fw)
        fw.close()

        table = loadtable(statsheader, statsrows)
        print(table, file=sys.stderr)

    generate_plot(rplot)
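calculate_A50 is defined elsewhere; following the blog post linked in the
docstring, the A50 curve can be read as the cumulative contig size with
contigs taken largest-first. A minimal sketch under that assumption:

import numpy as np

ctgsizes = np.array([400, 100, 300, 200])    # assumed contig sizes
a50 = np.sort(ctgsizes)[::-1].cumsum()       # array([ 400,  700,  900, 1000])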
Example #34
def covlen(args):
    """
    %prog covlen covfile fastafile

    Plot coverage vs length. `covfile` is two-column listing contig id and
    depth of coverage.
    """
    import numpy as np
    import pandas as pd
    import seaborn as sns
    from jcvi.formats.base import DictFile

    p = OptionParser(covlen.__doc__)
    p.add_option("--maxsize",
                 default=1000000,
                 type="int",
                 help="Max contig size")
    p.add_option("--maxcov", default=100, type="int", help="Max contig size")
    p.add_option("--color", default="m", help="Color of the data points")
    p.add_option(
        "--kind",
        default="scatter",
        choices=("scatter", "reg", "resid", "kde", "hex"),
        help="Kind of plot to draw",
    )
    opts, args, iopts = p.set_image_options(args, figsize="8x8")

    if len(args) != 2:
        sys.exit(not p.print_help())

    covfile, fastafile = args
    cov = DictFile(covfile, cast=float)
    s = Sizes(fastafile)
    data = []
    maxsize, maxcov = opts.maxsize, opts.maxcov
    for ctg, size in s.iter_sizes():
        c = cov.get(ctg, 0)
        if size > maxsize:
            continue
        if c > maxcov:
            continue
        data.append((size, c))

    x, y = zip(*data)
    x = np.array(x)
    y = np.array(y)
    logging.debug("X size {0}, Y size {1}".format(x.size, y.size))

    df = pd.DataFrame()
    xlab, ylab = "Length", "Coverage of depth (X)"
    df[xlab] = x
    df[ylab] = y
    sns.jointplot(
        xlab,
        ylab,
        kind=opts.kind,
        data=df,
        xlim=(0, maxsize),
        ylim=(0, maxcov),
        stat_func=None,
        edgecolor="w",
        color=opts.color,
    )

    figname = covfile + ".pdf"
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Example #35
def qc(args):
    """
    %prog qc prefix

    Expects data files including:
    1. `prefix.bedpe` draws Bezier curve between paired reads
    2. `prefix.sizes` draws length of the contig/scaffold
    3. `prefix.gaps.bed` mark the position of the gaps in sequence
    4. `prefix.bed.coverage` plots the base coverage
    5. `prefix.pairs.bed.coverage` plots the clone coverage

    See assembly.coverage.posmap() for the generation of these files.
    """
    from jcvi.graphics.glyph import Bezier

    p = OptionParser(qc.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    (prefix, ) = args
    scf = prefix

    # All these files *must* be present in the current folder
    bedpefile = prefix + ".bedpe"
    fastafile = prefix + ".fasta"
    sizesfile = prefix + ".sizes"
    gapsbedfile = prefix + ".gaps.bed"
    bedfile = prefix + ".bed"
    bedpefile = prefix + ".bedpe"
    pairsbedfile = prefix + ".pairs.bed"

    sizes = Sizes(fastafile).mapping
    size = sizes[scf]

    fig = plt.figure(1, (8, 5))
    root = fig.add_axes([0, 0, 1, 1])

    # the scaffold
    root.add_patch(Rectangle((0.1, 0.15), 0.8, 0.03, fc="k"))

    # basecoverage and matecoverage
    ax = fig.add_axes([0.1, 0.45, 0.8, 0.45])

    bins = 200  # Smooth the curve
    basecoverage = Coverage(bedfile, sizesfile)
    matecoverage = Coverage(pairsbedfile, sizesfile)

    x, y = basecoverage.get_plot_data(scf, bins=bins)
    (baseline, ) = ax.plot(x, y, "g-")
    x, y = matecoverage.get_plot_data(scf, bins=bins)
    (mateline, ) = ax.plot(x, y, "r-")
    legends = ("Base coverage", "Mate coverage")
    leg = ax.legend((baseline, mateline), legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)
    ax.set_xlim(0, size)

    # draw the read pairs
    fp = open(bedpefile)
    pairs = []
    for row in fp:
        scf, astart, aend, scf, bstart, bend, clonename = row.split()
        astart, bstart = int(astart), int(bstart)
        aend, bend = int(aend), int(bend)
        start = min(astart, bstart) + 1
        end = max(aend, bend)
        pairs.append((start, end))

    bpratio = 0.8 / size
    cutoff = 1000  # inserts smaller than this are not plotted
    # this convert from base => x-coordinate
    pos = lambda x: (0.1 + x * bpratio)
    ypos = 0.15 + 0.03
    for start, end in pairs:
        dist = end - start

        if dist < cutoff:
            continue

        dist = min(dist, 10000)
        # 10Kb == .25 canvas height
        height = 0.25 * dist / 10000
        xstart = pos(start)
        xend = pos(end)
        p0 = (xstart, ypos)
        p1 = (xstart, ypos + height)
        p2 = (xend, ypos + height)
        p3 = (xend, ypos)
        Bezier(root, p0, p1, p2, p3)

    # gaps on the scaffold
    fp = open(gapsbedfile)
    for row in fp:
        b = BedLine(row)
        start, end = b.start, b.end
        xstart = pos(start)
        xend = pos(end)
        root.add_patch(Rectangle((xstart, 0.15), xend - xstart, 0.03, fc="w"))

    root.text(0.5, 0.1, scf, color="b", ha="center")
    warn_msg = "Only the inserts > {0}bp are shown".format(cutoff)
    root.text(0.5, 0.05, warn_msg, color="gray", ha="center")
    # clean up and output
    set_human_base_axis(ax)
    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    figname = prefix + ".pdf"
    savefig(figname, dpi=300)
Example #36
def scaffold(args):
    """
    %prog scaffold scaffold.fasta synteny.blast synteny.sizes synteny.bed
                         physicalmap.blast physicalmap.sizes physicalmap.bed

    As evaluation of scaffolding, visualize external line of evidences:
    * Plot synteny to an external genome
    * Plot alignments to physical map
    * Plot alignments to genetic map (TODO)

    Each trio defines one panel to be plotted. The blastfile defines the
    matches between the evidence and the scaffolds, and the evidence sizes
    and evidence bed files supply the coordinates for the dot plots.

    This script plots a dot at the corresponding location in each dot plot;
    the plots are drawn one contig/scaffold per image.
    """
    from jcvi.utils.iter import grouper

    p = OptionParser(scaffold.__doc__)
    p.add_option(
        "--cutoff",
        type="int",
        default=1000000,
        help="Plot scaffolds with size larger than",
    )
    p.add_option(
        "--highlights",
        help="A set of regions in BED format to highlight",
    )
    opts, args, iopts = p.set_image_options(args, figsize="14x8", dpi=150)

    if len(args) < 4 or len(args) % 3 != 1:
        sys.exit(not p.print_help())

    highlights = opts.highlights
    scafsizes = Sizes(args[0])
    trios = list(grouper(args[1:], 3))
    trios = [(a, Sizes(b), Bed(c)) for a, b, c in trios]
    if highlights:
        hlbed = Bed(highlights)

    for scaffoldID, scafsize in scafsizes.iter_sizes():
        if scafsize < opts.cutoff:
            continue
        logging.debug("Loading {0} (size={1})".format(scaffoldID,
                                                      thousands(scafsize)))

        tmpname = scaffoldID + ".sizes"
        tmp = open(tmpname, "w")
        tmp.write("{0}\t{1}".format(scaffoldID, scafsize))
        tmp.close()

        tmpsizes = Sizes(tmpname)
        tmpsizes.close(clean=True)

        subhighlights = None
        if highlights:
            subhighlights = list(hlbed.sub_bed(scaffoldID))

        imagename = ".".join((scaffoldID, opts.format))
        plot_one_scaffold(
            scaffoldID,
            tmpsizes,
            None,
            trios,
            imagename,
            iopts,
            highlights=subhighlights,
        )
Example #37
def coverage(args):
    """
    %prog coverage fastafile ctg bedfile1 bedfile2 ..

    Plot coverage from a set of BED files that contain the read mappings. The
    paired read spans will be converted to a new bedfile that contains the
    happy mates. ctg is the chr/scf/ctg that you want to plot the histogram on.

    If the bedfiles already contain the clone spans, turn on --spans.
    """
    from jcvi.formats.bed import mates, bedpe

    p = OptionParser(coverage.__doc__)
    p.add_option("--ymax", default=None, type="int", help="Limit ymax")
    p.add_option(
        "--spans",
        default=False,
        action="store_true",
        help="BED files already contain clone spans",
    )
    opts, args, iopts = p.set_image_options(args, figsize="8x5")

    if len(args) < 3:
        sys.exit(not p.print_help())

    fastafile, ctg = args[0:2]
    bedfiles = args[2:]

    sizes = Sizes(fastafile)
    size = sizes.mapping[ctg]

    plt.figure(1, (iopts.w, iopts.h))
    ax = plt.gca()

    bins = 100  # smooth the curve
    lines = []
    legends = []
    not_covered = []
    yy = 0.9
    for bedfile, c in zip(bedfiles, "rgbcky"):
        if not opts.spans:
            pf = bedfile.rsplit(".", 1)[0]
            matesfile = pf + ".mates"
            if need_update(bedfile, matesfile):
                matesfile, matesbedfile = mates([bedfile, "--lib"])

            bedspanfile = pf + ".spans.bed"
            if need_update(matesfile, bedspanfile):
                bedpefile, bedspanfile = bedpe(
                    [bedfile, "--span", "--mates={0}".format(matesfile)])
            bedfile = bedspanfile

        bedsum = Bed(bedfile).sum(seqid=ctg)
        notcoveredbases = size - bedsum

        legend = bedfile.split(".")[0]
        msg = "{0}: {1} bp not covered".format(legend,
                                               thousands(notcoveredbases))
        not_covered.append(msg)
        print(msg, file=sys.stderr)
        ax.text(0.1, yy, msg, color=c, size=9, transform=ax.transAxes)
        yy -= 0.08

        cov = Coverage(bedfile, sizes.filename)
        x, y = cov.get_plot_data(ctg, bins=bins)
        (line, ) = ax.plot(x, y, "-", color=c, lw=2, alpha=0.5)
        lines.append(line)
        legends.append(legend)

    leg = ax.legend(lines, legends, shadow=True, fancybox=True)
    leg.get_frame().set_alpha(0.5)

    ylabel = "Average depth per {0}Kb".format(size / bins / 1000)
    ax.set_xlim(0, size)
    ax.set_ylim(0, opts.ymax)
    ax.set_xlabel(ctg)
    ax.set_ylabel(ylabel)
    set_human_base_axis(ax)

    figname = "{0}.{1}.pdf".format(fastafile, ctg)
    savefig(figname, dpi=iopts.dpi, iopts=iopts)
Example #38
def subset(args):
    """
    %prog subset pairsfile ksfile1 ksfile2 ... -o pairs.ks

    Subset some pre-calculated Ks/Ka values (in ksfile) according to pairs
    in a tab-delimited pairsfile/anchorfile.
    """
    p = OptionParser(subset.__doc__)
    p.add_option(
        "--noheader", action="store_true", help="don't write ksfile header line"
    )
    p.add_option(
        "--block", action="store_true", help="preserve block structure in input"
    )
    p.set_stripnames()
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    pairsfile, ksfiles = args[0], args[1:]
    noheader = opts.noheader
    block = opts.block
    if block:
        noheader = True
    outfile = opts.outfile

    ksvals = {}
    for ksfile in ksfiles:
        ksvals.update(
            dict(
                (line.name, line)
                for line in KsFile(ksfile, strip_names=opts.strip_names)
            )
        )

    fp = open(pairsfile)
    fw = must_open(outfile, "w")

    if not noheader:
        print(fields, file=fw)

    i = j = 0
    for row in fp:
        if row[0] == "#":
            if block:
                print(row.strip(), file=fw)
            continue
        a, b = row.split()[:2]
        name = ";".join((a, b))
        if name not in ksvals:
            name = ";".join((b, a))
            if name not in ksvals:
                j += 1
                print("\t".join((a, b, ".", ".")), file=fw)
                continue
        ksline = ksvals[name]
        if block:
            print("\t".join(str(x) for x in (a, b, ksline.ks)), file=fw)
        else:
            ksline.name = ";".join((a, b))
            print(ksline, file=fw)
        i += 1
    fw.close()

    logging.debug("{0} pairs not found in ksfiles".format(j))
    logging.debug("{0} ks records written to `{1}`".format(i, outfile))
    return outfile
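The lookup above is symmetric: a pair may be stored under either orientation
of its `a;b` key, so the reversed key is tried before giving up. A small
sketch with an assumed ksvals dict:

ksvals = {"geneA;geneB": 0.42}   # assumed pre-computed Ks value
a, b = "geneB", "geneA"
name = ";".join((a, b))
if name not in ksvals:
    name = ";".join((b, a))
print(ksvals.get(name))  # 0.42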
Example #39
def report(args):
    """
    %prog report ksfile

    Generate a report given a Ks result file (as produced by synonymous_calc.py).
    Describe the median Ks, Ka values, as well as the distribution in a
    stem-leaf plot.
    """
    from jcvi.utils.cbook import SummaryStats
    from jcvi.graphics.histogram import stem_leaf_plot

    p = OptionParser(report.__doc__)
    p.add_option(
        "--pdf",
        default=False,
        action="store_true",
        help="Generate graphic output for the histogram",
    )
    p.add_option(
        "--components",
        default=1,
        type="int",
        help="Number of components to decompose peaks",
    )
    add_plot_options(p)
    opts, args, iopts = p.set_image_options(args, figsize="5x5")

    if len(args) != 1:
        sys.exit(not p.print_help())

    (ks_file,) = args
    data = KsFile(ks_file)
    ks_min = opts.vmin
    ks_max = opts.vmax
    bins = opts.bins

    for f in fields.split(",")[1:]:
        columndata = [getattr(x, f) for x in data]
        ks = "ks" in f
        if not ks:
            continue

        columndata = [x for x in columndata if ks_min <= x <= ks_max]

        st = SummaryStats(columndata)
        title = "{0} ({1}): ".format(descriptions[f], ks_file)
        title += "Median:{0:.3f} (1Q:{1:.3f}|3Q:{2:.3f}||".format(
            st.median, st.firstq, st.thirdq
        )
        title += "Mean:{0:.3f}|Std:{1:.3f}||N:{2})".format(st.mean, st.sd, st.size)

        tbins = (0, ks_max, bins) if ks else (0, 0.6, 10)
        digit = 2 if (ks_max * 1.0 / bins) < 0.1 else 1
        stem_leaf_plot(columndata, *tbins, digit=digit, title=title)

    if not opts.pdf:
        return

    components = opts.components
    data = [x.ng_ks for x in data]
    data = [x for x in data if ks_min <= x <= ks_max]

    fig = plt.figure(1, (iopts.w, iopts.h))
    ax = fig.add_axes([0.12, 0.1, 0.8, 0.8])
    kp = KsPlot(ax, ks_max, opts.bins, legendp=opts.legendp)
    kp.add_data(data, components, fill=opts.fill, fitted=opts.fit, kde=opts.kde)
    kp.draw(title=opts.title)
Example #40
def last(args, dbtype=None):
    """
    %prog database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    p = OptionParser(last.__doc__)
    p.add_option(
        "--dbtype",
        default="nucl",
        choices=("nucl", "prot"),
        help="Molecule type of subject database",
    )
    p.add_option("--path", help="Specify LAST path")
    p.add_option(
        "--mask", default=False, action="store_true", help="Invoke -c in lastdb"
    )
    p.add_option(
        "--format",
        default="BlastTab",
        choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
        help="Output format",
    )
    p.add_option(
        "--minlen",
        default=0,
        type="int",
        help="Filter alignments by how many bases match",
    )
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_outdir()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    if not dbtype:
        dbtype = opts.dbtype
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(
        infile=subject,
        outfile=subjectdb + ".prj",
        mask=opts.mask,
        lastdb_bin=lastdb_bin,
        dbtype=dbtype,
    )

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    assert minid != 100, "Perfect match not yet supported"
    mm = minid // (100 - minid)  # floor division keeps the -q/-a/-b costs integral

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last", outdir=opts.outdir)
    sh(cmd, outfile=lastfile)
    return lastfile
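The --minid handling above sets lastal's mismatch and gap costs to
minid // (100 - minid) against a match reward of 1 (-r1), so alignments
below the requested identity score at or below zero. Worked number:

minid = 98
mm = minid // (100 - minid)   # 49: one mismatch cancels 49 matches
# at exactly 98% identity, 98 * 1 - 2 * 49 = 0, the break-even point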
Example #41
def consolidate(args):
    """
    %prog consolidate gffile1 gffile2 ... > consolidated.out

    Given 2 or more gff files generated by pasa annotation comparison,
    iterate through every gene locus and identify all cases of same and
    different isoforms across the different input datasets.
    """
    from jcvi.formats.base import longest_unique_prefix
    from jcvi.formats.gff import make_index
    from jcvi.utils.cbook import AutoVivification
    from jcvi.utils.grouper import Grouper
    from itertools import combinations, product

    p = OptionParser(consolidate.__doc__)
    p.add_option("--slop", default=False, action="store_true",
            help="allow minor variation in terminal 5'/3' UTR" + \
                 " start/stop position [default: %default]")
    p.set_outfile()

    opts, args = p.parse_args(args)
    slop = opts.slop

    if len(args) < 2:
        sys.exit(not p.print_help())

    gffdbx = {}
    gene_coords = {}
    mrna = AutoVivification()
    for gffile in args:
        dbn = longest_unique_prefix(gffile, args)
        gffdbx[dbn] = make_index(gffile)
        for gene in gffdbx[dbn].features_of_type('gene',
                                                 order_by=('seqid', 'start')):
            if gene.id not in gene_coords:
                gene_coords[gene.id] = []
            gene_coords[gene.id].extend([gene.start, gene.stop])

            c = list(gffdbx[dbn].children(gene,
                                          featuretype='mRNA',
                                          order_by='start'))
            if len(c) > 0:
                mrna[gene.id][dbn] = c

    fw = must_open(opts.outfile, "w")
    print >> fw, "##gff-version	3"
    summary = ["id"]
    summary.extend(gffdbx.keys())
    print >> sys.stderr, "\t".join(str(x) for x in summary)
    for gene in mrna:
        g = Grouper()
        dbns = list(combinations(mrna[gene], 2))
        if len(dbns) > 0:
            for dbn1, dbn2 in dbns:
                for mrna1, mrna2 in product(mrna[gene][dbn1],
                                            mrna[gene][dbn2]):
                    g.join((dbn1, mrna1.id))
                    g.join((dbn2, mrna2.id))

                    fUTR, tUTR = None, None
                    if match_subfeats(mrna1, mrna2, gffdbx[dbn1],
                                      gffdbx[dbn2]):
                        fUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='five_prime_UTR', slop=slop)
                        tUTR = match_subfeats(mrna1, mrna2, gffdbx[dbn1], gffdbx[dbn2], \
                                featuretype='three_prime_UTR', slop=slop)

                    if fUTR and tUTR:
                        g.join((dbn1, mrna1.id), (dbn2, mrna2.id))
        else:
            for dbn1 in mrna[gene]:
                for mrna1 in mrna[gene][dbn1]:
                    g.join((dbn1, mrna1.id))

        dbn = next(iter(mrna[gene]))
        gene_coords[gene].sort()
        _gene = gffdbx[dbn][gene]
        _gene.start, _gene.stop = gene_coords[gene][0], gene_coords[gene][-1]
        print(_gene, file=fw)

        logging.debug(list(g))
        for group in g:
            dbs, mrnas = [el[0] for el in group], [el[1] for el in group]
            d, m = dbs[0], mrnas[0]
            if slop:
                mlen = 0
                for D, M in zip(dbs, mrnas):
                    _mrna = gffdbx[D][M]
                    _mlen = (_mrna.stop - _mrna.start) + 1
                    if _mlen > mlen:
                        d, m, mlen = D, M, _mlen

            dbid = "".join(str(x) for x in sorted(set(dbs)))
            _mrnaid = []
            for x in mrnas:  # deduplicate mRNA ids, preserving order
                if x not in _mrnaid:
                    _mrnaid.append(x)
            mrnaid = "{0}:{1}".format(dbid, "-".join(_mrnaid))

            _mrna = gffdbx[d][m]
            _mrna.attributes['ID'] = [mrnaid]
            children = gffdbx[d].children(m, order_by='start')
            print(_mrna, file=fw)
            for child in children:
                child.attributes['ID'] = ["{0}:{1}".format(dbid, child.id)]
                child.attributes['Parent'] = [mrnaid]
                print(child, file=fw)

            summary = [mrnaid]
            summary.extend(['Y' if db in set(dbs) else 'N' for db in gffdbx])
            print("\t".join(str(x) for x in summary), file=sys.stderr)

    fw.close()
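
The clustering above leans on jcvi's Grouper, a single-linkage grouping helper: joining (dataset, mRNA) pairs transitively merges isoforms that match across any two inputs. A minimal stand-in with the same join/iterate semantics (a sketch, not jcvi's actual implementation):

class MiniGrouper:
    """Single-linkage grouping: join(a, b) merges the groups of a and b."""
    def __init__(self):
        self.groups = {}  # item -> set shared by all members of its group
    def join(self, a, b=None):
        sa = self.groups.setdefault(a, {a})
        if b is None:     # single argument: just register a singleton
            return
        sb = self.groups.setdefault(b, {b})
        if sa is not sb:  # merge the two groups and repoint members
            sa |= sb
            for item in sb:
                self.groups[item] = sa
    def __iter__(self):
        emitted = []
        for s in self.groups.values():
            if s not in emitted:
                emitted.append(s)
                yield sorted(s)

g = MiniGrouper()
g.join(("dbA", "mrna1"), ("dbB", "mrna9"))  # same isoform in two datasets
g.join(("dbB", "mrna9"), ("dbC", "mrna3"))  # transitively merges all three
g.join(("dbA", "mrna2"))                    # unmatched isoform stays alone
print([group for group in g])               # one merged trio, one singleton
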
Example #42
0
def calc(args):
    """
    %prog calc [prot.fasta] cds.fasta > out.ks

    Protein file is optional. If only one file is given, it is assumed to
    be CDS sequences with correct frame (frame 0). Results will be written to
    stdout. Both protein file and nucleotide file are assumed to be Fasta format,
    with adjacent records as the pairs to compare.

    Author: Haibao Tang <*****@*****.**>, Brad Chapman, Jingping Li
    Calculate synonymous mutation rates for gene pairs

    This does the following:
        1. Fetches a protein pair.
        2. Aligns the protein pair with clustalw (default) or muscle.
        3. Converts the alignment output to Fasta format.
        4. Uses the alignment to align the gene sequences with PAL2NAL.
        5. Runs PAML yn00 to calculate synonymous mutation rates.
    """
    from jcvi.formats.fasta import translate

    p = OptionParser(calc.__doc__)
    p.add_option(
        "--longest",
        action="store_true",
        help="Get longest ORF, only works if no pep file, e.g. ESTs",
    )
    p.add_option(
        "--msa",
        default="clustalw",
        choices=("clustalw", "muscle"),
        help="software used to align the proteins",
    )
    p.add_option("--workdir", default=os.getcwd(), help="Work directory")
    p.set_outfile()

    opts, args = p.parse_args(args)

    if len(args) == 1:
        protein_file, dna_file = None, args[0]
    elif len(args) == 2:
        protein_file, dna_file = args
    else:
        print("Incorrect arguments", file=sys.stderr)
        sys.exit(not p.print_help())

    output_h = must_open(opts.outfile, "w")
    print(fields, file=output_h)
    work_dir = op.join(opts.workdir, "syn_analysis")
    mkdir(work_dir)

    if not protein_file:
        protein_file = dna_file + ".pep"
        translate_args = [dna_file, "--outfile=" + protein_file]
        if opts.longest:
            translate_args += ["--longest"]
        dna_file, protein_file = translate(translate_args)

    prot_iterator = SeqIO.parse(open(protein_file), "fasta")
    dna_iterator = SeqIO.parse(open(dna_file), "fasta")
    for p_rec_1, p_rec_2, n_rec_1, n_rec_2 in zip(
        prot_iterator, prot_iterator, dna_iterator, dna_iterator
    ):

        print("--------", p_rec_1.name, p_rec_2.name, file=sys.stderr)
        if opts.msa == "clustalw":
            align_fasta = clustal_align_protein((p_rec_1, p_rec_2), work_dir)
        elif opts.msa == "muscle":
            align_fasta = muscle_align_protein((p_rec_1, p_rec_2), work_dir)
        mrtrans_fasta = run_mrtrans(align_fasta, (n_rec_1, n_rec_2), work_dir)
        if mrtrans_fasta:
            ds_subs_yn, dn_subs_yn, ds_subs_ng, dn_subs_ng = find_synonymous(
                mrtrans_fasta, work_dir
            )
            if ds_subs_yn is not None:
                pair_name = "%s;%s" % (p_rec_1.name, p_rec_2.name)
                output_h.write(
                    "%s\n"
                    % (
                        ",".join(
                            str(x)
                            for x in (
                                pair_name,
                                ds_subs_yn,
                                dn_subs_yn,
                                ds_subs_ng,
                                dn_subs_ng,
                            )
                        )
                    )
                )
                output_h.flush()

    # Clean-up
    sh("rm -rf 2YN.t 2YN.dN 2YN.dS rst rub rst1 syn_analysis")
Example #43
0
def compare(args):
    """
    %prog compare pasa_db_name genome.fasta transcripts.fasta [annotation.gff]

    Run the PASA annotation comparison pipeline

    If annotation.gff file is provided, the PASA database is loaded with the annotations
    first before starting annotation comparison. Otherwise, it uses previously
    loaded annotation data.

    Using the `--prepare` option creates a shell script with the run commands,
    without executing the pipeline.
    """
    p = OptionParser(compare.__doc__)
    p.set_pasa_opts(action="compare")
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, transcripts = args[:3]
    annotation = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))

    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"

    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    acfw = must_open(acconf, "w")
    print(annotCompare_conf.format("{0}_pasa".format(pasa_db),
            opts.pctovl, opts.pct_coding, opts.pctid_prot, opts.pctlen_FL,
            opts.pctlen_nonFL, opts.orf_size, opts.pct_aln, opts.pctovl_gene,
            opts.stompovl, opts.trust_FL, opts.utr_exons), file=acfw)
    acfw.close()

    if op.exists("{0}.clean".format(transcripts)):
        transcripts = "{0}.clean".format(transcripts)

    accmd = "{0} -c {1} -A -g {2} -t {3} --GENETIC_CODE {4}".format(launch_pasa, \
            acconf, genome, transcripts, opts.genetic_code)
    if annotation:
        accmd += " -L --annots_gff3 {0}".format(annotation)
    if prepare:
        write_file(runfile, accmd, append=True)
    else:
        sh(accmd, grid=grid, grid_opts=opts)
Example #44
0
def assemble(args):
    """
    %prog assemble pasa_db_name genome.fasta transcripts-dn.fasta [transcript-gg.fasta]

    Run the PASA alignment assembly pipeline

    If two transcript fasta files (Trinity denovo and genome guided) are provided
    and the `--compreh` param is enabled, the PASA Comprehensive Transcriptome DB
    protocol is followed <http://pasa.sourceforge.net/#A_ComprehensiveTranscriptome>

    Using the `--prepare` option creates a shell script with the run commands,
    without executing the pipeline.
    """
    p = OptionParser(assemble.__doc__)
    p.set_pasa_opts()
    p.add_option(
        "--prepare",
        default=False,
        action="store_true",
        help="Prepare PASA run script with commands [default: %default]")
    p.set_grid()
    p.set_grid_opts()
    opts, args = p.parse_args(args)

    if len(args) not in (3, 4):
        sys.exit(not p.print_help())

    pasa_db, genome, dnfasta = args[:3]
    ggfasta = args[3] if len(args) == 4 else None

    PASA_HOME = opts.pasa_home
    if not op.isdir(PASA_HOME):
        logging.error(
            "PASA_HOME={0} directory does not exist".format(PASA_HOME))
        sys.exit()

    aligners = opts.aligners.split(",")
    for aligner in aligners:
        if aligner not in ALLOWED_ALIGNERS:
            logging.error("Error: Unknown aligner `{0}`".format(aligner))
            logging.error("Can be any of {0}, ".format("|".join(ALLOWED_ALIGNERS)) + \
                    "combine multiple aligners in list separated by comma")
            sys.exit()

    clean = opts.clean
    seqclean = op.join(opts.tgi_home, "seqclean")

    accn_extract = which(op.join(PASA_HOME, "misc_utilities", \
            "accession_extractor.pl"))
    launch_pasa = which(op.join(PASA_HOME, "scripts", \
            "Launch_PASA_pipeline.pl"))
    build_compreh_trans = which(op.join(PASA_HOME, "scripts", \
            "build_comprehensive_transcriptome.dbi"))

    cpus = opts.cpus
    grid = opts.grid
    prepare, runfile = opts.prepare, "run.sh"
    pctcov, pctid = opts.pctcov, opts.pctid
    compreh_pctid, compreh_pctcov = opts.compreh_pctid, opts.compreh_pctcov
    bpsplice = opts.bpsplice

    mkdir(pasa_db)
    os.chdir(pasa_db)

    if prepare:
        write_file(runfile, "")  # initialize run script

    if ggfasta:
        transcripts = FileMerger([dnfasta, ggfasta], tfasta).merge()
        accn_extract_cmd = "cat {0} | {1} > {2}".format(
            dnfasta, accn_extract, tdn)
        if prepare:
            write_file(runfile, accn_extract_cmd, append=True)
        else:
            sh(accn_extract_cmd)
    else:
        transcripts = dnfasta

    if opts.grid and not opts.threaded:
        opts.threaded = opts.cpus

    prjobid = None
    if clean:
        cleancmd = "{0} {1} -c {2} -l 60".format(seqclean, transcripts, cpus)
        if prepare:
            write_file(runfile, cleancmd, append=True)
        else:
            prjobid = sh(cleancmd, grid=grid, grid_opts=opts)

    aafw = must_open(aaconf, "w")
    print(alignAssembly_conf.format("{0}_pasa".format(pasa_db),
            pctcov, pctid, bpsplice), file=aafw)
    aafw.close()

    aacmd = "{0} -c {1} -C -R -g {2}".format(launch_pasa, aaconf, genome)
    aacmd += " -t {0}.clean -T -u {0} ".format(transcripts) if clean else \
             " -t {0} ".format(transcripts)
    if ggfasta:
        aacmd += " --TDN {0} ".format(tdn)
    aacmd += " --ALIGNERS {0} -I {1} --CPU {2}".format(",".join(aligners), \
            opts.intron, cpus)

    if prepare:
        write_file(runfile, aacmd, append=True)
    else:
        opts.hold_jid = prjobid
        prjobid = sh(aacmd, grid=grid, grid_opts=opts)

    if opts.compreh and ggfasta:
        comprehcmd = "{0} -c {1} -t {2}".format(build_compreh_trans, aaconf,
                                                transcripts)
        comprehcmd += " --min_per_ID {0} --min_per_aligned {1}".format(
            compreh_pctid, compreh_pctcov)

        if prepare:
            write_file(runfile, comprehcmd, append=True)
        else:
            opts.hold_jid = prjobid
            prjobid = sh(comprehcmd, grid=grid, grid_opts=opts)
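
Both PASA wrappers share the same `--prepare` branching: every pipeline command is either appended to run.sh or executed immediately. Condensed into one hypothetical helper (the jcvi code uses write_file/sh instead):

import subprocess

def run_or_record(cmd, prepare, runfile="run.sh"):
    """Append `cmd` to the run script when preparing, else execute it now."""
    if prepare:
        with open(runfile, "a") as fw:
            print(cmd, file=fw)
    else:
        subprocess.run(cmd, shell=True, check=True)

# run_or_record("seqclean transcripts.fasta -c 8 -l 60", prepare=True)
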
Example #45
0
def htg(args):
    """
    %prog htg fastafile template.sbt

    Prepare sqnfiles for Genbank HTG submission to update existing records.

    `fastafile` contains the records to update, multiple records are allowed
    (with each one generating separate sqn file in the sqn/ folder). The record
    defline has the accession ID. For example,
    >AC148290.3

    Internally, this generates two additional files (phasefile and namesfile)
    and downloads records from Genbank. Below are the implementation details:

    `phasefile` contains, for each accession, phase information. For example:
    AC148290.3      3       HTG     2       mth2-45h12

    which means this is a Phase-3 BAC. Records with only a single contig will
    be labeled as Phase-3 regardless of the info in the `phasefile`. The
    template file is the Genbank sbt template. See jcvi.formats.sbt for
    generation of such files.

    Another problem is that Genbank requires the name of the sequence to stay
    the same when updating and will kick back with a table of name conflicts.
    For example:

    We are unable to process the updates for these entries
    for the following reason:

    Seqname has changed

    Accession Old seq_name New seq_name
    --------- ------------ ------------
    AC239792 mtg2_29457 AC239792.1

    To prepare a submission, this script downloads both the Genbank and ASN.1
    formats, and generates the phase file and the names file (using
    formats.agp.phase() and apps.gbsubmit.asn(), respectively). These get run
    automatically.

    However, use --phases if the genbank files contain outdated information.
    For example, the clone name changes or phase upgrades. In this case, run
    formats.agp.phase() manually, modify the phasefile and use --phases to override.
    """
    from jcvi.formats.fasta import sequin, ids
    from jcvi.formats.agp import phase
    from jcvi.apps.entrez import fetch

    p = OptionParser(htg.__doc__)
    p.add_option("--phases",
                 default=None,
                 help="Use another phasefile to override [default: %default]")
    p.add_option("--comment",
                 default="",
                 help="Comments for this update [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, sbtfile = args
    pf = fastafile.rsplit(".", 1)[0]

    idsfile = pf + ".ids"
    phasefile = pf + ".phases"
    namesfile = pf + ".names"

    ids([fastafile, "--outfile={0}".format(idsfile)])

    asndir = "asn.1"
    mkdir(asndir)
    fetch([idsfile, "--format=asn.1", "--outdir={0}".format(asndir)])
    asn(glob("{0}/*".format(asndir)) + \
            ["--outfile={0}".format(namesfile)])

    if opts.phases is None:
        gbdir = "gb"
        mkdir(gbdir)
        fetch([idsfile, "--format=gb", "--outdir={0}".format(gbdir)])
        phase(glob("{0}/*".format(gbdir)) + \
                ["--outfile={0}".format(phasefile)])
    else:
        phasefile = opts.phases

    assert op.exists(namesfile) and op.exists(phasefile)

    newphasefile = phasefile + ".new"
    newphasefw = open(newphasefile, "w")
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    from jcvi.graphics.histogram import stem_leaf_plot

    names = DictFile(namesfile)
    assert len(set(names.keys())) == len(set(names.values()))

    phases = DictFile(phasefile)
    ph = [int(x) for x in phases.values()]
    # vmin 1, vmax 4, bins 3
    stem_leaf_plot(ph, 1, 4, 3, title="Counts of phases before updates")
    logging.debug("Information loaded for {0} records.".format(len(phases)))
    assert len(names) == len(phases)

    newph = []

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "{qualifiers}"'
    acmd += ' -A {accession_nv} -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    qq = "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"

    nupdated = 0
    for row in open(phasefile):
        atoms = row.rstrip().split("\t")
        # see formats.agp.phase() for column contents
        accession, phase, clone = atoms[0], atoms[1], atoms[-1]
        fafile = op.join(fastadir, accession + ".fa")
        accession_nv = accession.split(".", 1)[0]

        newid = names[accession_nv]
        newidopt = "--newid={0}".format(newid)
        cloneopt = "--clone={0}".format(clone)
        splitfile, gaps = sequin([fafile, newidopt, cloneopt])
        splitfile = op.basename(splitfile)
        phase = int(phase)
        assert phase in (1, 2, 3)

        oldphase = phase
        if gaps == 0 and phase != 3:
            phase = 3

        if gaps != 0 and phase == 3:
            phase = 2

        print("{0}\t{1}\t{2}".format(accession_nv, oldphase, phase),
              file=newphasefw)
        newph.append(phase)

        qualifiers = qq.format(phase=phase)
        if ";" in clone:
            qualifiers += " [keyword=HTGS_POOLED_MULTICLONE]"

        cmd = acmd.format(accession=accession,
                          accession_nv=accession_nv,
                          sqndir=sqndir,
                          sbtfile=sbtfile,
                          splitfile=splitfile,
                          qualifiers=qualifiers,
                          comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    stem_leaf_plot(newph, 1, 4, 3, title="Counts of phases after updates")
    print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
Example #46
0
def longest(args):
    """
    %prog longest pasa.fasta output.subclusters.out

    Find the longest PASA assembly and label it as full-length. Also remove
    transcripts shorter than half the length of the longest, or shorter than
    200bp. The assemblies for the same locus are found in
    `output.subclusters.out`. In particular the lines that look like:

    sub-cluster: asmbl_25 asmbl_26 asmbl_27
    """
    from jcvi.formats.fasta import Fasta, SeqIO
    from jcvi.formats.sizes import Sizes

    p = OptionParser(longest.__doc__)
    p.add_option("--prefix",
                 default="pasa",
                 help="Replace asmbl_ with prefix [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, subclusters = args
    prefix = fastafile.rsplit(".", 1)[0]

    idsfile = prefix + ".fl.ids"
    fw = open(idsfile, "w")
    sizes = Sizes(fastafile).mapping

    name_convert = lambda x: x.replace("asmbl", opts.prefix)

    keep = set()  # List of IDs to write
    fp = open(subclusters)
    nrecs = 0
    for row in fp:
        if not row.startswith("sub-cluster:"):
            continue
        asmbls = row.split()[1:]
        longest_asmbl = max(asmbls, key=lambda x: sizes[x])
        longest_size = sizes[longest_asmbl]
        print(name_convert(longest_asmbl), file=fw)
        nrecs += 1
        cutoff = max(longest_size // 2, 200)
        keep.update(set(x for x in asmbls if sizes[x] >= cutoff))

    fw.close()
    logging.debug("{0} fl-cDNA records written to `{1}`.".format(
        nrecs, idsfile))

    f = Fasta(fastafile, lazy=True)
    newfastafile = prefix + ".clean.fasta"
    fw = open(newfastafile, "w")
    nrecs = 0
    for name, rec in f.iteritems_ordered():
        if name not in keep:
            continue

        rec.id = name_convert(name)
        rec.description = ""
        SeqIO.write([rec], fw, "fasta")
        nrecs += 1

    fw.close()
    logging.debug("{0} valid records written to `{1}`.".format(
        nrecs, newfastafile))
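
The filter above keeps, within each sub-cluster, every assembly at least half as long as the longest member and no shorter than 200 bp. Isolated into a hypothetical helper:

def keep_assemblies(sizes, asmbls):
    """Return the sub-cluster members that pass the length filter."""
    longest = max(sizes[x] for x in asmbls)
    cutoff = max(longest // 2, 200)
    return [x for x in asmbls if sizes[x] >= cutoff]

sizes = {"asmbl_25": 1200, "asmbl_26": 550, "asmbl_27": 700}
print(keep_assemblies(sizes, ["asmbl_25", "asmbl_26", "asmbl_27"]))
# ['asmbl_25', 'asmbl_27']: asmbl_26 misses the 600 bp cutoff
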
Example #47
0
def names(args):
    """
    %prog names namelist templatefile

    Generate name blocks from the `namelist` file. The `namelist` file is a
    tab-delimited file that contains >=4 columns of data. Three columns are
    mandatory: first name, middle initial and last name. The first row is the
    table header. Of the extra columns, the first goes into the `$N0` field in
    the template file, the second into the `$N1` field, etc.

    In the alternative mode, the namelist just contains several sections. The
    first section goes into the `$N0` field in the template file, the second
    into the `$N1` field.

    The namelist may look like:
    [Sequence]
    Bruce A. Roe,  Frederic Debelle, Giles Oldroyd, Rene Geurts
    [Manuscript]
    Haibao Tang1, Vivek Krishnakumar1, Shelby Bidwell1, Benjamin Rosen1

    In this example, the Sequence section goes into N0 and the Manuscript
    section into N1.

    Useful hints for constructing the template file can be found in:
    <http://www.ncbi.nlm.nih.gov/IEB/ToolBox/CPP_DOC/asn_spec/seq.asn.html>

    Often the template file can be retrieved from web form:
    <http://www.ncbi.nlm.nih.gov/WebSub/template.cgi>
    """
    p = OptionParser(names.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    namelist, templatefile = args

    # First check the alternative format
    if open(namelist).read()[0] == '[':
        out = parse_names(namelist)
        make_template(templatefile, out)
        return

    reader = csv.reader(open(namelist), delimiter="\t")
    header = next(reader)
    ncols = len(header)
    assert ncols > 3
    nextras = ncols - 3

    blocks = []
    bools = []
    for row in reader:
        first, middle, last = row[:3]
        extras = row[3:]
        bools.append([(x.upper() == 'Y') for x in extras])
        middle = middle.strip()
        if middle != "":
            middle = middle.rstrip('.') + '.'
        initials = "{0}.{1}".format(first[0], middle)
        suffix = ""
        nameblock = NameTemplate.format(last=last,
                                        first=first,
                                        initials=initials,
                                        suffix=suffix)
        blocks.append(nameblock)

    selected_idx = zip(*bools)
    out = []  # one joined name block per extra column
    for i, sbools in enumerate(selected_idx):
        selected = []
        for b, ss in zip(blocks, sbools):
            if ss:
                selected.append(b)
        bigblock = ",\n".join(selected)
        out.append(bigblock)
        logging.debug("List N{0} contains a total of {1} names.".format(
            i, len(selected)))

    make_template(templatefile, out)
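
The zip(*bools) transpose is what lets each extra column pick its own subset of name blocks: per-person rows of Y/N flags become per-column tuples. A toy run:

blocks = ["Roe B.A.", "Debelle F.", "Oldroyd G."]
bools = [           # one row per person: (in $N0?, in $N1?)
    [True, False],
    [True, True],
    [False, True],
]
for i, column in enumerate(zip(*bools)):  # transpose rows into columns
    selected = [b for b, flag in zip(blocks, column) if flag]
    print("N{0}: {1}".format(i, ", ".join(selected)))
# N0: Roe B.A., Debelle F.
# N1: Debelle F., Oldroyd G.
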
Example #48
0
def gss(args):
    """
    %prog gss fastafile plateMapping

    Generate sequence files and metadata templates suited for gss submission.
    The FASTA file is assumed to be exported from the JCVI data delivery folder
    which looks like:

    >1127963806024 /library_name=SIL1T054-B-01-120KB /clear_start=0
    /clear_end=839 /primer_id=1049000104196 /trace_id=1064147620169
    /trace_file_id=1127963805941 /clone_insert_id=1061064364776
    /direction=reverse /sequencer_run_id=1064147620155
    /sequencer_plate_barcode=B906423 /sequencer_plate_well_coordinates=C3
    /sequencer_plate_96well_quadrant=1 /sequencer_plate_96well_coordinates=B02
    /template_plate_barcode=CC0251602AB /growth_plate_barcode=BB0273005AB
    AGCTTTAGTTTCAAGGATACCTTCATTGTCATTCCCGGTTATGATGATATCATCAAGATAAACAAGAATG
    ACAATGATACCTGTTTGGTTCTGAAGTGTAAAGAGGGTATGTTCAGCTTCAGATCTTCTAAACCCTTTGT
    CTAGTAAGCTGGCACTTAGCTTCCTATACCAAACCCTTTGTGATTGCTTCAGTCCATAAATTGCCTTTTT

    Plate mapping file maps the JTC `sequencer_plate_barcode` to external IDs.
    For example:
    B906423 SIL-001
    """
    p = OptionParser(gss.__doc__)

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(p.print_help())

    fastafile, mappingfile = args
    seen = defaultdict(int)
    clone = defaultdict(set)

    plateMapping = DictFile(mappingfile)

    fw = open("MetaData.txt", "w")
    print(PublicationTemplate.format(**vars), file=fw)
    print(LibraryTemplate.format(**vars), file=fw)
    print(ContactTemplate.format(**vars), file=fw)
    logging.debug("Meta data written to `{0}`".format(fw.name))

    fw = open("GSS.txt", "w")
    fw_log = open("GSS.log", "w")
    for rec in SeqIO.parse(fastafile, "fasta"):
        # First pass just check well number matchings and populate sequences in
        # the same clone
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        sequencer_plate_96well_quadrant = \
            a["sequencer_plate_96well_quadrant"][0]
        sequencer_plate_96well_coordinates = \
            a["sequencer_plate_96well_coordinates"][0]

        # Check the 96-well ID is correctly converted to 384-well ID
        w96 = sequencer_plate_96well_coordinates
        w96quad = int(sequencer_plate_96well_quadrant)
        w384 = sequencer_plate_well_coordinates
        assert convert_96_to_384(w96, w96quad) == w384

        plate = sequencer_plate_barcode
        assert plate in plateMapping, \
            "{0} not found in `{1}` !".format(plate, mappingfile)

        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            gssID = "{0}{1}".format(gssID, seen[gssID])

        seen[gssID] += 1
        clone[cloneID].add(gssID)

    seen = defaultdict(int)
    for rec in SeqIO.parse(fastafile, "fasta"):
        # need to populate gssID, mateID, cloneID, seq, plate, row, column
        description = rec.description
        a = parse_description(description)
        direction = a["direction"][0]
        sequencer_plate_barcode = a["sequencer_plate_barcode"][0]
        sequencer_plate_well_coordinates = \
            a["sequencer_plate_well_coordinates"][0]
        w384 = sequencer_plate_well_coordinates

        plate = sequencer_plate_barcode
        plate = plateMapping[plate]
        d = Directions[direction]

        cloneID = "{0}{1}".format(plate, w384)
        gssID = "{0}{1}".format(cloneID, d)
        seen[gssID] += 1

        if seen[gssID] > 1:
            logging.error("duplicate key {0} found".format(gssID))
            gssID = "{0}{1}".format(gssID, seen[gssID])

        othergss = clone[cloneID] - set([gssID])
        othergss = ", ".join(sorted(othergss))
        vars.update(locals())

        print(GSSTemplate.format(**vars), file=fw)

        # Write conversion logs to log file
        print("{0}\t{1}".format(gssID, description), file=fw_log)
        print("=" * 60, file=fw_log)

    logging.debug("A total of {0} seqs written to `{1}`".\
            format(len(seen), fw.name))
    fw.close()
    fw_log.close()
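
The convert_96_to_384 assertion above guards the plate bookkeeping. Assuming the common interleaved layout, in which the four 96-well quadrants offset each other by one row/column inside every 2x2 block of the 384-well plate, the mapping looks like this (a sketch, not jcvi's implementation, though it does reproduce the B02 + quadrant 1 -> C3 example in the docstring):

def convert_96_to_384_sketch(w96, quad):
    """Map a 96-well coordinate plus quadrant (1-4) to a 384-well one."""
    row96 = ord(w96[0].upper()) - ord("A")  # rows A-H -> 0-7
    col96 = int(w96[1:]) - 1                # columns 1-12 -> 0-11
    row384 = 2 * row96 + (quad - 1) // 2    # quadrants 3, 4 shift down
    col384 = 2 * col96 + (quad - 1) % 2     # quadrants 2, 4 shift right
    return "{0}{1}".format(chr(ord("A") + row384), col384 + 1)

print(convert_96_to_384_sketch("B02", 1))  # C3 under this convention
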
Example #49
0
def pairinplace(args):
    """
    %prog pairinplace bulk.fastq

    Pair up the records in bulk.fastq by comparing the names of adjacent
    records. If they match, print to bulk.pairs.fastq, else print to
    bulk.frags.fastq.
    """
    from jcvi.utils.iter import pairwise

    p = OptionParser(pairinplace.__doc__)
    p.set_rclip()
    p.set_tag()
    p.add_option("--base",
                 help="Base name for the output files [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastqfile, = args
    base = opts.base or op.basename(fastqfile).split(".")[0]

    frags = base + ".frags.fastq"
    pairs = base + ".pairs.fastq"
    if fastqfile.endswith(".gz"):
        frags += ".gz"
        pairs += ".gz"

    fragsfw = must_open(frags, "w")
    pairsfw = must_open(pairs, "w")

    N = opts.rclip
    tag = opts.tag
    strip_name = (lambda x: x[:-N]) if N else None

    fh_iter = iter_fastq(fastqfile, key=strip_name)
    skipflag = False  # controls the iterator skip
    for a, b in pairwise(fh_iter):
        if b is None:  # hit the eof
            break

        if skipflag:
            skipflag = False
            continue

        if a.name == b.name:
            if tag:
                a.name += "/1"
                b.name += "/2"
            print(a, file=pairsfw)
            print(b, file=pairsfw)
            skipflag = True
        else:
            print(a, file=fragsfw)

    # don't forget the last one, when b is None
    if not skipflag:
        print(a, file=fragsfw)

    logging.debug("Reads paired into `%s` and `%s`" % (pairs, frags))
    return pairs
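
The pairing control flow is easiest to follow on plain strings. Assuming a pairwise() that pads the final element with None (as the eof check above implies), a matched pair consumes two records and raises skipflag, while a mismatch advances one record into the frags output:

from itertools import tee, zip_longest

def pairwise(iterable):
    """s -> (s0, s1), (s1, s2), ..., (sN, None): overlapping pairs, padded."""
    a, b = tee(iterable)
    next(b, None)
    return zip_longest(a, b)

names = ["r1", "r1", "r2", "r3", "r3"]
pairs, frags = [], []
skipflag = False
for a, b in pairwise(names):
    if b is None:      # hit the eof; `a` is the final record
        break
    if skipflag:       # second member of a matched pair, already written
        skipflag = False
        continue
    if a == b:
        pairs.append((a, b))
        skipflag = True
    else:
        frags.append(a)
if not skipflag:       # the final record was never consumed as a mate
    frags.append(a)
print(pairs, frags)    # [('r1', 'r1'), ('r3', 'r3')] ['r2']
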
Example #50
0
def htgnew(args):
    """
    %prog htgnew fastafile phasefile template.sbt

    Prepare sqnfiles for submitting new Genbank HTG records.

    `fastafile` contains the sequences.
    `phasefile` contains the phase information, it is a two column file:

    mth2-45h12    3

    `template.sbt` is the Genbank submission template.

    This function is simpler than htg, since the record names have not been
    assigned yet (so less bookkeeping).
    """
    from jcvi.formats.fasta import sequin

    p = OptionParser(htgnew.__doc__)
    p.add_option("--comment",
                 default="",
                 help="Comments for this submission [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    fastafile, phasefile, sbtfile = args
    comment = opts.comment

    fastadir = "fasta"
    sqndir = "sqn"
    mkdir(fastadir)
    mkdir(sqndir)

    cmd = "faSplit byname {0} {1}/".format(fastafile, fastadir)
    sh(cmd, outfile="/dev/null", errfile="/dev/null")

    acmd = 'tbl2asn -a z -p fasta -r {sqndir}'
    acmd += ' -i {splitfile} -t {sbtfile} -C tigr'
    acmd += ' -j "[tech=htgs {phase}] [organism=Medicago truncatula] [strain=A17]"'
    acmd += ' -o {sqndir}/{accession_nv}.sqn -V Vbr'
    acmd += ' -y "{comment}" -W T -T T'

    nupdated = 0
    for row in open(phasefile):
        name, phase = row.split()[:2]
        fafile = op.join(fastadir, name + ".fa")
        cloneopt = "--clone={0}".format(name)
        splitfile, gaps = sequin([fafile, cloneopt])
        splitfile = op.basename(splitfile)
        accession = accession_nv = name

        phase = int(phase)
        assert phase in (1, 2, 3)

        cmd = acmd.format(accession_nv=accession_nv,
                          sqndir=sqndir,
                          sbtfile=sbtfile,
                          splitfile=splitfile,
                          phase=phase,
                          comment=comment)
        sh(cmd)

        verify_sqn(sqndir, accession)
        nupdated += 1

    print >> sys.stderr, "A total of {0} records updated.".format(nupdated)
Example #51
0
def phytozome(args):
    """
    %prog phytozome species

    Retrieve genomes and annotations from phytozome using Globus API. Available
    species listed below. Use comma to give a list of species to download. For
    example:

    $ %prog phytozome Athaliana,Vvinifera,Osativa,Sbicolor,Slycopersicum

    The downloader will prompt you to enter Phytozome user name and password
    during downloading. Please register for a login at:
    https://phytozome.jgi.doe.gov/pz/portal.html.
    """
    from jcvi.apps.biomart import GlobusXMLParser

    p = OptionParser(phytozome.__doc__)
    p.add_option(
        "--version",
        default="12",
        choices=("9", "10", "11", "12", "12_unrestricted"),
        help="Phytozome version",
    )
    p.add_option(
        "--assembly",
        default=False,
        action="store_true",
        help="Download assembly [default: %default]",
    )
    p.add_option(
        "--format",
        default=False,
        action="store_true",
        help="Format to CDS and BED for synteny inference",
    )
    opts, args = p.parse_args(args)

    cookies = get_cookies()
    directory_listing = ".phytozome_directory_V{}.xml".format(opts.version)
    # Get directory listing
    base_url = "http://genome.jgi.doe.gov"
    dlist = "{}/ext-api/downloads/get-directory?organism=PhytozomeV{}".format(
        base_url, opts.version)
    d = download(dlist, filename=directory_listing, cookies=cookies)
    g = GlobusXMLParser(directory_listing)
    genomes = g.get_genomes()
    valid_species = genomes.keys()
    species_tile = tile(valid_species)
    p.set_usage("\n".join((phytozome.__doc__, species_tile)))

    if len(args) != 1:
        sys.exit(not p.print_help())

    species, = args
    if species == "all":
        species = ",".join(valid_species)

    species = species.split(",")
    for s in species:
        res = download_species_phytozome(genomes,
                                         s,
                                         valid_species,
                                         base_url,
                                         cookies,
                                         assembly=opts.assembly)
        if not res:
            logging.error("No files downloaded")
            continue
        gff, fa = res.get("gff"), res.get("cds")
        if opts.format:
            format_bed_and_cds(s, gff, fa)
Example #52
0
def main():
    p = OptionParser(__doc__)
    p.add_option("--switch", help="Rename the seqid with two-column file")
    p.add_option("--tree", help="Display trees on the bottom of the figure")
    p.add_option("--extra", help="Extra features in BED format")
    p.add_option(
        "--genelabelsize",
        default=0,
        type="int",
        help="Show gene labels at this font size, useful for debugging. " +
        "However, plot may appear visually crowded. " +
        "Reasonably good values are 2 to 6 [Default: disabled]",
    )
    p.add_option(
        "--scalebar",
        default=False,
        action="store_true",
        help="Add scale bar to the plot",
    )
    p.add_option(
        "--glyphstyle",
        default="box",
        choices=Glyph.Styles,
        help="Style of feature glyphs",
    )
    p.add_option(
        "--shadestyle",
        default="curve",
        choices=Shade.Styles,
        help="Style of syntenic wedges",
    )
    opts, args, iopts = p.set_image_options(figsize="8x7")

    if len(args) != 3:
        sys.exit(not p.print_help())

    datafile, bedfile, layoutfile = args
    switch = opts.switch
    tree = opts.tree

    pf = datafile.rsplit(".", 1)[0]
    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    Synteny(
        fig,
        root,
        datafile,
        bedfile,
        layoutfile,
        switch=switch,
        tree=tree,
        extra_features=opts.extra,
        genelabelsize=opts.genelabelsize,
        scalebar=opts.scalebar,
        shadestyle=opts.shadestyle,
        glyphstyle=opts.glyphstyle,
    )

    root.set_xlim(0, 1)
    root.set_ylim(0, 1)
    root.set_axis_off()

    image_name = pf + "." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #53
0
def astat(args):
    """
    %prog astat coverage.log

    Create coverage-rho scatter plot.
    """
    p = OptionParser(astat.__doc__)
    p.add_option("--cutoff", default=1000, type="int",
                 help="Length cutoff [default: %default]")
    p.add_option("--genome", default="",
                 help="Genome name [default: %default]")
    p.add_option("--arrDist", default=False, action="store_true",
                 help="Use arrDist instead [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    covfile, = args
    cutoff = opts.cutoff
    genome = opts.genome
    plot_arrDist = opts.arrDist

    suffix = ".{0}".format(cutoff)
    small_covfile = covfile + suffix
    update_covfile = need_update(covfile, small_covfile)
    if update_covfile:
        fw = open(small_covfile, "w")
    else:
        logging.debug("Found `{0}`, will use this one".format(small_covfile))
        covfile = small_covfile

    fp = open(covfile)
    header = next(fp)
    if update_covfile:
        fw.write(header)

    data = []
    msg = "{0} tigs scanned ..."
    for row in fp:
        tigID, rho, covStat, arrDist = row.split()
        tigID = int(tigID)
        if tigID % 1000000 == 0:
            sys.stderr.write(msg.format(tigID) + "\r")

        rho, covStat, arrDist = [float(x) for x in (rho, covStat, arrDist)]
        if rho < cutoff:
            continue

        if update_covfile:
            fw.write(row)
        data.append((tigID, rho, covStat, arrDist))

    print(msg.format(tigID), file=sys.stderr)

    from jcvi.graphics.base import plt, savefig

    logging.debug("Plotting {0} data points.".format(len(data)))
    tigID, rho, covStat, arrDist = zip(*data)

    y = arrDist if plot_arrDist else covStat
    ytag = "arrDist" if plot_arrDist else "covStat"

    fig = plt.figure(1, (7, 7))
    ax = fig.add_axes([.12, .1, .8, .8])
    ax.plot(rho, y, ".", color="lightslategrey")

    xtag = "rho"
    info = (genome, xtag, ytag)
    title = "{0} {1} vs. {2}".format(*info)
    ax.set_title(title)
    ax.set_xlabel(xtag)
    ax.set_ylabel(ytag)

    if plot_arrDist:
        ax.set_yscale('log')

    imagename = "{0}.png".format(".".join(info))
    savefig(imagename, dpi=150)
Example #54
0
def entrez(args):
    """
    %prog entrez <filename|term>

    `filename` contains a list of terms to search, or just one term. If the
    results are small in size, e.g. "--format=acc", use "--batchsize=100" to
    speed up the download.
    """
    p = OptionParser(entrez.__doc__)

    allowed_databases = {
        "fasta": ["genome", "nuccore", "nucgss", "protein", "nucest"],
        "asn.1": ["genome", "nuccore", "nucgss", "protein", "gene"],
        "xml": ["genome", "nuccore", "nucgss", "nucest", "gene"],
        "gb": ["genome", "nuccore", "nucgss"],
        "est": ["nucest"],
        "gss": ["nucgss"],
        "acc": ["nuccore"],
    }

    valid_formats = tuple(allowed_databases.keys())
    valid_databases = ("genome", "nuccore", "nucest", "nucgss", "protein",
                       "gene")

    p.add_option(
        "--noversion",
        dest="noversion",
        default=False,
        action="store_true",
        help="Remove trailing accession versions",
    )
    p.add_option(
        "--format",
        default="fasta",
        choices=valid_formats,
        help="download format [default: %default]",
    )
    p.add_option(
        "--database",
        default="nuccore",
        choices=valid_databases,
        help="search database [default: %default]",
    )
    p.add_option(
        "--retmax",
        default=1000000,
        type="int",
        help="how many results to return [default: %default]",
    )
    p.add_option(
        "--skipcheck",
        default=False,
        action="store_true",
        help="turn off prompt to check file existence [default: %default]",
    )
    p.add_option(
        "--batchsize",
        default=500,
        type="int",
        help="download the results in batch for speed-up [default: %default]",
    )
    p.set_outdir(outdir=None)
    p.add_option("--outprefix",
                 default="out",
                 help="output file name prefix [default: %default]")
    p.set_email()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(p.print_help())

    filename, = args
    if op.exists(filename):
        pf = filename.rsplit(".", 1)[0]
        list_of_terms = [row.strip() for row in open(filename)]
        if opts.noversion:
            list_of_terms = [x.rsplit(".", 1)[0] for x in list_of_terms]
    else:
        pf = filename
        # the filename is the search term
        list_of_terms = [filename.strip()]

    fmt = opts.format
    database = opts.database
    batchsize = opts.batchsize

    assert (database in allowed_databases[fmt]
            ), "For output format '{0}', allowed databases are: {1}".format(
                fmt, allowed_databases[fmt])
    assert batchsize >= 1, "batchsize must be >= 1"

    if " " in pf:
        pf = opts.outprefix

    outfile = "{0}.{1}".format(pf, fmt)

    outdir = opts.outdir
    if outdir:
        mkdir(outdir)

    # must_open prompts before clobbering an existing file, unless --skipcheck
    if not outdir:
        fw = must_open(outfile,
                       "w",
                       checkexists=True,
                       skipcheck=opts.skipcheck)
        if fw is None:
            return

    seen = set()
    totalsize = 0
    for id, size, term, handle in batch_entrez(
            list_of_terms,
            retmax=opts.retmax,
            rettype=fmt,
            db=database,
            batchsize=batchsize,
            email=opts.email,
    ):
        if outdir:
            outfile = op.join(outdir, "{0}.{1}".format(term, fmt))
            fw = must_open(outfile,
                           "w",
                           checkexists=True,
                           skipcheck=opts.skipcheck)
            if fw is None:
                continue

        rec = handle.read()
        if id in seen:
            logging.error("Duplicate key ({0}) found".format(rec))
            continue

        totalsize += size
        print(rec, file=fw)
        print(file=fw)

        seen.add(id)

    if seen:
        print(
            "A total of {0} {1} records downloaded.".format(
                totalsize, fmt.upper()),
            file=sys.stderr,
        )

    return outfile
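
batch_entrez owes its speed-up to fetching the terms in slices of --batchsize, one Entrez request per slice. The slicing itself is simple (a hypothetical helper, not jcvi's):

def chunks(terms, batchsize):
    """Yield successive batchsize-sized slices of `terms`."""
    for i in range(0, len(terms), batchsize):
        yield terms[i:i + batchsize]

terms = ["AC148290", "AC239792", "AC148291", "AC148292", "AC148293"]
for batch in chunks(terms, 2):
    print(",".join(batch))  # each line would become one Entrez request
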
Example #55
0
def shred(args):
    """
    %prog shred fastafile

    Similar to the method of `shredContig` in runCA script. The contigs are
    shredded into pseudo-reads with certain length and depth.
    """
    p = OptionParser(shred.__doc__)
    p.set_depth(depth=2)
    p.add_option("--readlen", default=1000, type="int",
            help="Desired length of the reads [default: %default]")
    p.add_option("--minctglen", default=0, type="int",
            help="Ignore contig sequence less than [default: %default]")
    p.add_option("--shift", default=50, type="int",
            help="Overlap between reads must be at least [default: %default]")
    p.add_option("--fasta", default=False, action="store_true",
            help="Output shredded reads as FASTA sequences [default: %default]")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    libID = fastafile.split(".")[0]
    depth = opts.depth
    readlen = opts.readlen
    shift = opts.shift

    outfile = libID + ".depth{0}".format(depth)
    if opts.fasta:
        outfile += ".fasta"
    else:
        outfile += ".frg"
    f = Fasta(fastafile, lazy=True)

    fw = must_open(outfile, "w", checkexists=True)
    if not opts.fasta:
        print(headerTemplate.format(libID=libID), file=fw)

    """
    Taken from runCA:

                    |*********|
                    |###################|
    |--------------------------------------------------|
     ---------------1---------------
               ---------------2---------------
                         ---------------3---------------
    *** - center_increments
    ### - center_range_width
    """
    for ctgID, (name, rec) in enumerate(f.iteritems_ordered()):
        seq = rec.seq
        seqlen = len(seq)
        if seqlen < opts.minctglen:
            continue

        shredlen = min(seqlen - shift, readlen)
        numreads = max(seqlen * depth // shredlen, 1)
        center_range_width = seqlen - shredlen

        ranges = []
        if depth == 1:
            if seqlen < readlen:
                ranges.append((0, seqlen))
            else:
                for begin in range(0, seqlen, readlen - shift):
                    end = min(seqlen, begin + readlen)
                    ranges.append((begin, end))
        else:
            if numreads == 1:
                ranges.append((0, shredlen))
            else:
                prev_begin = -1
                center_increments = center_range_width * 1. / (numreads - 1)
                for i in range(numreads):
                    begin = center_increments * i
                    end = begin + shredlen
                    begin, end = int(begin), int(end)

                    if begin == prev_begin:
                        continue

                    ranges.append((begin, end))
                    prev_begin = begin

        for shredID, (begin, end) in enumerate(ranges):
            shredded_seq = seq[begin:end]
            fragID = "{0}.{1}.frag{2}.{3}-{4}".format(libID, ctgID, shredID, begin, end)
            emitFragment(fw, fragID, libID, shredded_seq, fasta=opts.fasta)

    fw.close()
    logging.debug("Shredded reads are written to `{0}`.".format(outfile))
    return outfile
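
For depth > 1, the shred windows are spaced evenly: numreads start positions stepped by center_increments across seqlen - shredlen, exactly as the ASCII diagram above sketches. Extracted into a standalone function:

def shred_ranges(seqlen, shredlen, numreads):
    """Evenly spaced (begin, end) read windows across one contig."""
    if numreads == 1:
        return [(0, shredlen)]
    step = (seqlen - shredlen) / (numreads - 1)  # center_increments
    ranges, prev = [], -1
    for i in range(numreads):
        begin = int(step * i)
        if begin == prev:    # collapse windows that round to the same start
            continue
        ranges.append((begin, begin + shredlen))
        prev = begin
    return ranges

print(shred_ranges(seqlen=1000, shredlen=400, numreads=5))
# [(0, 400), (150, 550), (300, 700), (450, 850), (600, 1000)]
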
Example #56
0
def wgsim(args):
    """
    %prog wgsim fastafile

    Run dwgsim on fastafile.
    """
    p = OptionParser(wgsim.__doc__)
    p.add_option("--erate",
                 default=.02,
                 type="float",
                 help="Base error rate of the read [default: %default]")
    p.add_option(
        "--distance",
        default=500,
        type="int",
        help="Outer distance between the two ends [default: %default]")
    p.add_option("--genomesize",
                 type="int",
                 help="Genome size in Mb [default: estimate from data]")
    p.add_option("--readlen",
                 default=100,
                 type="int",
                 help="Length of the read [default: %default]")
    p.add_option("--noerrors",
                 default=False,
                 action="store_true",
                 help="Simulate reads with no errors [default: %default]")
    p.set_depth(depth=10)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    pf = fastafile.split(".")[0]

    genomesize = opts.genomesize
    size = genomesize * 1000000 if genomesize else Fasta(fastafile).totalsize
    depth = opts.depth
    readlen = opts.readlen
    readnum = size * depth // (2 * readlen)

    distance = opts.distance
    stdev = distance // 5

    outpf = "{0}.{1}bp.{2}x".format(pf, distance, depth)
    distance -= 2 * readlen  # Outer distance => Inner distance
    assert distance >= 0, "Outer distance must be >= 2 * readlen"

    logging.debug("Total genome size: {0} bp".format(size))
    logging.debug("Target depth: {0}x".format(depth))
    logging.debug("Number of read pairs (2x{0}): {1}".format(readlen, readnum))

    if opts.noerrors:
        opts.erate = 0

    cmd = "dwgsim -e {0} -E {0}".format(opts.erate)
    if opts.noerrors:
        cmd += " -r 0 -R 0 -X 0 -y 0"

    cmd += " -d {0} -s {1}".format(distance, stdev)
    cmd += " -N {0} -1 {1} -2 {1}".format(readnum, readlen)
    cmd += " {0} {1}".format(fastafile, outpf)
    sh(cmd)
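
The pair count follows from the coverage identity depth = readnum * 2 * readlen / genomesize, solved for readnum. Worked numerically:

size = 10000000   # genome size in bp
depth = 10        # target coverage
readlen = 100     # per-end read length
readnum = size * depth // (2 * readlen)  # number of read *pairs*
print(readnum)    # 500000 pairs = 1000000 reads x 100 bp = 10x over 10 Mb
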
Example #57
0
def simulate(args):
    """
    %prog simulate run_dir 1 300

    Simulate BAMs with varying inserts with dwgsim. The above command will
    simulate between 1 to 300 CAGs in the HD region, in a directory called
    `run_dir`.
    """
    p = OptionParser(simulate.__doc__)
    p.add_option("--ref",
                 default="/Users/htang/projects/ref/hg38.upper.fa",
                 help="Reference genome sequence")
    add_simulate_options(p)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    rundir, startunits, endunits = args
    startunits, endunits = int(startunits), int(endunits)
    basecwd = os.getcwd()
    mkdir(rundir)
    os.chdir(rundir)
    cwd = os.getcwd()

    # Huntington region
    pad_left, pad_right = 1000, 10000
    chr, start, end = 'chr4', 3074877, 3074933
    fasta = Fasta(opts.ref)
    seq_left = fasta[chr][start - pad_left:start - 1]
    seq_right = fasta[chr][end:end + pad_right]
    motif = 'CAG'
    reffastafile = "ref.fasta"
    seq = str(fasta[chr][start - pad_left:end + pad_right])
    make_fasta(seq, reffastafile, id=chr.upper())

    # Write fake sequence
    for units in range(startunits, endunits + 1):
        pf = str(units)
        mkdir(pf)
        os.chdir(pf)
        seq = str(seq_left) + motif * units + str(seq_right)
        fastafile = pf + ".fasta"
        make_fasta(seq, fastafile, id=chr.upper())

        # Simulate reads on it
        wgsim([
            fastafile, "--depth={}".format(opts.depth),
            "--readlen={}".format(opts.readlen),
            "--distance={}".format(opts.distance), "--outfile={}".format(pf)
        ])

        read1 = pf + ".bwa.read1.fastq"
        read2 = pf + ".bwa.read2.fastq"
        samfile, _ = align(["../{}".format(reffastafile), read1, read2])
        indexed_samfile = index([samfile])

        sh("mv {} ../{}.bam".format(indexed_samfile, pf))
        sh("mv {}.bai ../{}.bam.bai".format(indexed_samfile, pf))

        os.chdir(cwd)
        shutil.rmtree(pf)

    os.chdir(basecwd)
Example #58
0
def fasta(args):
    """
    %prog fasta fastafile

    Convert reads in FASTA format to a CA frg file. If a .qual file is found,
    use it; otherwise make a fake qual file. Mates are assumed to be adjacent
    sequence records (i.e. /1, /2, /1, /2 ...) unless a matefile is given.
    """
    from jcvi.formats.fasta import clean, make_qual

    p = OptionParser(fasta.__doc__)
    p.add_option("--clean", default=False, action="store_true",
                 help="Clean up irregular chars in seq")
    p.add_option("--matefile", help="Matepairs file")
    p.add_option("--maxreadlen", default=0, type="int",
                 help="Maximum read length allowed")
    p.add_option("--minreadlen", default=1000, type="int",
                 help="Minimum read length allowed")
    p.add_option("--readname", default=False, action="store_true",
                 help="Keep read name (e.g. long Pacbio name)")
    p.set_size()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    maxreadlen = opts.maxreadlen
    minreadlen = opts.minreadlen
    if maxreadlen > 0:
        split = False
        f = Fasta(fastafile, lazy=True)
        for id, size in f.itersizes_ordered():
            if size > maxreadlen:
                logging.debug("Sequence {0} (size={1}) longer than max read len {2}".\
                                format(id, size, maxreadlen))
                split = True
                break

        if split:
            for f in split_fastafile(fastafile, maxreadlen=maxreadlen):
                fasta([f, "--maxreadlen=0"])
            return

    plate = op.basename(fastafile).split(".")[0]

    mated = (opts.size != 0)
    mean, sv = get_mean_sv(opts.size)

    if mated:
        libname = "Sanger{0}Kb-".format(opts.size / 1000) + plate
    else:
        libname = plate[:2].upper()

    frgfile = libname + ".frg"

    if opts.clean:
        cleanfasta = fastafile.rsplit(".", 1)[0] + ".clean.fasta"
        if need_update(fastafile, cleanfasta):
            clean([fastafile, "--canonical", "-o", cleanfasta])
        fastafile = cleanfasta

    if mated:
        qualfile = make_qual(fastafile, score=21)
        if opts.matefile:
            matefile = opts.matefile
            assert op.exists(matefile)
        else:
            matefile = make_matepairs(fastafile)

        cmd = "convert-fasta-to-v2.pl"
        cmd += " -l {0} -s {1} -q {2} ".format(libname, fastafile, qualfile)
        cmd += "-mean {0} -stddev {1} -m {2} ".format(mean, sv, matefile)

        sh(cmd, outfile=frgfile)
        return

    fw = must_open(frgfile, "w")
    print(headerTemplate.format(libID=libname), file=fw)

    sequential = not opts.readname
    f = Fasta(fastafile, lazy=True)
    i = j = 0
    for fragID, seq in parse_fasta(fastafile):
        if len(seq) < minreadlen:
            j += 1
            continue
        i += 1
        if sequential:
            fragID = libname + str(100000000 + i)
        emitFragment(fw, fragID, libname, seq)
    fw.close()

    logging.debug("A total of {0} fragments written to `{1}` ({2} discarded).".\
                    format(i, frgfile, j))
Example #59
0
def diagram(args):
    """
    %prog diagram

    Plot the predictive power of various evidences.
    """
    p = OptionParser(diagram.__doc__)
    opts, args, iopts = p.set_image_options(args, figsize="8x4")

    if len(args) != 0:
        sys.exit(not p.print_help())

    fig = plt.figure(1, (iopts.w, iopts.h))
    root = fig.add_axes([0, 0, 1, 1])

    # Gauge on top, this is log-scale
    yy = .7
    yinterval = .1
    height = .05
    yp = yy - yinterval - height
    canvas = .95
    xstart = .025
    convert = lambda x: xstart + x * canvas / 1200
    # Symbols
    root.text(.5,
              .9,
              r"$L$: Read length, $F$: Flank size, $V$: Pair distance",
              ha="center")
    root.text(.5, .85, r"ex. $L=150bp, F=9bp, V=500bp$", ha="center")
    root.text(xstart + canvas,
              yy - height,
              "STR repeat length",
              ha="center",
              color=lsg,
              size=10)

    # Mark the key events
    pad = .02
    arrowlen = canvas * 1.05
    arrowprops = dict(length_includes_head=True,
                      width=.01,
                      fc=lsg,
                      lw=0,
                      head_length=arrowlen * .12,
                      head_width=.04)
    p = FancyArrow(xstart, yy, arrowlen, 0, shape="right", **arrowprops)
    root.add_patch(p)

    ppad = 30
    keyevents = (
        (0, 0, -1, r"$0$"),
        (150 - 18, 150 - 18 - ppad, 0, r"$L - 2F$"),
        (150 - 9, 150 - 9, 1, r"$L - F$"),
        (150, 150 + ppad, 2, r"$L$"),
        (500 - 9, 500 - 9, 3, r"$V - F$"),
        (500 * 2 - 18, 500 * 2 - 18, 2, r"$2(V - F)$"),
    )
    for event, pos, i, label in keyevents:
        _event = convert(event)
        _pos = convert(pos)
        root.plot((_event, _event), (yy - height / 4, yy + height / 4),
                  '-',
                  color='k')
        root.text(_pos, yy + pad, label, rotation=45, va="bottom", size=8)
        if i < 0:
            continue
        ystart = yp - i * yinterval
        root.plot((_event, _event), (ystart, yy - height / 4), ':', color=lsg)

    # Range on bottom. These are simple 4 rectangles, with the range indicating
    # the predictive range.
    CLOSED, OPEN = range(2)
    ranges = (
        (0, 150 - 18, CLOSED, "Spanning reads"),
        (9, 150 - 9, OPEN, "Partial reads"),
        (150, 500 * 2 - 18, CLOSED, "Repeat reads"),
        (0, 500 - 9, CLOSED, "Paired-end reads"),
    )
    for start, end, starttag, label in ranges:
        _start = convert(start)
        _end = convert(end)
        data = [[0., 1.], [0., 1.]] if starttag == OPEN else \
               [[1., 0.], [1., 0.]]
        root.imshow(data,
                    interpolation='bicubic',
                    cmap=plt.cm.Greens,
                    extent=[_start, _end, yp, yp + height])
        root.text(_end + pad, yp + height / 2, label, va="center")
        yp -= yinterval

    normalize_axes(root)

    image_name = "diagram." + iopts.format
    savefig(image_name, dpi=iopts.dpi, iopts=iopts)
Example #60
0
def evidences(args):
    """
    %prog evidences

    Plot distribution of evidences against two factors:
    - Sample mean coverage
    - Longer allele
    """
    p = OptionParser(evidences.__doc__)
    p.add_option("--csv",
                 default="hli.20170328.tred.tsv",
                 help="TRED csv output to plot")
    opts, args, iopts = p.set_image_options(args, format="pdf")

    if len(args) != 0:
        sys.exit(not p.print_help())

    format = iopts.format

    # Extract sample coverage first
    df = pd.read_csv("qc-export-MeanCoverage.csv",
                     header=None,
                     names=["Samplekey", "MeanCoverage"],
                     index_col=0)

    # Find coverage for HD
    xf = pd.read_csv(opts.csv, sep="\t", index_col=0)
    dp = {}
    tred = "HD"
    for sk, row in xf.iterrows():
        sk = str(sk)
        a1 = row[tred + ".1"]
        a2 = row[tred + ".2"]
        fdp = row[tred + ".FDP"]
        pdp = row[tred + ".PDP"]
        pedp = row[tred + ".PEDP"]
        dp[sk] = (a1, a2, fdp, pdp, pedp)

    # Build a consolidated dataframe
    ef = pd.DataFrame.from_dict(dp, orient="index")
    ef.columns = [
        tred + ".1", tred + ".2", tred + ".FDP", tred + ".PDP", tred + ".PEDP"
    ]
    ef.index.name = "SampleKey"
    mf = df.merge(ef, how="right", left_index=True, right_index=True)

    # Plot a bunch of figures
    outdir = "output"
    mkdir(outdir)
    xlim = ylim = (0, 100)
    draw_jointplot(outdir + "/A",
                   "MeanCoverage",
                   "HD.FDP",
                   data=mf,
                   xlim=xlim,
                   ylim=ylim,
                   format=format)
    draw_jointplot(outdir + "/B",
                   "MeanCoverage",
                   "HD.PDP",
                   data=mf,
                   color='g',
                   xlim=xlim,
                   ylim=ylim,
                   format=format)
    draw_jointplot(outdir + "/C",
                   "MeanCoverage",
                   "HD.PEDP",
                   data=mf,
                   color='m',
                   xlim=xlim,
                   ylim=ylim,
                   format=format)

    xlim = (0, 50)
    draw_jointplot(outdir + "/D",
                   "HD.2",
                   "HD.FDP",
                   data=mf,
                   xlim=xlim,
                   ylim=ylim,
                   format=format)
    draw_jointplot(outdir + "/E",
                   "HD.2",
                   "HD.PDP",
                   data=mf,
                   color='g',
                   xlim=xlim,
                   ylim=ylim,
                   format=format)
    draw_jointplot(outdir + "/F",
                   "HD.2",
                   "HD.PEDP",
                   data=mf,
                   color='m',
                   xlim=xlim,
                   ylim=ylim,
                   format=format)