Esempio n. 1
0
def batchseeds(args):
    """
    %prog batchseeds folder

    Extract seed metrics for each image in a directory.
    """
    # NOTE: the docstring above is passed to OptionParser as the usage text,
    # so it is kept verbatim.
    #
    # Runs seeds() on every image found in `folder`, writes one row per
    # returned object to `<folder>-output.tsv`, concatenates the PDFs found in
    # `<folder>-debug` into `<folder>-output.pdf`, and returns the path of the
    # TSV file.
    from jcvi.formats.pdf import cat

    # Everything after the first argument is forwarded to each seeds() call
    xargs = args[1:]
    p = OptionParser(batchseeds.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    folder = folder.rstrip('/')
    outdir = folder + "-debug"
    outfile = folder + "-output.tsv"
    assert op.isdir(folder)

    # Prefer the --calibrate option, then `calibrate.json` inside the folder;
    # run without calibration when the chosen file does not exist
    jsonfile = opts.calibrate or op.join(folder, "calibrate.json")
    if not op.exists(jsonfile):
        jsonfile = None

    # Suffixes that are skipped (presumably by-products of an earlier run)
    skipped_suffixes = (".resize.jpg", ".main.jpg", ".label.jpg")
    images = []
    for im in iglob(folder, "*.jpg", "*.JPG", "*.png"):
        if im.endswith(skipped_suffixes):
            continue
        if op.basename(im).startswith("calibrate"):
            continue
        images.append(im)

    fw = must_open(outfile, 'w')
    nseeds = 0
    try:
        print >> fw, Seed.header(calibrate=jsonfile)
        for im in images:
            imargs = [im, "--noheader", "--outdir={0}".format(outdir)] + xargs
            if jsonfile:
                imargs += ["--calibrate={0}".format(jsonfile)]
            objects = seeds(imargs)
            for o in objects:
                print >> fw, o
            nseeds += len(objects)
    finally:
        # Close the table even when seeds() fails part-way through
        fw.close()
    logging.debug("Processed {0} images.".format(len(images)))
    logging.debug("A total of {0} objects written to `{1}`.".\
                    format(nseeds, outfile))

    pdfs = iglob(outdir, "*.pdf")
    outpdf = folder + "-output.pdf"
    cat(pdfs + ["--outfile={0}".format(outpdf)])

    logging.debug("Debugging information written to `{0}`.".format(outpdf))
    return outfile
Esempio n. 2
0
def batchseeds(args):
    """
    %prog batchseeds folder

    Extract seed metrics for each image in a directory.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # For every eligible image in `folder` this calls seeds(), appends the
    # returned objects to `<folder>-output.tsv`, then merges the PDFs found in
    # `<folder>-debug` into `<folder>-output.pdf`. Returns the TSV path.
    from jcvi.formats.pdf import cat

    # Arguments after the first one are passed through to seeds()
    xargs = args[1:]
    p = OptionParser(batchseeds.__doc__)
    opts, args, iopts = add_seeds_options(p, args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    folder = folder.rstrip('/')
    outdir = folder + "-debug"
    outfile = folder + "-output.tsv"
    assert op.isdir(folder)

    # --calibrate wins over `<folder>/calibrate.json`; a missing file means
    # no calibration is requested from seeds()
    jsonfile = opts.calibrate or op.join(folder, "calibrate.json")
    if not op.exists(jsonfile):
        jsonfile = None

    images = []
    for im in iglob(folder, "*.jpg,*.JPG,*.png"):
        # Ignore images carrying one of these suffixes
        if im.endswith((".resize.jpg", ".main.jpg", ".label.jpg")):
            continue
        # Ignore files whose name starts with `calibrate`
        if op.basename(im).startswith("calibrate"):
            continue
        images.append(im)

    fw = must_open(outfile, 'w')
    nseeds = 0
    try:
        print >> fw, Seed.header(calibrate=jsonfile)
        for im in images:
            imargs = [im, "--noheader", "--outdir={0}".format(outdir)] + xargs
            if jsonfile:
                imargs += ["--calibrate={0}".format(jsonfile)]
            objects = seeds(imargs)
            for o in objects:
                print >> fw, o
            nseeds += len(objects)
    finally:
        # Do not leak the handle if one of the images makes seeds() raise
        fw.close()
    logging.debug("Processed {0} images.".format(len(images)))
    logging.debug("A total of {0} objects written to `{1}`.".\
                    format(nseeds, outfile))

    pdfs = iglob(outdir, "*.pdf")
    outpdf = folder + "-output.pdf"
    cat(pdfs + ["--outfile={0}".format(outpdf)])

    logging.debug("Debugging information written to `{0}`.".format(outpdf))
    return outfile
Esempio n. 3
0
File: age.py Progetto: xuanblo/jcvi
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Each positional argument is a folder expected to contain one
    # `telomeres*` and one `ccn*` sub-directory. One output row is produced
    # per folder and the combined table is written to --outfile
    # (default `age.tsv`) as TSV.
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres: first row of `tel_lengths.txt` becomes the base record
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # `.ix` has been removed from pandas; `.iloc[0]` selects the same
        # first row on the default integer index produced by read_csv
        d1 = df.iloc[0].to_dict()

        # ccn: merge the keys of the first `*.ccn.json` into the record
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        with open(filename) as fp:
            js = json.load(fp)
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Esempio n. 4
0
File: hic.py Progetto: xuanblo/jcvi
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    parser = OptionParser(agp.__doc__)
    parser.set_outfile()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())

    # Write one AGP object per `*.ordering` file, named after the part of the
    # file name before the first dot, and remember which contigs were placed
    anchored = set()
    for ordering_path in orderingfiles:
        ordering = ContigOrdering(ordering_path)
        anchored.update([entry.contig_name for entry in ordering])
        object_name = op.basename(ordering_path).split('.')[0]
        ordering.write_agp(object_name, sizes, fwagp)

    # Contigs that appear in no ordering file are emitted on their own
    singletons = contigs - anchored
    message = 'Anchored: {}, Singletons: {}'.format(
        len(anchored), len(singletons))
    logging.debug(message)

    for contig in natsorted(singletons):
        order_to_agp(contig, [(contig, "?")], sizes, fwagp)
Esempio n. 5
0
File: age.py Progetto: xuanblo/jcvi
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # For each folder given, the first `*-traits.json` file is loaded and two
    # extra keys (`skin_rgb`, `eye_rgb`) are computed with make_rgb() from the
    # L/A/B values under `traits`. Folders without such a file are skipped.
    # The rendered template is written to `report.html` in the current
    # directory.
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        # Close the input promptly instead of leaking one handle per sample
        with open(filename) as fp:
            js = json.load(fp)
        for rgb_key, trait in (("skin_rgb", "skin-color"),
                               ("eye_rgb", "eye-color")):
            lab = js["traits"][trait]
            js[rgb_key] = make_rgb(lab["L"], lab["A"], lab["B"])
        samples.append(js)

    template = Template(traits_template)
    # The context manager closes the report even if rendering fails
    with open("report.html", "w") as fw:
        print >> fw, template.render(samples=samples)
        logging.debug("Report written to `{}`".format(fw.name))
Esempio n. 6
0
def compilevcf(args):
    """
    %prog compilevcf dir

    Compile vcf outputs into lists.
    """
    from jcvi.variation.str import LobSTRvcf

    p = OptionParser(compilevcf.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    vcf_files = iglob(folder, "*.vcf,*.vcf.gz")
    for vcf_file in vcf_files:
        try:
            p = LobSTRvcf(columnidsfile=None)
            p.parse(vcf_file, filtered=False)
            res = p.items()
            if res:
                k, v = res[0]
                res = v.replace(',', '/')
            else:
                res = "-1/-1"
            num = op.basename(vcf_file).split(".")[0]
            print num, res
        except (TypeError, AttributeError) as e:
            p = TREDPARSEvcf(vcf_file)
            continue
Esempio n. 7
0
def agp(args):
    """
    %prog agp main_results/ contigs.fasta

    Generate AGP file based on LACHESIS output.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    p = OptionParser(agp.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    odir, contigsfasta = args
    fwagp = must_open(opts.outfile, 'w')
    orderingfiles = natsorted(iglob(odir, "*.ordering"))
    sizes = Sizes(contigsfasta).mapping
    contigs = set(sizes.keys())
    anchored = set()

    # Pass 1: every ordering file yields one AGP object; the object name is
    # the ordering file's base name up to its first dot
    for ofile in orderingfiles:
        co = ContigOrdering(ofile)
        for member in co:
            anchored.add(member.contig_name)
        co.write_agp(op.basename(ofile).split('.')[0], sizes, fwagp)

    # Pass 2: whatever was never anchored is written as a singleton
    singletons = contigs - anchored
    n_anchored, n_singletons = len(anchored), len(singletons)
    logging.debug('Anchored: {}, Singletons: {}'.format(n_anchored,
                                                        n_singletons))

    for name in natsorted(singletons):
        order_to_agp(name, [(name, "?")], sizes, fwagp)
Esempio n. 8
0
def scan_read_files(trimmed, patterns):
    """Find read files and the distinct sample names they carry.

    `trimmed` and `patterns` are handed to iglob() unchanged. A sample name
    is the part of a file's base name before its first dot.

    Returns a `(reads, samples)` pair: the iglob() result and the sorted list
    of unique sample names.
    """
    reads = iglob(trimmed, patterns)
    unique_names = {op.basename(path).split(".")[0] for path in reads}
    samples = sorted(unique_names)
    summary = "Total {0} read files from {1} samples".format(
        len(reads), len(samples)
    )
    logging.debug(summary)
    return reads, samples
Esempio n. 9
0
def compile(args):
    """
    %prog compile directory

    Extract telomere length and ccn.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Builds one record per input folder from its `telomeres*` and `ccn*`
    # sub-directories and writes all records as a TSV to --outfile
    # (default `age.tsv`).
    p = OptionParser(compile.__doc__)
    p.set_outfile(outfile="age.tsv")
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    dfs = []
    for folder in args:
        ofolder = os.listdir(folder)

        # telomeres
        subdir = [x for x in ofolder if x.startswith("telomeres")][0]
        subdir = op.join(folder, subdir)
        filename = op.join(subdir, "tel_lengths.txt")
        df = pd.read_csv(filename, sep="\t")
        # First data row by position; replaces the removed `.ix` indexer
        d1 = df.iloc[0].to_dict()

        # ccn
        subdir = [x for x in ofolder if x.startswith("ccn")][0]
        subdir = op.join(folder, subdir)
        filename = iglob(subdir, "*.ccn.json")[0]
        # Read through a context manager so the handle is always closed
        with open(filename) as handle:
            js = json.load(handle)
        # Keys from the JSON extend (and may override) the telomere columns
        d1.update(js)
        df = pd.DataFrame(d1, index=[0])
        dfs.append(df)

    df = pd.concat(dfs, ignore_index=True)
    df.to_csv(opts.outfile, sep="\t", index=False)
Esempio n. 10
0
File: age.py Progetto: zjwang6/jcvi
def traits(args):
    """
    %prog traits directory

    Make HTML page that reports eye and skin color.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Loads the first `*-traits.json` of each folder (folders without one are
    # skipped), adds `skin_rgb` / `eye_rgb` computed by make_rgb() from the
    # L/A/B trait values, and renders all samples into `report.html`.
    p = OptionParser(traits.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    samples = []
    for folder in args:
        targets = iglob(folder, "*-traits.json")
        if not targets:
            continue
        filename = targets[0]
        # `with` guarantees the JSON handle is released for every sample
        with open(filename) as fp:
            js = json.load(fp)
        skin = js["traits"]["skin-color"]
        eye = js["traits"]["eye-color"]
        js["skin_rgb"] = make_rgb(skin["L"], skin["A"], skin["B"])
        js["eye_rgb"] = make_rgb(eye["L"], eye["A"], eye["B"])
        samples.append(js)

    template = Template(traits_template)
    # The report is closed even when rendering raises
    with open("report.html", "w") as fw:
        print(template.render(samples=samples), file=fw)
        logging.debug("Report written to `{}`".format(fw.name))
Esempio n. 11
0
def mergebam(args):
    """
    %prog mergebam dir1 dir2 homo_outdir
    or
    %prog mergebam dir1 dir2/20.bam het_outdir

    Merge sets of BAMs to make diploid. Two modes:
    - Homozygous mode: pair-up the bams in the two folders and merge
    - Heterozygous mode: pair the bams in first folder with a particular bam
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Builds one `samtools merge ... && samtools index ...` command per BAM
    # pair and runs them through Parallel with --cpus workers. Output files
    # are named `<first>_<second>.bam` inside `outdir`, using the part of each
    # base name before its first dot.
    p = OptionParser(mergebam.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idir1, idir2, outdir = args
    dir1 = [idir1] if idir1.endswith(".bam") else iglob(idir1, "*.bam")
    dir2 = [idir2] if idir2.endswith(".bam") else iglob(idir2, "*.bam")
    # Make sure more or the same number of bams in first pile
    if len(dir1) < len(dir2):
        dir1, dir2 = dir2, dir1
    # Count only after the swap: stale counts previously made the swapped
    # case fall through both branches and trip the final assertion
    nbams1 = len(dir1)
    nbams2 = len(dir2)
    if nbams1 == nbams2:
        logging.debug("Homozygous mode")
    else:
        assert nbams2 == 1, "Second pile must contain a single bam"
        # Replicate the single BAM actually found (not the raw argument,
        # which may be a directory or may have been swapped)
        dir2 = dir2 * nbams1

    assert len(dir1) == len(dir2), "Two piles must contain same number of bams"
    cmd = "samtools merge {} {} {} && samtools index {}"
    cmds = []
    mkdir(outdir)
    for a, b in zip(dir1, dir2):
        ia = op.basename(a).split(".")[0]
        ib = op.basename(b).split(".")[0]
        outfile = op.join(outdir, "{}_{}.bam".format(ia, ib))
        cmds.append(cmd.format(outfile, a, b, outfile))

    p = Parallel(cmds, cpus=opts.cpus)
    p.run()
Esempio n. 12
0
def iter_project(folder, pattern, n=2):
    """Yield groups of `n` files from `folder` together with their prefix.

    Files matching `pattern` (as resolved by iglob) are taken `n` at a time
    with grouper(). For each complete group this yields `(files, pf)` where
    `files` is the group as a list and `pf` is what pairspf() returns for the
    files' base names.

    Incomplete groups are skipped, including groups that were padded with
    `None` when the number of files is not a multiple of `n`.
    """
    # Check for paired reads and extract project id
    filelist = list(iglob(folder, pattern))
    for p in grouper(filelist, n):
        # A padded trailing group would otherwise reach op.basename(None)
        if len(p) != n or None in p:
            continue

        pp = [op.basename(x) for x in p]
        pf = pairspf(pp)
        yield list(p), pf
Esempio n. 13
0
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2):
    """Iterate over complete groups of `n` read files found in `folder`.

    Yields `(files, pf)`: the group as a list, and the value pairspf()
    returns for the base names of those files. Groups that are short or
    contain a `None` filler are silently dropped.
    """
    matches = list(iglob(folder, pattern))
    for group in grouper(matches, n):
        is_complete = len(group) == n and None not in group
        if is_complete:
            basenames = [op.basename(path) for path in group]
            prefix = pairspf(basenames)
            yield list(group), prefix
Esempio n. 14
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir = args[0]
    mm = MakeManager()
    if opts.telomeres:
        # --telomeres overrides both values, as advertised in its help text
        opts.minscore = 140
        opts.period = 7

    # Positional trf parameters; the mismatch value is used twice
    param_string = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period)
    params = param_string.split()
    spaced_params = " ".join(params)
    dotted_params = ".".join(params)

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        base = op.basename(fastafile)
        pf = base.split(".")[0]
        datfile = base + "." + dotted_params + ".dat"
        bedfile = "{0}.trf.bed".format(pf)

        # Step 1: FASTA -> .dat
        cmd1 = "trf {0} {1} -d -h".format(fastafile, spaced_params)
        # Step 2: .dat -> tab-separated BED-like file prefixed with `pf`
        pipeline = [
            "cat {} | awk '($8 <= {} && $9 >= 0)'".format(datfile, READLEN),
            " | sed 's/ /\\t/g'",
            " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile),
        ]
        cmd2 = "".join(pipeline)

        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    # Final target: concatenate all per-file results
    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Esempio n. 15
0
def iter_project(folder, pattern="*.fq,*.fq.gz,*.fastq,*.fastq.gz", n=2,
                 commonprefix=True):
    """Walk `folder` and yield read files in groups of `n`.

    Each yielded item is `(files, pf)` where `files` is the sorted group and
    `pf` is the result of pairspf() on the files' base names, called with the
    given `commonprefix` flag. Groups that are not exactly `n` long or that
    contain `None` are ignored.
    """
    candidates = list(iglob(folder, pattern))
    for chunk in grouper(candidates, n):
        if len(chunk) == n and None not in chunk:
            names = [op.basename(item) for item in chunk]
            prefix = pairspf(names, commonprefix=commonprefix)
            yield sorted(chunk), prefix
Esempio n. 16
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus, gtf = opts.cpus, opts.gtf
    transcripts = "transcripts.gtf"

    mm = MakeManager()
    gtfs = []
    # One cufflinks target per BAM, written to `<prefix>_cufflinks/`
    for bam in iglob(folder, "*.bam"):
        pf = op.basename(bam).split(".")[0]
        outdir = pf + "_cufflinks"
        pieces = [
            "cufflinks",
            " -o {0}".format(outdir),
            " -p {0}".format(cpus),
        ]
        if gtf:
            pieces.append(" -g {0}".format(gtf))
        pieces.extend([
            " --frag-bias-correct {0}".format(reference),
            " --multi-read-correct",
            " {0}".format(bam),
        ])
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, "".join(pieces))
        gtfs.append(cgtf)

    # List every transcripts.gtf once all per-BAM targets exist
    assemblylist = "assembly_list.txt"
    cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, cmd)

    # Merge the listed assemblies into merged/merged.gtf
    mergedgtf = "merged/merged.gtf"
    pieces = ["cuffmerge", " -o merged", " -p {0}".format(cpus)]
    if gtf:
        pieces.append(" -g {0}".format(gtf))
    pieces.append(" -s {0}".format(reference))
    pieces.append(" {0}".format(assemblylist))
    mm.add(assemblylist, mergedgtf, "".join(pieces))

    mm.write()
Esempio n. 17
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    for flag, default, helptext in (
            ("--mismatch", 31, "Mismatch and gap penalty"),
            ("--minscore", MINSCORE, "Minimum score to report"),
            ("--period", 6, "Maximum period to report"),
            ("--minlength", MINSCORE / 2, "Minimum length of repeat tract"),
    ):
        p.add_option(flag, default=default, type="int", help=helptext)
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minlength
    # Upper bound used by the awk filter on column 8
    maxlength = READLEN - minlength
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    # Positional trf parameters; the mismatch value fills two slots
    params = "2 {0} {0} 80 10 {1} {2}".format(
        opts.mismatch, opts.minscore, opts.period).split()
    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        fasta_name = op.basename(fastafile)
        pf = fasta_name.split(".")[0]
        datfile = fasta_name + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)

        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        # Keep rows whose column 8 lies within [minlength, READLEN - minlength],
        # convert spaces to tabs, and prefix each row with `pf`
        cmd2 = "".join((
            "cat {} | awk '($8 >= {} && $8 <= {})'".format(
                datfile, minlength, maxlength),
            " | sed 's/ /\\t/g'",
            " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile),
        ))
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Esempio n. 18
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    gtf = opts.gtf
    transcripts = "transcripts.gtf"
    # Optional `-g` fragment shared by the cufflinks and cuffmerge commands
    gtf_flag = " -g {0}".format(gtf) if gtf else ""

    mm = MakeManager()
    gtfs = []
    for bam in iglob(folder, "*.bam"):
        pf = op.basename(bam).split(".")[0]
        outdir = pf + "_cufflinks"
        cmd = ("cufflinks -o {0} -p {1}{2}"
               " --frag-bias-correct {3} --multi-read-correct {4}").format(
                   outdir, cpus, gtf_flag, reference, bam)
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, cmd)
        gtfs.append(cgtf)

    # Collect the per-sample transcript files into a list for cuffmerge
    assemblylist = "assembly_list.txt"
    cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, cmd)

    mergedgtf = "merged/merged.gtf"
    cmd = "cuffmerge -o merged -p {0}{1} -s {2} {3}".format(
        cpus, gtf_flag, reference, assemblylist)
    mm.add(assemblylist, mergedgtf, cmd)

    mm.write()
Esempio n. 19
0
File: html.py Progetto: zjwang6/jcvi
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    from more_itertools import grouper
    from jcvi.apps.base import iglob

    p = OptionParser(gallery.__doc__)
    p.add_option("--columns", default=3, type="int",
                 help="How many cells per row")
    p.add_option("--width", default=200, type="int", help="Image width")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, link_prefix = args
    width = opts.width
    images = iglob(folder, "*.jpg,*.JPG,*.png")
    td = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'
    # Both values are the same for every cell, so compute them once
    base_url = link_prefix.rstrip("/")
    row_open = '<tr height="{0}" valign="top">'.format(width + 5)

    print("<table>")
    for ims in grouper(images, opts.columns):
        print(row_open)
        # Falsy entries (e.g. grouper padding) produce no cell
        for im in filter(None, ims):
            name = op.basename(im)
            caption = name.split(".")[0].replace("_", "-")
            print(td.format(caption, base_url + "/" + name, width))
        print("</tr>")
    print("</table>")
Esempio n. 20
0
    def __init__(self, filename, delimiter=','):
        """Load a layout file, creating a default one first if it is missing.

        When `filename` does not exist, a layout is written with one row per
        `*.ks` file found in the current directory. The label of each row is
        the file's base name without its last extension; if that name has
        exactly one dot, the dot is replaced by " *vs.* ".

        Every non-comment line of the file is then parsed into a LayoutLine
        using `delimiter`, after which colors and markers are assigned.
        """
        super(Layout, self).__init__(filename)
        if not op.exists(filename):
            ksfiles = iglob(".", "*.ks")
            header = "Ks file|ncomponents|label|color|marker".split("|")
            contents = []
            for ksfile in ksfiles:
                leg = op.basename(ksfile).rsplit(".", 1)[0]
                if leg.count(".") == 1:
                    leg = leg.replace(".", " *vs.* ")
                contents.append((ksfile, "1", leg, "", ""))
            write_csv(header, contents, comment=True, filename=filename)

        # Context manager closes the handle (it was previously leaked)
        with open(filename) as fp:
            for row in fp:
                # Lines starting with `#` are comments (e.g. the header)
                if row[0] == '#':
                    continue
                self.append(LayoutLine(row, delimiter=delimiter))

        self.assign_colors()
        self.assign_markers()
Esempio n. 21
0
File: ks.py Progetto: ascendo/jcvi
    def __init__(self, filename, delimiter=','):
        """Read the layout in `filename`, generating it first when absent.

        A missing file is bootstrapped from the `*.ks` files in the current
        directory: one row per file with ncomponents "1", empty color and
        marker, and a label taken from the base name minus its last
        extension (a single remaining dot becomes " *vs.* ").

        Lines beginning with `#` are skipped; each remaining line is appended
        as a LayoutLine parsed with `delimiter`. Colors and markers are
        assigned at the end.
        """
        super(Layout, self).__init__(filename)
        if not op.exists(filename):
            ksfiles = iglob(".", "*.ks")
            header = "Ks file|ncomponents|label|color|marker".split("|")
            contents = []
            for ksfile in ksfiles:
                leg = op.basename(ksfile).rsplit(".", 1)[0]
                if leg.count(".") == 1:
                    leg = leg.replace(".", " *vs.* ")
                contents.append((ksfile, "1", leg, "", ""))
            write_csv(header, contents, comment=True, filename=filename)

        # Use `with` so the layout file is closed once parsing is done
        with open(filename) as fp:
            for row in fp:
                if row[0] == '#':
                    continue
                self.append(LayoutLine(row, delimiter=delimiter))

        self.assign_colors()
        self.assign_markers()
Esempio n. 22
0
def gallery(args):
    """
    %prog gallery folder link_prefix

    Convert a folder of figures to a HTML table. For example:

    $ python -m jcvi.formats.html gallery Paper-figures/
    https://dl.dropboxusercontent.com/u/15937715/Data/Paper-figures/

    Maps the images from local to remote.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    from jcvi.apps.base import iglob
    from jcvi.utils.iter import grouper

    p = OptionParser(gallery.__doc__)
    p.add_option("--columns", default=3, type="int",
                 help="How many cells per row")
    p.add_option("--width", default=200, type="int",
                 help="Image width")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, link_prefix = args
    width = opts.width
    images = iglob(folder, "*.jpg,*.JPG,*.png")
    td = '<td>{0}<br><a href="{1}"><img src="{1}" width="{2}"></a></td>'
    remote_root = link_prefix.rstrip("/")

    print("<table>")
    for row_images in grouper(images, opts.columns):
        print('<tr height="{0}" valign="top">'.format(width + 5))
        for image in row_images:
            if image:
                # Cell caption: file name up to the first dot, `_` -> `-`
                filename = op.basename(image)
                label = filename.split('.')[0].replace('_', '-')
                url = remote_root + "/" + filename
                print(td.format(label, url, width))
        print("</tr>")
    print("</table>")
Esempio n. 23
0
def stats(args):
    """
    %prog stats folder

    Generate table summarizing .stats files.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Reads the first `label=` line of every `*.stats` file in `folder`,
    # extracts label / total / cnts from its first three fields, and writes a
    # table (plus SUM, AVERAGE and MEDIAN rows) to --outfile via write_csv().
    import logging

    p = OptionParser(stats.__doc__)
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    statsfiles = iglob(folder, "*.stats")
    after_equal = lambda x: x.split("=")[-1]
    header = "Library Assembled_reads Contigs".split()
    contents = []
    # label=M0096 total=7443 cnts=948 mean=7.851 std=35.96
    for statsfile in statsfiles:
        with open(statsfile) as fp:
            row = next((r for r in fp if r.startswith("label=")), None)
        if row is None:
            # Previously an empty file silently reused the row of the
            # preceding file (or raised NameError for the first file)
            logging.warning(
                "No `label=` line in `{0}`, skipped".format(statsfile))
            continue
        label, total, cnts = row.split()[:3]
        label = after_equal(label)
        reads = int(after_equal(total))
        contigs = int(after_equal(cnts))
        contents.append((label, reads, contigs))

    # Summary rows only make sense (and zip only unpacks) with >= 1 sample
    if contents:
        all_labels, all_reads, all_contigs = zip(*contents)
        contents.append(("SUM", sum(all_reads), sum(all_contigs)))
        contents.append(("AVERAGE (per sample)", \
                        int(np.mean(all_reads)), int(np.mean(all_contigs))))
        contents.append(("MEDIAN (per sample)", \
                        int(np.median(all_reads)), int(np.median(all_contigs))))
    write_csv(header, contents, filename=opts.outfile)
Esempio n. 24
0
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    #
    # Creates `<prefix>_<DN|GG>/`, optionally merges the input reads there,
    # and writes the assembled Trinity command line to `run.sh` inside it.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired",
                 default=False,
                 action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    assert op.exists(inparam)

    # A second positional argument switches to the genome-guided method
    genome = args[1] if len(args) == 2 else None
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    # Everything below runs inside `tfolder`; the try/finally guarantees the
    # caller's working directory is restored even if a step fails
    try:
        flist = iglob("../" + inparam, opts.names)
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2)
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(
                genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
            if gg_cpu:
                cmd += " --genome_guided_CPU {0}".format(gg_cpu)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        # Input reads: merged file(s) when --merge, else one flag per file
        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1],
                                                        reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmd += " --bypass_java_version_check"

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        os.chdir(cwd)
Esempio n. 25
0
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    # NOTE: the docstring above is the OptionParser usage text; keep verbatim.
    from jcvi.apps.base import iglob
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr", default=False, action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres", default=False, action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir = args[0]
    # Derived before the --telomeres override below, so it reflects the
    # minscore given on the command line (or its default)
    minlength = opts.minscore / 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore = 140
        opts.period = 7

    # --centromeres replaces the whole parameter set with `cparams`
    if opts.centromeres:
        params = cparams.split()
    else:
        params = "2 {0} {0} 80 10 {1} {2}".format(
            opts.mismatch, opts.minscore, opts.period).split()

    # Row filter on column 8: a length window for lobSTR, otherwise >= 0
    if opts.lobstr:
        row_filter = " | awk '($8 >= {} && $8 <= {})'".format(
            minlength, READLEN - minlength)
    else:
        row_filter = " | awk '($8 >= 0)'"

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        fasta_name = op.basename(fastafile)
        pf = fasta_name.rsplit(".", 1)[0]
        datfile = fasta_name + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)

        # The leading `-` is what makes errors from trf be ignored
        # (presumably make's ignore-errors prefix -- confirm in MakeManager)
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        cmd2 = "".join((
            "cat {} | grep -v ^Parameters".format(datfile),
            row_filter,
            " | sed 's/ /\\t/g'",
            " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile),
        ))
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Esempio n. 26
0
File: tgbs.py Progetto: fw1121/jcvi
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed, the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing and must stay in sync with the CLI.
    #
    # Workflow:
    #   1. Read the genotype matrix; keep markers whose count of '0' calls is
    #      below the --missing threshold and whose ratio of '3' calls among
    #      the non-'0' calls exceeds --het.
    #   2. Group BAM files by the part of the file name before the first '.'.
    #   3. Per contig and per group, collect the bases each read shows at the
    #      kept marker positions; only reads covering every marker are kept.
    #   4. Hand the per-group haplotype sets to HaplotypeResolver.
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    p.add_option("--missing", default=.5, type="float",
                 help="Max level of missing data")
    p.add_option("--het", default=.5, type="float",
                 help="Min level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # fastafile is part of the command-line contract but is not read here.
    matrixfile, fastafile, bamfolder = args

    # The context manager guarantees the matrix file is closed even when a
    # malformed row raises.
    with open(matrixfile) as fp:
        # The first row not starting with '#' is consumed as the header:
        # four fixed columns followed by one column per genotyped individual.
        row = ""  # keeps `row` defined when the file is empty
        for row in fp:
            if row[0] != '#':
                break
        header = row.split()
        ngenotypes = len(header) - 4
        nmissing = int(round(opts.missing * ngenotypes))
        logging.debug("A total of {0} individuals scanned".format(ngenotypes))
        logging.debug("Look for markers with < {0} missing and > {1} het".\
                        format(opts.missing, opts.het))
        bamfiles = iglob(bamfolder, "*.bam")
        logging.debug("Folder `{0}` contained {1} bam files".\
                        format(bamfolder, len(bamfiles)))

        data = []
        for row in fp:
            if row[0] == '#':
                continue
            atoms = row.split()
            seqid, pos, ref, alt = atoms[:4]
            genotypes = atoms[4:]
            counts = Counter(genotypes)
            c0 = counts.get('0', 0)  # calls coded '0' (treated as missing)
            c3 = counts.get('3', 0)  # calls coded '3' (treated as het)
            if c0 >= nmissing:
                continue
            hetratio = c3 * 1. / (ngenotypes - c0)
            if hetratio <= opts.het:
                continue
            pos = int(pos)
            data.append((seqid, pos, ref, alt, counts, hetratio))

    data.sort()
    logging.debug("A total of {0} target markers in {1} contigs.".\
                    format(len(data), len(set(x[0] for x in data))))
    # (group name, open BAM handle); the group name is the BAM file name up
    # to its first '.'.  Sort on the name only so that handles are never
    # compared with each other.
    samfiles = [(op.basename(x).split(".")[0], pysam.AlignmentFile(x, "rb"))
                for x in bamfiles]
    samfiles.sort(key=lambda x: x[0])
    logging.debug("BAM files grouped to {0} individuals".\
                    format(len(set(x[0] for x in samfiles))))

    # fw may be a standard stream depending on must_open, so it is
    # deliberately not closed here.
    fw = must_open(opts.outfile, "w")
    try:
        for seqid, d in groupby(data, lambda x: x[0]):
            positions = [x[1] for x in d]
            nmarkers = len(positions)
            logging.debug("Process contig {0} ({1} markers)".\
                            format(seqid, nmarkers))
            # 1-based marker position -> slot in the haplotype string
            # (first occurrence wins, matching list.index()).
            slots = {}
            for i, pos in enumerate(positions):
                slots.setdefault(pos, i)

            haplotype_set = []
            for pf, sf in groupby(samfiles, key=lambda x: x[0]):
                haplotypes = set()
                for pfi, samfile in sf:
                    reads = defaultdict(list)
                    # One pass over the contig pileup serves every marker,
                    # instead of re-scanning the contig once per marker.
                    for col in samfile.pileup(seqid):
                        pos = col.reference_pos + 1
                        if pos not in slots:
                            continue
                        for r in col.pileups:
                            # Deletions / reference skips have no read base
                            # at this column (query_position is None).
                            if r.is_del or r.query_position is None:
                                continue
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[r.query_position]
                            reads[rname].append((slots[pos], rbase))
                    for calls in reads.values():
                        hap = ['-'] * nmarkers
                        for i, rbase in calls:
                            hap[i] = rbase
                        hap = "".join(hap)
                        # Keep only reads that cover every marker.
                        if "-" in hap:
                            continue
                        haplotypes.add(hap)
                haplotype_set.append(haplotypes)
            hr = HaplotypeResolver(haplotype_set)
            # Same output as `print >> fw, seqid, hr`.
            fw.write("{0} {1}\n".format(seqid, str(hr)))
            hr.solve(fw)
    finally:
        for pf, samfile in samfiles:
            samfile.close()
Esempio n. 27
0
def resolve(args):
    """
    %prog resolve matrixfile fastafile bamfolder

    Separate repeats along collapsed contigs. First scan the matrixfile for
    largely heterozygous sites. For each heterozygous site, we scan each bam to
    retrieve distinct haplotypes. The frequency of each haplotype is then
    computed, the haplotype with the highest frequency, assumed to be
    paralogous, is removed.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text,
    # so it is user-facing and must stay in sync with the CLI.
    #
    # Workflow:
    #   1. Read the genotype matrix; keep markers whose count of '0' calls is
    #      below the --missing threshold and whose ratio of '3' calls among
    #      the non-'0' calls exceeds --het.
    #   2. Group BAM files by the part of the file name before the first '.'.
    #   3. Per contig and per group, collect the bases each read shows at the
    #      kept marker positions; only reads covering every marker are kept.
    #   4. Hand the per-group haplotype sets to HaplotypeResolver.
    import pysam
    from collections import defaultdict
    from itertools import groupby

    p = OptionParser(resolve.__doc__)
    # Both thresholds take part in float arithmetic/comparisons below, so
    # they are declared as floats explicitly rather than left untyped.
    p.add_option("--missing",
                 default=.5,
                 type="float",
                 help="Maximum level of missing data")
    # --het is a lower bound: markers with hetratio <= opts.het are dropped.
    p.add_option("--het",
                 default=.5,
                 type="float",
                 help="Minimum level of heterozygous calls")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # fastafile is part of the command-line contract but is not read here.
    matrixfile, fastafile, bamfolder = args

    # The context manager guarantees the matrix file is closed even when a
    # malformed row raises.
    with open(matrixfile) as fp:
        # The first row not starting with '#' is consumed as the header:
        # four fixed columns followed by one column per genotyped individual.
        row = ""  # keeps `row` defined when the file is empty
        for row in fp:
            if row[0] != '#':
                break
        header = row.split()
        ngenotypes = len(header) - 4
        nmissing = int(round(opts.missing * ngenotypes))
        logging.debug("A total of {0} individuals scanned".format(ngenotypes))
        logging.debug("Look for markers with < {0} missing and > {1} het".\
                        format(opts.missing, opts.het))
        bamfiles = iglob(bamfolder, "*.bam")
        logging.debug("Folder `{0}` contained {1} bam files".\
                        format(bamfolder, len(bamfiles)))

        data = []
        for row in fp:
            if row[0] == '#':
                continue
            atoms = row.split()
            seqid, pos, ref, alt = atoms[:4]
            genotypes = atoms[4:]
            counts = Counter(genotypes)
            c0 = counts.get('0', 0)  # calls coded '0' (treated as missing)
            c3 = counts.get('3', 0)  # calls coded '3' (treated as het)
            if c0 >= nmissing:
                continue
            hetratio = c3 * 1. / (ngenotypes - c0)
            if hetratio <= opts.het:
                continue
            pos = int(pos)
            data.append((seqid, pos, ref, alt, counts, hetratio))

    data.sort()
    logging.debug("A total of {0} target markers in {1} contigs.".\
                    format(len(data), len(set(x[0] for x in data))))
    # (group name, open BAM handle); the group name is the BAM file name up
    # to its first '.'.  Sort on the name only so that handles are never
    # compared with each other.
    samfiles = [(op.basename(x).split(".")[0], pysam.AlignmentFile(x, "rb"))
                for x in bamfiles]
    samfiles.sort(key=lambda x: x[0])
    logging.debug("BAM files grouped to {0} individuals".\
                    format(len(set(x[0] for x in samfiles))))

    # fw may be a standard stream depending on must_open, so it is
    # deliberately not closed here.
    fw = must_open(opts.outfile, "w")
    try:
        for seqid, d in groupby(data, lambda x: x[0]):
            positions = [x[1] for x in d]
            nmarkers = len(positions)
            logging.debug("Process contig {0} ({1} markers)".format(
                seqid, nmarkers))
            # 1-based marker position -> slot in the haplotype string
            # (first occurrence wins, matching list.index()).
            slots = {}
            for i, pos in enumerate(positions):
                slots.setdefault(pos, i)

            haplotype_set = []
            for pf, sf in groupby(samfiles, key=lambda x: x[0]):
                haplotypes = set()
                for pfi, samfile in sf:
                    reads = defaultdict(list)
                    # One pass over the contig pileup serves every marker,
                    # instead of re-scanning the contig once per marker.
                    for col in samfile.pileup(seqid):
                        pos = col.reference_pos + 1
                        if pos not in slots:
                            continue
                        for r in col.pileups:
                            # Deletions / reference skips have no read base
                            # at this column (query_position is None).
                            if r.is_del or r.query_position is None:
                                continue
                            rname = r.alignment.query_name
                            rbase = r.alignment.query_sequence[
                                r.query_position]
                            reads[rname].append((slots[pos], rbase))
                    for calls in reads.values():
                        hap = ['-'] * nmarkers
                        for i, rbase in calls:
                            hap[i] = rbase
                        hap = "".join(hap)
                        # Keep only reads that cover every marker.
                        if "-" in hap:
                            continue
                        haplotypes.add(hap)
                haplotype_set.append(haplotypes)
            hr = HaplotypeResolver(haplotype_set)
            # Same output as `print >> fw, seqid, hr`.
            fw.write("{0} {1}\n".format(seqid, str(hr)))
            hr.solve(fw)
    finally:
        for pf, samfile in samfiles:
            samfile.close()
Esempio n. 28
0
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads `all.GLM` from cached_data/ into an N x N integer matrix indexed
    # by contig, then runs the GA on the first `*.ordering` file found in
    # main_results/ (natural sort order).  Every tour reported through the
    # callback, plus the initial one, is written to a file named `tour` in
    # the current directory.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    # Context manager so the GLM handle is released as soon as it is parsed.
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            # Rows whose first field is the literal 'X' carry no contact
            # count (presumably a column-header row - TODO confirm).
            if x == 'X':
                continue
            M[int(x), int(y)] = int(z)

    # `with` guarantees the tour file is flushed and closed even when the
    # GA run raises part-way through.
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Label each reported tour as GA-<generation>[-<fitness>].
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                # Keep only the first component of the fitness repr.
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            for x in co:
                contig_id = contig_ids[x.contig_name]
                oo.append(contig_id)
            pf = op.basename(ofile).split(".")[0]
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate",
                             score_evaluate_M,
                             tour_sizes=tour_sizes,
                             tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox,
                                        npop=100,
                                        cpus=opts.cpus,
                                        callback=callbacki)
            # Same output as `print tour, tour.fitness`.
            print("{0} {1}".format(str(tour), str(tour.fitness)))
            # Only the first ordering file is scored.
            break
Esempio n. 29
0
File: hic.py Progetto: xuanblo/jcvi
def score(args):
    """
    %prog score main_results/ cached_data/ contigsfasta

    Score the current LACHESIS CLM.
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Reads `all.GLM` from cached_data/ into an N x N integer matrix indexed
    # by contig, then runs the GA on the first `*.ordering` file found in
    # main_results/ (natural sort order).  Every tour reported through the
    # callback, plus the initial one, is written to a file named `tour` in
    # the current directory.
    p = OptionParser(score.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    mdir, cdir, contigsfasta = args
    orderingfiles = natsorted(iglob(mdir, "*.ordering"))
    sizes = Sizes(contigsfasta)
    contig_names = list(sizes.iter_names())
    contig_ids = dict((name, i) for (i, name) in enumerate(contig_names))

    oo = []
    # Load contact matrix
    glm = op.join(cdir, "all.GLM")
    N = len(contig_ids)
    M = np.zeros((N, N), dtype=int)
    # Context manager so the GLM handle is released as soon as it is parsed.
    with open(glm) as fp:
        for row in fp:
            if row[0] == '#':
                continue
            x, y, z = row.split()
            # Rows whose first field is the literal 'X' carry no contact
            # count (presumably a column-header row - TODO confirm).
            if x == 'X':
                continue
            M[int(x), int(y)] = int(z)

    # `with` guarantees the tour file is flushed and closed even when the
    # GA run raises part-way through.
    with open("tour", "w") as fwtour:

        def callback(tour, gen, oo):
            # Label each reported tour as GA-<generation>[-<fitness>].
            fitness = tour.fitness if hasattr(tour, "fitness") else None
            label = "GA-{0}".format(gen)
            if fitness:
                # Keep only the first component of the fitness repr.
                fitness = "{0}".format(fitness).split(",")[0].replace("(", "")
                label += "-" + fitness
            print_tour(fwtour, tour, label, contig_names, oo)
            return tour

        for ofile in orderingfiles:
            co = ContigOrdering(ofile)
            for x in co:
                contig_id = contig_ids[x.contig_name]
                oo.append(contig_id)
            pf = op.basename(ofile).split(".")[0]
            print(pf)
            print(oo)

            tour, tour_sizes, tour_M = prepare_ec(oo, sizes, M)
            # Store INIT tour
            print_tour(fwtour, tour, "INIT", contig_names, oo)

            # Faster Cython version for evaluation
            from .chic import score_evaluate_M
            callbacki = partial(callback, oo=oo)
            toolbox = GA_setup(tour)
            toolbox.register("evaluate", score_evaluate_M,
                             tour_sizes=tour_sizes, tour_M=tour_M)
            tour, tour.fitness = GA_run(toolbox, npop=100, cpus=opts.cpus,
                                        callback=callbacki)
            # Same output as `print tour, tour.fitness`.
            print("{0} {1}".format(str(tour), str(tour.fitness)))
            # Only the first ordering file is scored.
            break
Esempio n. 30
0
def scan_read_files(trimmed, patterns):
    """
    List the read files under `trimmed` that match `patterns` and derive the
    sample names from them.

    A sample name is the file's base name up to (not including) its first
    dot. Returns a tuple `(reads, samples)` where `reads` is whatever
    `iglob` returned and `samples` is the sorted list of unique names.
    """
    reads = iglob(trimmed, patterns)

    unique_names = set()
    for path in reads:
        filename = op.basename(path)
        unique_names.add(filename.split(".")[0])
    samples = sorted(unique_names)

    message = "Total {0} read files from {1} samples"
    logging.debug(message.format(len(reads), len(samples)))
    return reads, samples
Esempio n. 31
0
def prepare(args):
    """
    %prog prepare [--options] folder [genome.fasta]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN

    If genome.fasta is provided, prepare script for GG-Trinity.
    If coord-sorted BAM is provided, then it will use it as starting point.

    Since GG-Trinity jobs are partitioned DN-Trinity jobs run on relatively small
    regions, lesser amount of CPU can be specified for each DN job using `--gg_cpu`
    In such cases, the `--cpu` should be set to a larger value to help speedup
    upstream steps such as GSNAP read mapping or coordinate sorting of BAM files.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Creates the work folder `<prefix>_<DN|GG>`, optionally merges the input
    # FASTQ files inside it, and writes the Trinity command line to `run.sh`
    # in that folder.  The caller's working directory is always restored.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]
    genome = args[1] if len(args) == 2 else None
    # A genome argument switches from de novo (DN) to genome-guided (GG).
    method = "GG" if genome is not None else "DN"

    paired = opts.paired
    merge = opts.merge
    thome = opts.trinity_home
    use_bam = opts.use_bam
    gg_cpu = opts.gg_cpu

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)
    # Location of the reads as seen from inside tfolder.  For a plain folder
    # name this is "../<folder>"; unlike string concatenation it also stays
    # correct for nested, absolute or trailing-slash paths.
    indir = op.relpath(inparam, tfolder)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        flist = iglob(indir, "*.fq", "*.fastq", "*.fq.gz", "*.fastq.gz")
        if paired:
            f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x]
            f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x]
            assert len(f1) == len(f2), \
                "Unequal number of left ({0}) and right ({1}) read files".\
                format(len(f1), len(f2))
            if merge:
                r1, r2 = "left.fastq", "right.fastq"
                reads = ((f1, r1), (f2, r2))
        else:
            if merge:
                r = "single.fastq"
                reads = ((flist, r), )

        # `reads` only exists in merge mode: (input files, merged file) pairs.
        if merge:
            for fl, r in reads:
                fm = FileMerger(fl, r)
                fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity")
        cmd += " --seqType fq --JM {0} --CPU {1}".format(opts.JM, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)
        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome {0} --genome_guided_max_intron {1}".format(genome, opts.max_intron)
            if use_bam:
                cmd += " --genome_guided_use_bam {0}".format(use_bam)
            if gg_cpu:
                cmd += " --genome_guided_CPU {0}".format(gg_cpu)
        if opts.grid and opts.grid_conf_file:
            cmd += " --grid_conf_file={0}".format(opts.grid_conf_file)

        if paired:
            if merge:
                cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
            else:
                for lf, rf in zip(f1, f2):
                    cmd += " --left {0}".format(lf)
                    cmd += " --right {0}".format(rf)
        else:
            if merge:
                cmd += " --single {0}".format(reads[0][-1])
            else:
                for f in flist:
                    cmd += " --single {0}".format(f)
        if opts.extra:
            cmd += " {0}".format(opts.extra)

        runfile = "run.sh"
        write_file(runfile, cmd)
    finally:
        # Restore the caller's directory even if preparation failed.
        os.chdir(cwd)
Esempio n. 32
0
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # NOTE: the docstring above is handed to OptionParser as the usage text.
    #
    # Creates the work folder `<prefix>_<DN|GG>`, optionally merges the input
    # FASTQ files inside it (DN mode only), and writes a `run.sh` there that
    # exports TRINITY_HOME, runs Trinity and, with --cleanup, removes
    # everything but the final outputs.  The caller's working directory is
    # always restored.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    inparam, = args[:1]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # Genome-guided (GG) mode is selected only when --bam points at an
    # existing file; otherwise fall back to de novo (DN).
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)
    # Location of the reads as seen from inside tfolder.  For a plain folder
    # name this is "../<folder>"; unlike string concatenation it also stays
    # correct for nested, absolute or trailing-slash paths.
    indir = op.relpath(inparam, tfolder)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        cmds = []

        # set TRINITY_HOME env variable when preparing shell script
        env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
        cmds.append(env_cmd)

        if method == "DN":
            assert op.exists(indir), \
                "Input folder not found: {0}".format(inparam)

            flist = iglob(indir, opts.names)
            if paired:
                f1 = [x for x in flist if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
                f2 = [x for x in flist if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
                assert len(f1) == len(f2), \
                    "Unequal number of left ({0}) and right ({1}) read files".\
                    format(len(f1), len(f2))
                if merge:
                    r1, r2 = "left.fastq", "right.fastq"
                    reads = ((f1, r1), (f2, r2))
            else:
                if merge:
                    r = "single.fastq"
                    reads = ((flist, r), )

            # `reads` only exists in merge mode: (input files, merged file).
            if merge:
                for fl, r in reads:
                    fm = FileMerger(fl, r)
                    fm.merge(checkexists=True)

        cmd = op.join(trinity_home, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)

        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome_guided_bam {0}".format(bam)
            cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
        else:
            if paired:
                if merge:
                    cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
                else:
                    cmd += " --left {0}".format(",".join(f1))
                    cmd += " --right {0}".format(",".join(f2))
            else:
                if merge:
                    cmd += " --single {0}".format(reads[0][-1])
                else:
                    for f in flist:
                        cmd += " --single {0}".format(f)

        if opts.grid and opts.grid_conf_file:
            hpc_grid_runner = op.join(hpc_grid_runner_home, "hpc_cmds_GridRunner.pl")
            hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf", opts.grid_conf_file)
            assert op.exists(hpc_grid_conf_file), "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)

            cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(hpc_grid_runner, hpc_grid_conf_file)

        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmds.append(cmd)

        if opts.cleanup:
            cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
                if method == "DN" else \
                'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
            # Append to the command *list*; `cmd` is the Trinity command
            # string and has no append().
            cmds.append(cleanup_cmd)

        runfile = "run.sh"
        write_file(runfile, "\n".join(cmds))
    finally:
        # Restore the caller's directory even if preparation failed.
        os.chdir(cwd)