Example #1
0
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    # The docstring above is handed to OptionParser as the usage text, so it
    # is part of the program's output and is kept verbatim.
    parser = OptionParser(star.__doc__)
    parser.add_option("--single", default=False, action="store_true",
                      help="Single end mapping")
    parser.set_fastq_names()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, reference = args
    makefile = MakeManager()

    # One read file per sample with --single, otherwise two.
    reads_per_sample = 1 if opts.single else 2

    genome_dir = "GenomeDir"
    mkdir(genome_dir)
    star_base = "STAR --runThreadN {0} --genomeDir {1}".format(opts.cpus,
                                                               genome_dir)

    # Step 0: build genome index (only registered when need_update says so)
    genome_index = op.join(genome_dir, "Genome")
    if need_update(reference, genome_index):
        index_cmd = "".join([
            star_base,
            " --runMode genomeGenerate",
            " --genomeFastaFiles {0}".format(reference),
        ])
        makefile.add(reference, genome_index, index_cmd)

    # Step 1: align each group of read files returned by iter_project
    for fastqs, prefix in iter_project(folder, opts.names, reads_per_sample):
        out_prefix = "{0}_star".format(prefix)
        bamfile = out_prefix + "Aligned.sortedByCoord.out.bam"

        pieces = [star_base, " --readFilesIn {0}".format(" ".join(fastqs))]
        if fastqs[0].endswith(".gz"):
            # Gzipped input is streamed through zcat.
            pieces.append(" --readFilesCommand zcat")
        pieces.append(" --outSAMtype BAM SortedByCoordinate")
        pieces.append(" --outFileNamePrefix {0}".format(out_prefix))
        pieces.append(" --twopassMode Basic")
        # Compatibility for cufflinks
        pieces.append(" --outSAMstrandField intronMotif")
        pieces.append(" --outFilterIntronMotifs RemoveNoncanonical")

        makefile.add(fastqs, bamfile, "".join(pieces))

    makefile.write()
Example #2
0
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    # NOTE: the docstring is reused as the parser's usage string; do not
    # reword it without intending to change the help output.
    optparser = OptionParser(star.__doc__)
    optparser.add_option("--single", action="store_true", default=False,
                         help="Single end mapping")
    optparser.set_fastq_names()
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        sys.exit(not optparser.print_help())

    folder, reference = args
    manager = MakeManager()
    mates = 1 if opts.single else 2

    index_dir = "GenomeDir"
    mkdir(index_dir)
    star_cmd = "STAR --runThreadN {0} --genomeDir {1}".format(opts.cpus,
                                                              index_dir)

    # Step 0: build genome index
    index_file = op.join(index_dir, "Genome")
    if need_update(reference, index_file):
        build_flags = [
            star_cmd,
            "--runMode genomeGenerate",
            "--genomeFastaFiles {0}".format(reference),
        ]
        manager.add(reference, index_file, " ".join(build_flags))

    # Step 1: align
    for readfiles, prefix in iter_project(folder, opts.names, mates):
        out_prefix = "{0}_star".format(prefix)

        flags = [star_cmd, "--readFilesIn {0}".format(" ".join(readfiles))]
        if readfiles[0].endswith(".gz"):
            flags.append("--readFilesCommand zcat")
        flags += [
            "--outSAMtype BAM SortedByCoordinate",
            "--outFileNamePrefix {0}".format(out_prefix),
            "--twopassMode Basic",
            # Compatibility for cufflinks
            "--outSAMstrandField intronMotif",
            "--outFilterIntronMotifs RemoveNoncanonical",
        ]

        # The BAM name is the out-prefix followed by the fixed suffix below.
        manager.add(readfiles,
                    out_prefix + "Aligned.sortedByCoord.out.bam",
                    " ".join(flags))

    manager.write()
Example #3
0
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # The docstring doubles as the usage text passed to OptionParser, so it is
    # kept verbatim. Each pipeline step below is registered with MakeManager
    # via mm.add(<inputs>, <outputs>, <command(s)>) and the result is emitted
    # by mm.write(); nothing is executed here apart from directory creation
    # and (in supercat mode) writing speedup.sh.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences take the "supercat" route.
    supercat = nseqs >= 1000
    if supercat:
        logging.debug(
            "Total seqs in ref: {0} (supercat={1})".format(nseqs, supercat)
        )

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        # coordsfile and supercatfile only exist in supercat mode; every later
        # use of them is guarded by the same `supercat` flag.
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".format(
            gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x))
                       for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        # From here on, all downstream steps read the converted native files.
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
                   for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            # Native files whose basename (up to the first dot) is the sample.
            flist = [x for x in allnatives
                     if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
               for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir,
                                "{0}.SNPs_Het.allele_counts".format(s))
        # Reuse `matrix` so the -m argument cannot drift from the Step 4 target.
        cmd = "count_reads_per_allele.pl -m {0}".format(matrix)
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the option is spelled "--homo" here (consistent with the
    # "H3" in the output name); a literal "**" in the option would be treated
    # as a glob by the shell. Confirm against SamplesGenotyping.pl.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
Example #4
0
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    # The docstring is passed to OptionParser as usage text and is therefore
    # left exactly as written.
    parser = OptionParser(novo2.__doc__)
    parser.set_fastq_names()
    parser.set_align(pctid=94)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir = "allele_counts"
    mkdir(clustdir)
    mkdir(acdir)

    # Shared ".P<pctid>" tag that appears in every output file name.
    tag = ".P{0}".format(pctid)

    mm = MakeManager()

    # Step 0 - clustering within sample
    clustfiles = []
    for sample in samples:
        # Read files whose basename (up to the first dot) equals the sample.
        sample_reads = [r for r in reads
                        if op.basename(r).split(".")[0] == sample]
        clust_out = op.join(clustdir, sample + tag + ".clustS")
        cluster_cmd = (
            "python -m jcvi.apps.uclust cluster --cpus=8"
            " {0} {1} --outdir={2} --pctid={3}"
        ).format(sample, " ".join(sample_reads), clustdir, pctid)
        mm.add(sample_reads, clust_out, cluster_cmd)
        clustfiles.append(clust_out)

    # Step 1 - make consensus within sample
    allcons = []
    for sample, clustfile in zip(samples, clustfiles):
        cons_out = op.join(clustdir, sample + tag + ".consensus")
        mm.add(clustfile, cons_out,
               "python -m jcvi.apps.uclust consensus {0}".format(clustfile))
        allcons.append(cons_out)

    joined_cons = " ".join(allcons)

    # Step 2 - clustering across samples
    clustSfile = pf + tag + ".clustS"
    mcluster_cmd = "python -m jcvi.apps.uclust mcluster {0} --prefix={1}".format(
        joined_cons, pf)
    mm.add(allcons, clustSfile, mcluster_cmd)

    # Step 3 - make consensus across samples
    locifile = pf + tag + ".loci"
    mconsensus_cmd = (
        "python -m jcvi.apps.uclust mconsensus {0} --prefix={1}"
    ).format(joined_cons, pf)
    mm.add(allcons + [clustSfile], locifile, mconsensus_cmd)

    mm.write()
Example #5
0
def prepare(args):
    """
    %prog prepare [--options] folder [--bam rnaseq.coordSorted.bam]

    Run Trinity on a folder of reads.  When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain the patterns
    ("_1_" and "_2_") or (".1." and ".2.") or ("_1." and "_2.").

    By default, prepare script for DN-Trinity.

    If coord-sorted BAM is provided, prepare script for GG-Trinity, using BAM
    as starting point.

    Newer versions of trinity can take multiple fastq files as input.
    If "--merge" is specified, the fastq files are merged together before assembling
    """
    # The docstring is handed to OptionParser as usage text, so it is kept
    # verbatim. This function creates "<prefix>_<DN|GG>/", optionally merges
    # the input fastq files there, and writes a "run.sh" holding the Trinity
    # invocation; it does not launch Trinity itself.
    p = OptionParser(prepare.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.add_option("--merge", default=False, action="store_true",
                 help="Merge individual input fastq's into left/right/single" + \
                      " file(s) [default: %default]")
    p.set_trinity_opts()
    p.set_fastq_names()
    p.set_grid()
    opts, args = p.parse_args(args)

    if len(args) not in (1, 2):
        sys.exit(not p.print_help())

    # Only the first positional argument is consumed here; a second one is
    # accepted by the length check above but not read in this function.
    inparam = args[0]

    paired = opts.paired
    merge = opts.merge
    trinity_home = opts.trinity_home
    hpc_grid_runner_home = opts.hpcgridrunner_home

    # Genome-guided ("GG") mode is selected only when --bam names an existing
    # file; otherwise fall back to de novo ("DN").
    method = "DN"
    bam = opts.bam
    if bam and op.exists(bam):
        # Resolve before chdir so the path stays valid inside tfolder.
        bam = op.abspath(bam)
        method = "GG"

    pf = inparam.split(".")[0]
    tfolder = "{0}_{1}".format(pf, method)

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    # Everything below runs inside tfolder; the finally clause guarantees the
    # caller's working directory is restored even if an assertion or the
    # merge step fails.
    try:
        cmds = []

        # set TRINITY_HOME env variable when preparing shell script
        env_cmd = 'export TRINITY_HOME="{0}"'.format(trinity_home)
        cmds.append(env_cmd)

        if method == "DN":
            assert op.exists("../" + inparam)

            flist = iglob("../" + inparam, opts.names)
            if paired:
                f1 = [x for x in flist
                      if "_1_" in x or ".1." in x or "_1." in x or "_R1" in x]
                f2 = [x for x in flist
                      if "_2_" in x or ".2." in x or "_2." in x or "_R2" in x]
                assert len(f1) == len(f2)
                if merge:
                    r1, r2 = "left.fastq", "right.fastq"
                    reads = ((f1, r1), (f2, r2))
            else:
                if merge:
                    r = "single.fastq"
                    reads = ((flist, r), )

            if merge:
                for fl, r in reads:
                    fm = FileMerger(fl, r)
                    fm.merge(checkexists=True)

        cmd = op.join(trinity_home, "Trinity")
        cmd += " --seqType fq --max_memory {0} --CPU {1}".format(
            opts.max_memory, opts.cpus)
        cmd += " --min_contig_length {0}".format(opts.min_contig_length)

        if opts.bflyGCThreads:
            cmd += " --bflyGCThreads {0}".format(opts.bflyGCThreads)

        if method == "GG":
            cmd += " --genome_guided_bam {0}".format(bam)
            cmd += " --genome_guided_max_intron {0}".format(opts.max_intron)
        else:
            if paired:
                if merge:
                    cmd += " --left {0} --right {1}".format(reads[0][-1],
                                                            reads[1][-1])
                else:
                    cmd += " --left {0}".format(",".join(f1))
                    cmd += " --right {0}".format(",".join(f2))
            else:
                if merge:
                    cmd += " --single {0}".format(reads[0][-1])
                else:
                    for f in flist:
                        cmd += " --single {0}".format(f)

        if opts.grid and opts.grid_conf_file:
            hpc_grid_runner = op.join(hpc_grid_runner_home,
                                      "hpc_cmds_GridRunner.pl")
            hpc_grid_conf_file = op.join(hpc_grid_runner_home, "hpc_conf",
                                         opts.grid_conf_file)
            assert op.exists(hpc_grid_conf_file), \
                "HpcGridRunner conf file does not exist: {0}".format(hpc_grid_conf_file)

            cmd += ' --grid_exec "{0} --grid_conf {1} -c"'.format(
                hpc_grid_runner, hpc_grid_conf_file)

        if opts.extra:
            cmd += " {0}".format(opts.extra)

        cmds.append(cmd)

        if opts.cleanup:
            # NOTE(review): the !(...) pattern looks like a bash extended glob;
            # presumably run.sh must be executed with extglob enabled - confirm.
            cleanup_cmd = 'rm -rf !("Trinity.fasta"|"Trinity.gene_trans_map"|"Trinity.timing")' \
                if method == "DN" else \
                'rm -rf !("Trinity-GG.fasta"|"Trinity-GG.gene_trans_map"|"Trinity.timing")'
            # Append to the list of script lines (`cmds`), not to the Trinity
            # command string (`cmd`), which has no append method.
            cmds.append(cleanup_cmd)

        runfile = "run.sh"
        write_file(runfile, "\n".join(cmds))
    finally:
        os.chdir(cwd)
Example #6
0
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run SNP calling pipeline until allele_counts are generated. This includes
    generation of native files, SNP_Het file. Speedup for fragmented genomes
    are also supported.
    """
    # The docstring doubles as the usage text passed to OptionParser, so it is
    # kept verbatim. Each pipeline step below is registered with MakeManager
    # via mm.add(<inputs>, <outputs>, <command(s)>) and the result is emitted
    # by mm.write(); nothing is executed here apart from directory creation
    # and (in supercat mode) writing speedup.sh.
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
    # References with 1000 or more sequences take the "supercat" route.
    supercat = nseqs >= 1000
    if supercat:
        logging.debug(
            "Total seqs in ref: {0} (supercat={1})".format(nseqs, supercat)
        )

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        # coordsfile and supercatfile only exist in supercat mode; every later
        # use of them is guarded by the same `supercat` flag.
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".format(
            gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x))
                       for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        # From here on, all downstream steps read the converted native files.
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
                   for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            # Native files whose basename (up to the first dot) is the sample.
            flist = [x for x in allnatives
                     if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x))
               for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir,
                                "{0}.SNPs_Het.allele_counts".format(s))
        # Reuse `matrix` so the -m argument cannot drift from the Step 4 target.
        cmd = "count_reads_per_allele.pl -m {0}".format(matrix)
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    # NOTE(review): the option is spelled "--homo" here (consistent with the
    # "H3" in the output name); a literal "**" in the option would be treated
    # as a glob by the shell. Confirm against SamplesGenotyping.pl.
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --homo 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
Example #7
0
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    # The docstring is passed to OptionParser as usage text and is therefore
    # left exactly as written.
    parser = OptionParser(novo2.__doc__)
    parser.set_fastq_names()
    parser.set_align(pctid=95)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir = "allele_counts"
    mkdir(clustdir)
    mkdir(acdir)

    # Shared ".P<pctid>" tag that appears in every output file name.
    tag = ".P{0}".format(pctid)

    mm = MakeManager()

    # Step 0 - clustering within sample
    clustfiles = []
    for sample in samples:
        # Read files whose basename (up to the first dot) equals the sample.
        sample_reads = [r for r in reads
                        if op.basename(r).split(".")[0] == sample]
        clust_out = op.join(clustdir, sample + tag + ".clustS")
        cluster_cmd = (
            "python -m jcvi.apps.uclust cluster --cpus=8"
            " {0} {1} --outdir={2} --pctid={3}"
        ).format(sample, " ".join(sample_reads), clustdir, pctid)
        mm.add(sample_reads, clust_out, cluster_cmd)
        clustfiles.append(clust_out)

    # Step 1 - make consensus within sample
    allcons = []
    for sample, clustfile in zip(samples, clustfiles):
        cons_out = op.join(clustdir, sample + tag + ".consensus")
        mm.add(clustfile, cons_out,
               "python -m jcvi.apps.uclust consensus {0}".format(clustfile))
        allcons.append(cons_out)

    joined_cons = " ".join(allcons)

    # Step 2 - clustering across samples
    clustSfile = pf + tag + ".clustS"
    mcluster_cmd = "python -m jcvi.apps.uclust mcluster {0} --prefix={1}".format(
        joined_cons, pf)
    mm.add(allcons, clustSfile, mcluster_cmd)

    # Step 3 - make consensus across samples
    locifile = pf + tag + ".loci"
    mconsensus_cmd = (
        "python -m jcvi.apps.uclust mconsensus {0} --prefix={1}"
    ).format(joined_cons, pf)
    mm.add(allcons + [clustSfile], locifile, mconsensus_cmd)

    mm.write()