Example #1
File: cnv.py Project: xuanblo/jcvi
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    csvfile, = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    header = next(open(csvfile))
    header = None if header.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    cmd += " -o {} -r hg38".format(pf)
    for i, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
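
All of these examples share one pattern: register (dependency, target, command) rules on a MakeManager, then emit a makefile. A minimal, self-contained sketch of that pattern (assuming the jcvi.apps.grid API exactly as used above; the filenames are placeholders):

from jcvi.apps.grid import MakeManager

mm = MakeManager()
# add(source, target, cmd): make reruns the rule only when target is stale
mm.add("test.csv", "lines.txt", "wc -l test.csv > lines.txt")
mm.write()  # writes the makefile; then run it, e.g. `make -j 8`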
Example #2
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contains only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        seen = set()
        for row in fp:
            r = STRLine(row)
            total += 1
            name = r.longname
            if name in seen:
                continue
            seen.add(name)
            print(r, file=newbed)
            retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
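
The --notreds branch above is a streaming, first-seen-wins dedup keyed on longname. The same idiom in isolation (a sketch; the key function stands in for STRLine.longname):

def dedup_by_key(rows, key):
    """Keep the first row per key, preserving input order."""
    seen = set()
    for r in rows:
        k = key(r)
        if k in seen:
            continue
        seen.add(k)
        yield r

print(list(dedup_by_key(["a1", "b2", "a3"], key=lambda r: r[0])))  # ['a1', 'b2']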
Example #3
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    from pyfaidx import Fasta

    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    vcffile, fastafile, chr = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chr, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chr])
    binsize = 5000000
    bins = size // binsize  # 5Mb bins
    if size % binsize:
        bins += 1
    impute_cmd = op.join(opts.impute_home, "impute2")
    chunks = []
    for x in range(bins):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chr)
        rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chr)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".\
                format(imputefile, fastafile, chr, vcffile)
    mm.add(imputefile, vcffile, cmd)
    mm.write()
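
The 5 Mb windowing is plain integer arithmetic; a self-contained check of the boundaries it produces (the chromosome length below is a placeholder):

size = 248_956_422  # placeholder chromosome length
binsize = 5_000_000
bins = size // binsize + (1 if size % binsize else 0)
chunks = [(x * binsize + 1, min((x + 1) * binsize, size)) for x in range(bins)]
print(chunks[0], chunks[-1])  # (1, 5000000) (245000001, 248956422)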
Example #4
def minimap(args):
    """
    %prog minimap ref.fasta query.fasta

    Wrap the minimap2 aligner, mapping query against reference sequences. When
    query and ref are the same, we are in "self-scan" mode (e.g. useful for
    finding internal duplications resulting from mis-assemblies).
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.formats.fasta import Fasta

    p = OptionParser(minimap.__doc__)
    p.add_option(
        "--chunks",
        type="int",
        default=2000000,
        help="Split ref.fasta into chunks of size in self-scan mode",
    )
    p.set_outdir(outdir="outdir")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    chunks = opts.chunks
    outdir = opts.outdir
    if ref != query:
        raise NotImplementedError

    # "self-scan" mode
    # build faidx (otherwise, parallel make may complain)
    sh("samtools faidx {}".format(ref))
    f = Fasta(ref)
    mkdir(outdir)
    mm = MakeManager()
    for name, size in f.itersizes():
        start = 0
        for end in range(chunks, size, chunks):
            fafile = op.join(outdir,
                             "{}_{}_{}.fa".format(name, start + 1, end))
            cmd = "samtools faidx {} {}:{}-{} -o {}".format(
                ref, name, start + 1, end, fafile)
            mm.add(ref, fafile, cmd)

            paffile = fafile.rsplit(".", 1)[0] + ".paf"
            cmd = "minimap2 -P {} {} > {}".format(fafile, fafile, paffile)
            mm.add(fafile, paffile, cmd)

            epsfile = fafile.rsplit(".", 1)[0] + ".eps"
            cmd = "minidot {} > {}".format(paffile, epsfile)
            mm.add(paffile, epsfile, cmd)
            start += chunks

    mm.write()
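
Note that range(chunks, size, chunks) yields only full windows, so a trailing partial chunk shorter than --chunks is silently skipped. A hedged variant that covers the whole sequence, should that matter:

def windows(size, chunk=2_000_000):
    """Yield 1-based inclusive (start, end) windows covering [1, size]."""
    start = 1
    while start <= size:
        yield start, min(start + chunk - 1, size)
        start += chunk

assert list(windows(5_500_000, 2_000_000))[-1] == (4_000_001, 5_500_000)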
Example #5
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contains only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq",
                 action="store_true",
                 default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        for row in fp:
            s = STRLine(row)
            total += 1
            for ns in s.iter_exact_str(genome):
                if not ns.is_valid():
                    continue
                print(ns, file=newbed)
                retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #6
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch",
                 default=31,
                 type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore",
                 default=MINSCORE,
                 type="int",
                 help="Minimum score to report")
    p.add_option("--period",
                 default=6,
                 type="int",
                 help="Maximum period to report")
    p.add_option("--telomeres",
                 default=False,
                 action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | awk '($8 <= {} && $9 >= 0)'".format(datfile, READLEN)
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
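
For reference, the positional parameters assembled into `params` follow TRF's command line, which to the best of my knowledge reads trf <file> Match Mismatch Delta PM PI Minscore MaxPeriod:

# Illustrative values only; MINSCORE is defined elsewhere in the module
mismatch, minscore, period = 31, 50, 6
params = "2 {0} {0} 80 10 {1} {2}".format(mismatch, minscore, period).split()
print(" ".join(params))  # 2 31 31 80 10 50 6  (Match=2, Mismatch=Delta=31)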
Example #7
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    flist = sorted([x.strip() for x in open(fofn)])
    h5list = []
    mm = MakeManager()
    for i, fl in enumerate(grouper(flist, 3)):
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        fw = open(fn, "w")
        print >> fw, "\n".join(fl)
        fw.close()

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff,
                                          consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
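
grouper(flist, 3) batches the file-of-file-names into three movies per pbalign job. A minimal stand-in if jcvi.utils.iter is unavailable (a sketch, not the library code):

from itertools import zip_longest

def grouper(iterable, n):
    """Chunk an iterable into lists of at most n items, e.g. 7 -> 3, 3, 1."""
    args = [iter(iterable)] * n
    return ([x for x in g if x is not None]
            for g in zip_longest(*args, fillvalue=None))

print([len(g) for g in grouper(range(7), 3)])  # [3, 3, 1]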
Example #8
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contains only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        seen = set()
        for row in fp:
            r = STRLine(row)
            total += 1
            name = r.longname
            if name in seen:
                continue
            seen.add(name)
            print(r, file=newbed)
            retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #9
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    gtf = opts.gtf
    transcripts = "transcripts.gtf"

    mm = MakeManager()
    gtfs = []
    for bam in iglob(folder, "*.bam"):
        pf = op.basename(bam).split(".")[0]
        outdir = pf + "_cufflinks"
        cmd = "cufflinks"
        cmd += " -o {0}".format(outdir)
        cmd += " -p {0}".format(cpus)
        if gtf:
            cmd += " -g {0}".format(gtf)
        cmd += " --frag-bias-correct {0}".format(reference)
        cmd += " --multi-read-correct"
        cmd += " {0}".format(bam)
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, cmd)
        gtfs.append(cgtf)

    assemblylist = "assembly_list.txt"
    cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, cmd)

    mergedgtf = "merged/merged.gtf"
    cmd = "cuffmerge"
    cmd += " -o merged"
    cmd += " -p {0}".format(cpus)
    if gtf:
        cmd += " -g {0}".format(gtf)
    cmd += " -s {0}".format(reference)
    cmd += " {0}".format(assemblylist)
    mm.add(assemblylist, mergedgtf, cmd)

    mm.write()
Example #10
File: kmer.py Project: xuanblo/jcvi
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx", default=None, type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta", default=False, action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    for p, pf in iter_project(folder, pattern=pattern,
                              n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        fw = open(infiles, "w")
        print >> fw, "\n".join(p)
        fw.close()

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(p, outfile, cmd)

    mm.write()
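
The assembled command for one paired-end sample then looks like the line below (values illustrative; flag meanings per the KMC3 documentation as I understand it):

# kmc -k21 -m64 -t8 -ci2 -cs2 @sample.ms21.infiles sample.ms21 .
#   -k  kmer length       -m  RAM cap in GB        -t  threads
#   -ci minimum count     -cs counter saturation   -cx maximum count
#   @file lists the input reads; the last two args are DB prefix and work dir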
Example #11
def lastgenome(args):
    """
    %prog genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL. The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10A.fa -fTAB > Chr10A.Chr10A.tab
    $ last-dotplot Chr10A.Chr10A.tab
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(lastgenome.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()
    bb = lambda x : op.basename(x).rsplit(".", 1)[0]
    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    tabfile = "{}.{}.tab".format(gA_pf, gB_pf)
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " -fTAB > {}".format(tabfile)
    mm.add([dbfile, gB], tabfile, lastal_cmd)

    mm.write()
Example #12
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors files. This will generate a work
    directory for each comparison. The anchors file should be in the form of
    species1.species2.anchors.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile = args[0]
    anchors = args[1:]
    workdirs = [".".join(op.basename(x).split(".")[:2]) for x in anchors]
    for wd in workdirs:
        mkdir(wd)

    mm = MakeManager()
    for wd, ac in zip(workdirs, anchors):
        pairscdsfile = wd + ".cds.fasta"
        cmd = "python -m jcvi.apps.ks prepare {} {} -o {}".\
                format(ac, cdsfile, pairscdsfile)
        mm.add((ac, cdsfile), pairscdsfile, cmd)
        ksfile = wd + ".ks"
        cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}".\
                format(pairscdsfile, ksfile, wd)
        mm.add(pairscdsfile, ksfile, cmd)
    mm.write()
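
The work-directory names are simply the first two dot-separated fields of each anchors filename, e.g.:

import os.path as op

anchors = ["grape.peach.anchors", "grape.cacao.anchors"]  # made-up names
workdirs = [".".join(op.basename(x).split(".")[:2]) for x in anchors]
print(workdirs)  # ['grape.peach', 'grape.cacao']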
Example #13
def lastgenomeuniq(args):
    """
    %prog genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL and LAST-SPLIT. The recipe is based on
    tutorial here:

    <https://github.com/mcfrith/last-genome-alignments>

    The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10B.fa | last-split -m1 | maf-swap | last-split -m1 -fMAF > Chr10A.Chr10B.1-1.maf
    $ maf-convert -n blasttab Chr10A.Chr10B.1-1.maf > Chr10A.Chr10B.1-1.blast

    Works with LAST v959.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(lastgenomeuniq.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()
    bb = lambda x : op.basename(x).rsplit(".", 1)[0]
    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    maffile = "{}.{}.1-1.maf".format(gA_pf, gB_pf)
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " | last-split -m1"
    lastal_cmd += " | maf-swap"
    lastal_cmd += " | last-split -m1 -fMAF > {}".format(maffile)
    mm.add([dbfile, gB], maffile, lastal_cmd)

    # Convert to BLAST format
    blastfile = maffile.replace(".maf", ".blast")
    convert_cmd = "maf-convert -n blasttab {} > {}".format(maffile, blastfile)
    mm.add(maffile, blastfile, convert_cmd)

    mm.write()
Example #14
File: str.py Project: Hensonmw/jcvi
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contains only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        newbed = open(newbedfile, "w")
        fp = open(trfbed)
        retained = total = 0
        for row in fp:
            s = STRLine(row)
            total += 1
            for ns in s.iter_exact_str(genome):
                if not ns.is_valid():
                    continue
                print(ns, file=newbed)
                retained += 1
        newbed.close()
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #15
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    flist = sorted([x.strip() for x in open(fofn)])
    h5list = []
    mm = MakeManager()
    for i, fl in enumerate(grouper(flist, 3)):
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        fw = open(fn, "w")
        print >> fw, "\n".join(fl)
        fw.close()

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
Example #16
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    gtf = opts.gtf
    transcripts = "transcripts.gtf"

    mm = MakeManager()
    gtfs = []
    for bam in iglob(folder, "*.bam"):
        pf = op.basename(bam).split(".")[0]
        outdir = pf + "_cufflinks"
        cmd = "cufflinks"
        cmd += " -o {0}".format(outdir)
        cmd += " -p {0}".format(cpus)
        if gtf:
            cmd += " -g {0}".format(gtf)
        cmd += " --frag-bias-correct {0}".format(reference)
        cmd += " --multi-read-correct"
        cmd += " {0}".format(bam)
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, cmd)
        gtfs.append(cgtf)

    assemblylist = "assembly_list.txt"
    cmd = 'find . -name "{0}" > {1}'.format(transcripts, assemblylist)
    mm.add(gtfs, assemblylist, cmd)

    mergedgtf = "merged/merged.gtf"
    cmd = "cuffmerge"
    cmd += " -o merged"
    cmd += " -p {0}".format(cpus)
    if gtf:
        cmd += " -g {0}".format(gtf)
    cmd += " -s {0}".format(reference)
    cmd += " {0}".format(assemblylist)
    mm.add(assemblylist, mergedgtf, cmd)

    mm.write()
Example #17
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch", default=31, type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore", default=MINSCORE, type="int",
                 help="Minimum score to report")
    p.add_option("--period", default=6, type="int",
                 help="Maximum period to report")
    p.add_option("--minlength", default=MINSCORE / 2, type="int",
                 help="Minimum length of repeat tract")
    p.add_option("--telomeres", default=False, action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minlength
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).split(".")[0]
        cmd1 = "trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | awk '($8 >= {} && $8 <= {})'".\
                    format(datfile, minlength, READLEN - minlength)
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Example #18
File: ks.py Project: ascendo/jcvi
def batch(args):
    """
    %prog batch all.cds *.anchors

    Compute Ks values for a set of anchors files. This will generate a work
    directory for each comparison. The anchors file should be in the form of
    species1.species2.anchors.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(batch.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    cdsfile = args[0]
    anchors = args[1:]
    workdirs = [".".join(op.basename(x).split(".")[:2]) for x in anchors]
    for wd in workdirs:
        mkdir(wd)

    mm = MakeManager()
    for wd, ac in zip(workdirs, anchors):
        pairscdsfile = wd + ".cds.fasta"
        cmd = "python -m jcvi.apps.ks prepare {} {} -o {}".\
                format(ac, cdsfile, pairscdsfile)
        mm.add((ac, cdsfile), pairscdsfile, cmd)
        ksfile = wd + ".ks"
        cmd = "python -m jcvi.apps.ks calc {} -o {} --workdir {}".\
                format(pairscdsfile, ksfile, wd)
        mm.add(pairscdsfile, ksfile, cmd)
    mm.write()
Example #19
def lastgenome(args):
    """
    %prog genome_A.fasta genome_B.fasta

    Run LAST by calling LASTDB, LASTAL and LAST-SPLIT. The recipe is based on
    tutorial here:

    <https://github.com/mcfrith/last-genome-alignments>

    The script runs the following steps:
    $ lastdb -P0 -uNEAR -R01 Chr10A-NEAR Chr10A.fa
    $ lastal -E0.05 -C2 Chr10A-NEAR Chr10B.fa | last-split -m1 | maf-swap | last-split -m1 -fMAF > Chr10A.Chr10B.1-1.maf
    $ maf-convert -n blasttab Chr10A.Chr10B.1-1.maf > Chr10A.Chr10B.1-1.blast

    Works with LAST v959.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(lastgenome.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gA, gB = args
    mm = MakeManager()
    bb = lambda x : op.basename(x).rsplit(".", 1)[0]
    gA_pf, gB_pf = bb(gA), bb(gB)

    # Build LASTDB
    dbname = "-".join((gA_pf, "NEAR"))
    dbfile = dbname + ".suf"
    build_db_cmd = "lastdb -P0 -uNEAR -R01 {} {}".format(dbname, gA)
    mm.add(gA, dbfile, build_db_cmd)

    # Run LASTAL
    maffile = "{}.{}.1-1.maf".format(gA_pf, gB_pf)
    lastal_cmd = "lastal -E0.05 -C2 {} {}".format(dbname, gB)
    lastal_cmd += " | last-split -m1"
    lastal_cmd += " | maf-swap"
    lastal_cmd += " | last-split -m1 -fMAF > {}".format(maffile)
    mm.add([dbfile, gB], maffile, lastal_cmd)

    # Convert to BLAST format
    blastfile = maffile.replace(".maf", ".blast")
    convert_cmd = "maf-convert -n blasttab {} > {}".format(maffile, blastfile)
    mm.add(maffile, blastfile, convert_cmd)

    mm.write()
Example #20
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    fp = open(lastfile)
    filteredlastfile = lastfile + ".blast"
    fw = open(filteredlastfile, "w")
    for row in fp:
        b = BlastLine(row)
        if b.query == b.subject:
            continue
        print >> fw, "\t".join((b.query, b.subject, str(b.score)))
    fw.close()

    bedfiles = args[1:]
    fp = open(lastfile)
    b = BlastLine(next(fp))
    subject = b.subject
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    db += ".txt"
    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'\
                .format(txtfile, db, filteredlastfile, outfile)
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
Example #21
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    num = 1 if opts.single else 2
    folder, reference = args
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        cmd = STAR + " --runMode genomeGenerate"
        cmd += " --genomeFastaFiles {0}".format(reference)
        mm.add(reference, genomeidx, cmd)

    # Step 1: align
    for p, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        cmd = STAR + " --readFilesIn {0}".format(" ".join(p))
        if p[0].endswith(".gz"):
            cmd += " --readFilesCommand zcat"
        cmd += " --outSAMtype BAM SortedByCoordinate"
        cmd += " --outFileNamePrefix {0}".format(pf)
        cmd += " --twopassMode Basic"
        # Compatibility for cufflinks
        cmd += " --outSAMstrandField intronMotif"
        cmd += " --outFilterIntronMotifs RemoveNoncanonical"
        mm.add(p, bamfile, cmd)

    mm.write()
Example #22
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-l 100 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    nrefs = nqueries = opts.chunks or int(cpus ** .5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        print(cmd)

    mm.write()
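
Defaulting the chunk count to sqrt(cpus) makes the ref x query grid work out to roughly one nucmer job per CPU:

cpus = 16
n = int(cpus ** 0.5)  # chunks per side
print(n, n * n)       # 4 chunks per side -> 16 pairwise jobs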
Example #23
def batchccn(args):
    """
    %prog batchccn test.csv

    Run CCN script in batch. Write makefile.
    """
    p = OptionParser(batchccn.__doc__)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (csvfile, ) = args
    mm = MakeManager()
    pf = op.basename(csvfile).split(".")[0]
    mkdir(pf)

    header = next(open(csvfile))
    header = None if header.strip().endswith(".bam") else "infer"
    logging.debug("Header={}".format(header))
    df = pd.read_csv(csvfile, header=header)
    cmd = "perl /mnt/software/ccn_gcn_hg38_script/ccn_gcn_hg38.pl"
    cmd += " -n {} -b {}"
    cmd += " -o {} -r hg38".format(pf)
    for i, (sample_key, bam) in df.iterrows():
        cmdi = cmd.format(sample_key, bam)
        outfile = "{}/{}/{}.ccn".format(pf, sample_key, sample_key)
        mm.add(csvfile, outfile, cmdi)
    mm.write()
Example #24
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    vcffile, chr = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chr)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chr)
    cmd += " out={0}".format(outpf)
    cmd += " nthreads=16 gprobs=true"
    mm.add(vcffile, outfile, cmd)

    mm.write()
Example #25
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    s3dir = None
    if outdir.startswith("s3://"):
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    for p, pf in iter_project(proj_dir):
        targs = [ref_fasta] + p
        cmd1, bamfile = mem(targs, opts)
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            cmd = "aws s3 cp {} {} --sse".format(nbamfile,
                                              op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(p, nbamfile, cmds)

    mm.write()
Example #26
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci",
                 default=2,
                 type="int",
                 help="Exclude kmers with less than ci counts")
    p.add_option("--cs",
                 default=2,
                 type="int",
                 help="Maximal value of a counter")
    p.add_option("--cx",
                 default=None,
                 type="int",
                 help="Exclude kmers with more than cx counts")
    p.add_option("--single",
                 default=False,
                 action="store_true",
                 help="Input is single-end data, only one FASTQ/FASTA")
    p.add_option("--fasta",
                 default=False,
                 action="store_true",
                 help="Input is FASTA instead of FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    n = 1 if opts.single else 2
    pattern = "*.fa,*.fa.gz,*.fasta,*.fasta.gz" if opts.fasta else \
              "*.fq,*.fq.gz,*.fastq,*.fastq.gz"

    mm = MakeManager()
    for p, pf in iter_project(folder, pattern=pattern, n=n,
                              commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        fw = open(infiles, "w")
        print("\n".join(p), file=fw)
        fw.close()

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        if opts.cx:
            cmd += " -cx{}".format(opts.cx)
        if opts.fasta:
            cmd += " -fm"
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(p, outfile, cmd)

    mm.write()
Example #27
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.add_option("-c",
                 default=2,
                 type="int",
                 help="Maximal value of a counter")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    mm = MakeManager()
    for p, pf in iter_project(folder):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        fw = open(infiles, "w")
        print >> fw, "\n".join(p)
        fw.close()

        cmd = "kmc -k{} -m64 -t{} -cs{}".format(K, opts.cpus, opts.c)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(p, outfile, cmd)

    mm.write()
Example #28
File: kmer.py Project: xuanblo/jcvi
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    cpus = opts.cpus
    mm = MakeManager()
    for p, pf in iter_project(folder):
        cmds = []
        mss = []
        for i, ip in enumerate(p):
            ms = "{}{}.ms{}".format(pf, i + 1, K)
            mss.append(ms)
            cmd = "meryl -B -C -m {} -threads {}".format(K, cpus)
            cmd += " -s {} -o {}".format(ip, ms)
            cmds.append(cmd)
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        cmd = "meryl -M add -s {} -s {} -o {}".format(ams, bms, pms)
        cmds.append(cmd)
        cmd = "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".\
                    format(ams, ams, bms, bms)
        cmds.append(cmd)
        mm.add(p, pms + ".mcdat", cmds)

    mm.write()
Example #29
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.set_sep(sep="_", help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(list(files))
        nfiles = len(files)
        source = " ".join(files)
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmd = "samtools merge -@ 8 {0} {1}".format(target, source)
            mm.add(files, target, cmd, remove=True)
    mm.write()
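
Grouping by prefix works only because the list is sorted with the same key later handed to groupby; a minimal illustration (filenames are made up):

import os.path as op
from itertools import groupby

bams = ["s1_L1.bam", "s2_L1.bam", "s1_L2.bam"]
key = lambda x: op.basename(x).split("_")[0]
bams.sort(key=key)  # groupby only merges adjacent runs
print({k: list(v) for k, v in groupby(bams, key=key)})
# {'s1': ['s1_L1.bam', 's1_L2.bam'], 's2': ['s2_L1.bam']}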
Example #30
def star(args):
    """
    %prog star folder reference

    Run star on a folder with reads.
    """
    p = OptionParser(star.__doc__)
    p.add_option("--single", default=False, action="store_true",
                 help="Single end mapping")
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus = opts.cpus
    mm = MakeManager()

    num = 1 if opts.single else 2
    folder, reference = args
    gd = "GenomeDir"
    mkdir(gd)
    STAR = "STAR --runThreadN {0} --genomeDir {1}".format(cpus, gd)

    # Step 0: build genome index
    genomeidx = op.join(gd, "Genome")
    if need_update(reference, genomeidx):
        cmd = STAR + " --runMode genomeGenerate"
        cmd += " --genomeFastaFiles {0}".format(reference)
        mm.add(reference, genomeidx, cmd)

    # Step 1: align
    for p, prefix in iter_project(folder, opts.names, num):
        pf = "{0}_star".format(prefix)
        bamfile = pf + "Aligned.sortedByCoord.out.bam"
        cmd = STAR + " --readFilesIn {0}".format(" ".join(p))
        if p[0].endswith(".gz"):
            cmd += " --readFilesCommand zcat"
        cmd += " --outSAMtype BAM SortedByCoordinate"
        cmd += " --outFileNamePrefix {0}".format(pf)
        cmd += " --twopassMode Basic"
        # Compatibility for cufflinks
        cmd += " --outSAMstrandField intronMotif"
        cmd += " --outFilterIntronMotifs RemoveNoncanonical"
        mm.add(p, bamfile, cmd)

    mm.write()
Example #31
File: sam.py Project: rrane/jcvi
def merge(args):
    """
    %prog merge merged_bams bams1_dir bams2_dir ...

    Merge BAM files. Treat the bams with the same prefix as a set.
    Output the commands first.
    """
    from jcvi.apps.softlink import get_abs_path
    from jcvi.apps.grid import MakeManager

    p = OptionParser(merge.__doc__)
    p.add_option("--sep", default="_",
                 help="Separator to group per prefix")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    merged_bams = args[0]
    bamdirs = args[1:]

    mkdir(merged_bams)
    bams = []
    for x in bamdirs:
        bams += glob(op.join(x, "*.bam"))
    bams = [x for x in bams if "nsorted" not in x]

    logging.debug("Found a total of {0} BAM files.".format(len(bams)))

    sep = opts.sep
    key = lambda x: op.basename(x).split(sep)[0]
    bams.sort(key=key)
    mm = MakeManager()
    for prefix, files in groupby(bams, key=key):
        files = sorted(list(files))
        nfiles = len(files)
        source = " ".join(files)
        target = op.join(merged_bams, op.basename(files[0]))
        if nfiles == 1:
            source = get_abs_path(source)
            cmd = "ln -s {0} {1}".format(source, target)
            mm.add("", target, cmd)
        else:
            cmds = []
            cmds.append("rm {0}".format(target))
            cmds.append("samtools merge {0} {1}".format(target, source))
            mm.add(files, target, cmds)
    mm.write()
Example #32
def cyntenator(args):
    """
    %prog cyntenator athaliana.athaliana.last athaliana.bed

    Prepare input for Cyntenator.
    """
    p = OptionParser(cyntenator.__doc__)
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    lastfile = args[0]
    fp = open(lastfile)
    filteredlastfile = lastfile + ".blast"
    fw = open(filteredlastfile, "w")
    for row in fp:
        b = BlastLine(row)
        if b.query == b.subject:
            continue
        print("\t".join((b.query, b.subject, str(b.score))), file=fw)
    fw.close()

    bedfiles = args[1:]
    fp = open(lastfile)
    b = BlastLine(next(fp))
    subject = b.subject
    txtfiles = []
    for bedfile in bedfiles:
        order = Bed(bedfile).order
        if subject in order:
            db = op.basename(bedfile).split(".")[0][:20]
            logging.debug("Found db: {0}".format(db))
        txtfile = write_txt(bedfile)
        txtfiles.append(txtfile)

    db += ".txt"
    mm = MakeManager()
    for txtfile in txtfiles:
        outfile = txtfile + ".alignment"
        cmd = 'cyntenator -t "({0} {1})" -h blast {2} > {3}'\
                .format(txtfile, db, filteredlastfile, outfile)
        mm.add((txtfile, db, filteredlastfile), outfile, cmd)
    mm.write()
Example #33
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks",
                 type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    nrefs = nqueries = opts.chunks or int(cpus**.5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nrefs)]).names
    querylist = split([query, querydir, str(nqueries)]).names

    mm = MakeManager()
    for i, (r, q) in enumerate(product(reflist, querylist)):
        pf = "{0:04d}".format(i)
        cmd = "nucmer -maxmatch"
        cmd += " {0}".format(opts.extra)
        cmd += " {0} {1} -p {2}".format(r, q, pf)
        deltafile = pf + ".delta"
        mm.add((r, q), deltafile, cmd)
        print(cmd)

    mm.write()
Example #34
def batch(args):
    """
    %prog batch database.fasta project_dir output_dir

    Run bwa in batch mode.
    """
    p = OptionParser(batch.__doc__)
    set_align_options(p)
    p.set_sam_options()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref_fasta, proj_dir, outdir = args
    outdir = outdir.rstrip("/")
    s3dir = None
    if outdir.startswith("s3://"):
        s3dir = outdir
        outdir = op.basename(outdir)
        mkdir(outdir)

    mm = MakeManager()
    for p, pf in iter_project(proj_dir):
        targs = [ref_fasta] + p
        cmd1, bamfile = mem(targs, opts)
        if cmd1:
            cmd1 = output_bam(cmd1, bamfile)
        nbamfile = op.join(outdir, bamfile)
        cmd2 = "mv {} {}".format(bamfile, nbamfile)
        cmds = [cmd1, cmd2]

        if s3dir:
            cmd = "aws s3 cp {} {} --sse".format(nbamfile,
                                                 op.join(s3dir, bamfile))
            cmds.append(cmd)

        mm.add(p, nbamfile, cmds)

    mm.write()
Example #35
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    p = OptionParser(kmc.__doc__)
    p.add_option("-k", default=21, type="int", help="Kmer size")
    p.add_option("--ci", default=2, type="int",
                 help="Minimum value of a counter")
    p.add_option("--cs", default=2, type="int",
                 help="Maximal value of a counter")
    p.add_option("--single", default=False, action="store_true",
                 help="Input is single-end data, only one FASTQ")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    K = opts.k
    n = 1 if opts.single else 2
    mm = MakeManager()
    for p, pf in iter_project(folder, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        fw = open(infiles, "w")
        print >> fw, "\n".join(p)
        fw.close()

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(p, outfile, cmd)

    mm.write()
Example #36
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    p = OptionParser(meryl.__doc__)
    p.add_option("-k", default=19, type="int", help="Kmer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (folder, ) = args
    K = opts.k
    cpus = opts.cpus
    mm = MakeManager()
    for p, pf in iter_project(folder):
        cmds = []
        mss = []
        for i, ip in enumerate(p):
            ms = "{}{}.ms{}".format(pf, i + 1, K)
            mss.append(ms)
            cmd = "meryl -B -C -m {} -threads {}".format(K, cpus)
            cmd += " -s {} -o {}".format(ip, ms)
            cmds.append(cmd)
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        cmd = "meryl -M add -s {} -s {} -o {}".format(ams, bms, pms)
        cmds.append(cmd)
        cmd = "rm -f {}.mcdat {}.mcidx {}.mcdat {}.mcidx".format(
            ams, ams, bms, bms)
        cmds.append(cmd)
        mm.add(p, pms + ".mcdat", cmds)

    mm.write()
Example #37
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be an S3 location and --lobstr_home can be an S3
    location (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid",
                 default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation",
                 default=False,
                 action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile,
                                          "chr4",
                                          "/mnt/software/lobSTR/",
                                          "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile,
                                             chr,
                                             lhome,
                                             lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Example #38
File: snp.py Project: Hensonmw/jcvi
def gatk(args):
    """
    %prog gatk bamfile reference.fasta

    Call SNPs based on GATK best practices.
    """
    p = OptionParser(gatk.__doc__)
    p.add_option("--indelrealign", default=False, action="store_true",
                 help="Perform indel realignment")
    p.set_home("gatk")
    p.set_home("picard")
    p.set_phred()
    p.set_cpus(cpus=24)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, ref = args
    pf = bamfile.rsplit(".", 1)[0]
    mm = MakeManager()
    picard = "java -Xmx32g -jar {0}/picard.jar".format(opts.picard_home)
    tk = "java -Xmx32g -jar {0}/GenomeAnalysisTK.jar".format(opts.gatk_home)
    tk += " -R {0}".format(ref)

    # Step 0 - build reference
    dictfile = ref.rsplit(".", 1)[0] + ".dict"
    cmd1 = picard + " CreateSequenceDictionary"
    cmd1 += " R={0} O={1}".format(ref, dictfile)
    cmd2 = "samtools faidx {0}".format(ref)
    mm.add(ref, dictfile, (cmd1, cmd2))

    # Step 1 - sort bam
    sortedbamfile = pf + ".sorted.bam"
    cmd = picard + " SortSam"
    cmd += " INPUT={0} OUTPUT={1}".format(bamfile, sortedbamfile)
    cmd += " SORT_ORDER=coordinate CREATE_INDEX=true"
    mm.add(bamfile, sortedbamfile, cmd)

    # Step 2 - mark duplicates
    dedupbamfile = pf + ".dedup.bam"
    cmd = picard + " MarkDuplicates"
    cmd += " INPUT={0} OUTPUT={1}".format(sortedbamfile, dedupbamfile)
    cmd += " METRICS_FILE=dedup.log CREATE_INDEX=true"
    mm.add(sortedbamfile, dedupbamfile, cmd)

    if opts.indelrealign:
        # Step 3 - create indel realignment targets
        intervals = pf + ".intervals"
        cmd = tk + " -T RealignerTargetCreator"
        cmd += " -I {0} -o {1}".format(dedupbamfile, intervals)
        mm.add(dedupbamfile, intervals, cmd)

        # Step 4 - indel realignment
        realignedbamfile = pf + ".realigned.bam"
        cmd = tk + " -T IndelRealigner"
        cmd += " -targetIntervals {0}".format(intervals)
        cmd += " -I {0} -o {1}".format(dedupbamfile, realignedbamfile)
        mm.add((dictfile, intervals), realignedbamfile, cmd)
    else:
        realignedbamfile = dedupbamfile

    # Step 5 - SNP calling
    vcf = pf + ".vcf"
    cmd = tk + " -T HaplotypeCaller"
    cmd += " -I {0}".format(realignedbamfile)
    cmd += " --genotyping_mode DISCOVERY"
    cmd += " -stand_emit_conf 10 -stand_call_conf 30"
    cmd += " -nct {0}".format(opts.cpus)
    cmd += " -o {0}".format(vcf)
    if opts.phred == "64":
        cmd += " --fix_misencoded_quality_scores"
    mm.add(realignedbamfile, vcf, cmd)

    # Step 6 - SNP filtering
    filtered_vcf = pf + ".filtered.vcf"
    cmd = tk + " -T VariantFiltration"
    cmd += " -V {0}".format(vcf)
    cmd += ' --filterExpression "DP < 10 || DP > 300 || QD < 2.0 || FS > 60.0 || MQ < 40.0"'
    cmd += ' --filterName "LOWQUAL"'
    cmd += ' --genotypeFilterExpression "isHomVar == 1"'
    cmd += ' --genotypeFilterName "HOMOVAR"'
    cmd += ' --genotypeFilterExpression "isHet == 1"'
    cmd += ' --genotypeFilterName "HET"'
    cmd += " -o {0}".format(filtered_vcf)
    mm.add(vcf, filtered_vcf, cmd)

    mm.write()
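A hypothetical invocation of this pipeline (the jcvi.variation.snp module
path is an assumption based on the file name above; adjust to your checkout):

    python -m jcvi.variation.snp gatk sample.bam reference.fasta \
        --gatk_home /opt/gatk --picard_home /opt/picard
    make -j 24  # run the makefile emitted by mm.write()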
Example #39
def snpflow(args):
    """
    %prog snpflow trimmed reference.fasta

    Run the SNP calling pipeline until allele_counts are generated. This
    includes generation of the native files and the SNP_Het file. A speedup
    for fragmented genomes is also supported.
    """
    p = OptionParser(snpflow.__doc__)
    p.set_fastq_names()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, ref = args
    nseqs = len(Fasta(ref))
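    # Highly fragmented references (>= 1000 sequences) are concatenated into
    # a "supercat" pseudo-molecule to speed up indexing and alignment.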
    supercat = nseqs >= 1000
    if supercat:
        logging.debug("Total seqs in ref: {0} (supercat={1})".\
                      format(nseqs, supercat))

    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    nativedir, countsdir = "native", "allele_counts"
    for d in (nativedir, countsdir):
        mkdir(d)

    mm = MakeManager()
    # Step 0 - index database
    db = op.join(*check_index(ref, supercat=supercat, go=False))
    cmd = "python -m jcvi.apps.gmap index {0}".format(ref)
    if supercat:
        cmd += " --supercat"
        coordsfile = db + ".coords"
        supercatfile = ref.rsplit(".", 1)[0] + ".supercat.fasta"
        mm.add(ref, (db, coordsfile), cmd)
    else:
        mm.add(ref, db, cmd)

    # Step 1 - GSNAP alignment and conversion to native file
    allnatives = []
    allsamstats = []
    gmapdb = supercatfile if supercat else ref
    for f in reads:
        prefix = get_prefix(f, ref)
        gsnapfile = op.join(nativedir, prefix + ".gsnap")
        nativefile = op.join(nativedir, prefix + ".unique.native")
        samstatsfile = op.join(nativedir, prefix + ".unique.sam.stats")
        cmd = "python -m jcvi.apps.gmap align {0} {1}".format(gmapdb, f)
        cmd += " --outdir={0} --native --cpus=1".format(nativedir)
        mm.add((f, db), nativefile, cmd)

        cmd = "python -m jcvi.apps.gmap bam {0} {1} --cpus=1".\
                format(gsnapfile, gmapdb)
        mm.add(nativefile, samstatsfile, cmd)
        allnatives.append(nativefile)
        allsamstats.append(samstatsfile)

    # Step 2 - call SNP discovery
    if supercat:
        nativeconverted = nativedir + "-converted"
        mkdir(nativeconverted)
        allnativesc = [op.join(nativeconverted, op.basename(x)) for x in allnatives]
        cmd = "tGBS-Convert_Pseudo_Genome_NATIVE_Coordinates.pl"
        cmd += " -i {0}/*.native -o {1}".format(nativedir, nativeconverted)
        cmd += " -c {0}".format(coordsfile)
        cmds = ["rm -rf {0}".format(nativeconverted), cmd]
        mm.add(allnatives + [coordsfile], allnativesc, cmds)

        runfile = "speedup.sh"
        write_file(runfile, speedupsh.format(nativeconverted, opts.cpus))
        nativedir = nativeconverted
        allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
        mm.add(allnativesc, allsnps, "./{0}".format(runfile))
    else:
        for s in samples:
            snpfile = op.join(nativedir, "{0}.SNPs_Het.txt".format(s))
            cmd = "SNP_Discovery-short.pl"
            cmd += " -native {0}/{1}.*unique.native".format(nativedir, s)
            cmd += " -o {0} -a 2 -ac 0.3 -c 0.8".format(snpfile)
            flist = [x for x in allnatives if op.basename(x).split(".")[0] == s]
            mm.add(flist, snpfile, cmd)

    # Step 3 - generate equal file
    allsnps = [op.join(nativedir, "{0}.SNPs_Het.txt".format(x)) for x in samples]
    for s in samples:
        equalfile = op.join(nativedir, "{0}.equal".format(s))
        cmd = "extract_reference_alleles.pl"
        cmd += " --native {0}/{1}.*unique.native".format(nativedir, s)
        cmd += " --genotype {0}/{1}.SNPs_Het.txt".format(nativedir, s)
        cmd += " --allgenotypes {0}/*.SNPs_Het.txt".format(nativedir)
        cmd += " --fasta {0} --output {1}".format(ref, equalfile)
        mm.add(allsnps, equalfile, cmd)

    # Step 4 - generate snp matrix
    allequals = [op.join(nativedir, "{0}.equal".format(x)) for x in samples]
    matrix = "snps.matrix.txt"
    cmd = "generate_matrix.pl"
    cmd += " --tables {0}/*SNPs_Het.txt --equal {0}/*equal".format(nativedir)
    cmd += " --fasta {0} --output {1}".format(ref, matrix)
    mm.add(allsnps + allequals, matrix, cmd)

    # Step 5 - generate allele counts
    allcounts = []
    for s in samples:
        allele_counts = op.join(countsdir, "{0}.SNPs_Het.allele_counts".format(s))
        cmd = "count_reads_per_allele.pl -m snps.matrix.txt"
        cmd += " -s {0} --native {1}/{0}.*unique.native".format(s, nativedir)
        cmd += " -o {0}".format(allele_counts)
        mm.add(matrix, allele_counts, cmd)
        allcounts.append(allele_counts)

    # Step 6 - generate raw snps
    rawsnps = "Genotyping.H3.txt"
    cmd = "/home/shared/scripts/delin/SamplesGenotyping.pl --h**o 3"
    cmd += " -pf allele_counts -f {0} --outfile {1}".format(countsdir, rawsnps)
    cmds = ["rm -f {0}".format(rawsnps), cmd]
    mm.add(allcounts, rawsnps, cmds)

    # Step 7 - generate alignment report
    sam_summary = "sam.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -f {0} -o {1}".format(" ".join(allsamstats), sam_summary)
    mm.add(allsamstats, sam_summary, cmd)

    native_summary = "native.summary"
    cmd = "/home/shared/scripts/eddyyeh/alignment_stats.pl"
    cmd += " -n {0} -o {1}".format(" ".join(allnatives), native_summary)
    mm.add(allnatives, native_summary, cmd)

    mm.write()
Example #40
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=95)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)
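    # Pipeline overview: cluster reads within each sample, build per-sample
    # consensus, then cluster and call consensus across samples into loci.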

    # Set up directory structure
    clustdir = "uclust"
    acdir ="allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()
Example #41
def trf(args):
    """
    %prog trf outdir

    Run TRF on FASTA files.
    """
    from jcvi.apps.base import iglob
    cparams = "1 1 2 80 5 200 2000"

    p = OptionParser(trf.__doc__)
    p.add_option("--mismatch",
                 default=31,
                 type="int",
                 help="Mismatch and gap penalty")
    p.add_option("--minscore",
                 default=MINSCORE,
                 type="int",
                 help="Minimum score to report")
    p.add_option("--period",
                 default=6,
                 type="int",
                 help="Maximum period to report")
    p.add_option("--lobstr",
                 default=False,
                 action="store_true",
                 help="Generate output for lobSTR")
    p.add_option("--telomeres",
                 default=False,
                 action="store_true",
                 help="Run telomere search: minscore=140 period=7")
    p.add_option("--centromeres",
                 default=False,
                 action="store_true",
                 help="Run centromere search: {}".format(cparams))
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    outdir, = args
    minlength = opts.minscore / 2
    mm = MakeManager()
    if opts.telomeres:
        opts.minscore, opts.period = 140, 7

    params = "2 {0} {0} 80 10 {1} {2}".\
            format(opts.mismatch, opts.minscore, opts.period).split()
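    # Positional TRF parameters: match mismatch delta PM PI minscore maxperiod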
    if opts.centromeres:
        params = cparams.split()

    bedfiles = []
    for fastafile in natsorted(iglob(outdir, "*.fa,*.fasta")):
        pf = op.basename(fastafile).rsplit(".", 1)[0]
        # The leading "-" tells make to ignore errors from the trf command
        cmd1 = "-trf {0} {1} -d -h".format(fastafile, " ".join(params))
        datfile = op.basename(fastafile) + "." + ".".join(params) + ".dat"
        bedfile = "{0}.trf.bed".format(pf)
        cmd2 = "cat {} | grep -v ^Parameters".format(datfile)
        if opts.lobstr:
            cmd2 += " | awk '($8 >= {} && $8 <= {})'".\
                    format(minlength, READLEN - minlength)
        else:
            cmd2 += " | awk '($8 >= 0)'"
        cmd2 += " | sed 's/ /\\t/g'"
        cmd2 += " | awk '{{print \"{0}\\t\" $0}}' > {1}".format(pf, bedfile)
        mm.add(fastafile, datfile, cmd1)
        mm.add(datfile, bedfile, cmd2)
        bedfiles.append(bedfile)

    bedfile = "trf.bed"
    cmd = "cat {0} > {1}".format(" ".join(natsorted(bedfiles)), bedfile)
    mm.add(bedfiles, bedfile, cmd)

    mm.write()
Example #42
def minimac(args):
    """
    %prog batchminimac input.txt

    Use MINIMAC3 to impute vcf on all chromosomes.
    """
    p = OptionParser(minimac.__doc__)
    p.set_home("shapeit")
    p.set_home("minimac")
    p.set_outfile()
    p.set_chr()
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    txtfile, = args
    ref = opts.ref
    mm = MakeManager()
    pf = txtfile.split(".")[0]
    allrawvcf = []
    alloutvcf = []
    chrs = opts.chr.split(",")
    for x in chrs:
        px = CM[x]
        chrvcf = pf + ".{0}.vcf".format(px)
        if txtfile.endswith(".vcf"):
            cmd = "vcftools --vcf {0} --chr {1}".format(txtfile, x)
            cmd += " --out {0}.{1} --recode".format(pf, px)
            cmd += " && mv {0}.{1}.recode.vcf {2}".format(pf, px, chrvcf)
        else:  # 23andme
            cmd = "python -m jcvi.formats.vcf from23andme {0} {1}".format(txtfile, x)
            cmd += " --ref {0}".format(ref)
        mm.add(txtfile, chrvcf, cmd)

        chrvcf_hg38 = pf + ".{0}.23andme.hg38.vcf".format(px)
        minimac_liftover(mm, chrvcf, chrvcf_hg38, opts)
        allrawvcf.append(chrvcf_hg38)

        minimacvcf = "{0}.{1}.minimac.dose.vcf".format(pf, px)
        if x == "X":
            minimac_X(mm, x, chrvcf, opts)
        elif x in ["Y", "MT"]:
            cmd = "python -m jcvi.variation.impute passthrough"
            cmd += " {0} {1}".format(chrvcf, minimacvcf)
            mm.add(chrvcf, minimacvcf, cmd)
        else:
            minimac_autosome(mm, x, chrvcf, opts)

        # keep the best line for multi-allelic markers
        uniqvcf= "{0}.{1}.minimac.uniq.vcf".format(pf, px)
        cmd = "python -m jcvi.formats.vcf uniq {0} > {1}".\
                    format(minimacvcf, uniqvcf)
        mm.add(minimacvcf, uniqvcf, cmd)

        minimacvcf_hg38 = "{0}.{1}.minimac.hg38.vcf".format(pf, px)
        minimac_liftover(mm, uniqvcf, minimacvcf_hg38, opts)
        alloutvcf.append(minimacvcf_hg38)

    if len(allrawvcf) > 1:
        rawhg38vcfgz = pf + ".all.23andme.hg38.vcf.gz"
        cmd = "vcf-concat {0} | bgzip > {1}".format(" ".join(allrawvcf), rawhg38vcfgz)
        mm.add(allrawvcf, rawhg38vcfgz, cmd)

    if len(alloutvcf) > 1:
        outhg38vcfgz = pf + ".all.minimac.hg38.vcf.gz"
        cmd = "vcf-concat {0} | bgzip > {1}".format(" ".join(alloutvcf), outhg38vcfgz)
        mm.add(alloutvcf, outhg38vcfgz, cmd)

    mm.write()
Example #44
def mappability(args):
    """
    %prog mappability reference.fasta

    Generate 50mer mappability for reference genome. Commands are based on gem
    mapper. See instructions:
    <https://github.com/xuefzhao/Reference.Mappability>
    """
    p = OptionParser(mappability.__doc__)
    p.add_option("--mer", default=50, type="int", help="User mer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ref, = args
    K = opts.mer
    pf = ref.rsplit(".", 1)[0]
    mm = MakeManager()

    gem = pf + ".gem"
    cmd = "gem-indexer -i {} -o {}".format(ref, pf)
    mm.add(ref, gem, cmd)

    mer = pf + ".{}mer".format(K)
    mapb = mer + ".mappability"
    cmd = "gem-mappability -I {} -l {} -o {} -T {}".\
                format(gem, K, mer, opts.cpus)
    mm.add(gem, mapb, cmd)

    wig = mer + ".wig"
    cmd = "gem-2-wig -I {} -i {} -o {}".format(gem, mapb, mer)
    mm.add(mapb, wig, cmd)
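    # gem-2-wig also writes a {mer}.sizes chromosome-sizes file, which
    # wigToBigWig consumes below.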

    bw = mer + ".bw"
    cmd = "wigToBigWig {} {}.sizes {}".format(wig, mer, bw)
    mm.add(wig, bw, cmd)

    bg = mer + ".bedGraph"
    cmd = "bigWigToBedGraph {} {}".format(bw, bg)
    mm.add(bw, bg, cmd)

    merged = mer + ".filtered-1.merge.bed"
    cmd = "python -m jcvi.formats.bed filterbedgraph {} 1".format(bg)
    mm.add(bg, merged, cmd)

    mm.write()
Example #45
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, the BAM file and --lobstr_home may each be an S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)
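        # mm.run() executes the generated makefile immediately (cf.
        # mm.write(), which only writes it out for later execution).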

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Example #46
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
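        # Note: this wipes everything in the working directory, not just
        # intermediate outputs.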
        sh("rm -f *")
Example #48
def novo2(args):
    """
    %prog novo2 trimmed projectname

    Reference-free tGBS pipeline v2.
    """
    p = OptionParser(novo2.__doc__)
    p.set_fastq_names()
    p.set_align(pctid=94)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trimmed, pf = args
    pctid = opts.pctid
    reads, samples = scan_read_files(trimmed, opts.names)

    # Set up directory structure
    clustdir = "uclust"
    acdir ="allele_counts"
    for d in (clustdir, acdir):
        mkdir(d)

    mm = MakeManager()
    clustfiles = []
    # Step 0 - clustering within sample
    for s in samples:
        flist = [x for x in reads if op.basename(x).split(".")[0] == s]
        outfile = s + ".P{0}.clustS".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust cluster --cpus=8"
        cmd += " {0} {1}".format(s, " ".join(flist))
        cmd += " --outdir={0}".format(clustdir)
        cmd += " --pctid={0}".format(pctid)
        mm.add(flist, outfile, cmd)
        clustfiles.append(outfile)

    # Step 1 - make consensus within sample
    allcons = []
    for s, clustfile in zip(samples, clustfiles):
        outfile = s + ".P{0}.consensus".format(pctid)
        outfile = op.join(clustdir, outfile)
        cmd = "python -m jcvi.apps.uclust consensus"
        cmd += " {0}".format(clustfile)
        mm.add(clustfile, outfile, cmd)
        allcons.append(outfile)

    # Step 2 - clustering across samples
    clustSfile = pf + ".P{0}.clustS".format(pctid)
    cmd = "python -m jcvi.apps.uclust mcluster {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons, clustSfile, cmd)

    # Step 3 - make consensus across samples
    locifile = pf + ".P{0}.loci".format(pctid)
    cmd = "python -m jcvi.apps.uclust mconsensus {0}".format(" ".join(allcons))
    cmd += " --prefix={0}".format(pf)
    mm.add(allcons + [clustSfile], locifile, cmd)

    mm.write()