Example #1
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    # Four boolean switches, all off unless given on the command line.
    switches = (
        ("--fast", "Place sequence in the first cluster"),
        ("--consensus", "Compute consensus sequences"),
        ("--reads", "Use `cd-hit-454` to deduplicate [default: %default]"),
        ("--samestrand", "Enforce same strand alignment"),
    )
    for flag, description in switches:
        p.add_option(flag, default=False, action="store_true",
                     help=description)
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile = args[0]
    program = "cd-hit-454" if opts.reads else "cd-hit-est"

    # Gather the command as space-separated pieces and join them once.
    pieces = [
        op.join(opts.cdhit_home, program),
        "-c {0}".format(opts.pctid / 100.),
    ]
    if program == "cd-hit-est":
        pieces.append("-d 0")  # include complete defline
        if opts.samestrand:
            pieces.append("-r 0")
    if not opts.fast:
        pieces.append("-g 1")

    # The output name embeds the percent identity; the cluster file is
    # expected right next to it.
    dd = "{0}.P{1}.cdhit".format(fastafile, opts.pctid)
    clstr = dd + ".clstr"
    pieces.append("-M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd))
    if need_update(fastafile, (dd, clstr)):
        sh(" ".join(pieces))

    if opts.consensus:
        cons = dd + ".consensus"
        if need_update((clstr, fastafile), cons):
            consensus_bin = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
            sh("{0} clustfile={1} fastafile={2} output={3} maxlen=1".format(
                consensus_bin, clstr, fastafile, cons))

    # Callers receive the name of the deduplicated FASTA output.
    return dd
Example #2
0
File: str.py Project: qiao-xin/jcvi
def htt(args):
    """
    %prog htt bamfile chr4:3070000-3080000

    Extract HTT region and run lobSTR.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, region = args

    # The region-only BAM is written to the current directory under the
    # input's file name (everything after the last "/").
    minibam = bamfile.rsplit("/", 1)[-1]
    stale_index = minibam + ".bai"
    if op.exists(stale_index):
        sh("rm {}".format(stale_index))

    sh("samtools view {} {} -b -o {}".format(bamfile, region, minibam))
    sh("samtools index {0}".format(minibam))

    # "chr4:3070000-3080000" -> "4"
    seqname = region.partition(":")[0]
    chrom = seqname.replace("chr", "")
    command, _ = allelotype_on_chr(minibam, chrom, opts.lobstr_home, "hg38")
    sh(command)
Example #3
0
File: cdhit.py Project: rrane/jcvi
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    # Help text used to read "[%default: %default]", which rendered the
    # default value twice instead of the word "default".
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    identity = opts.pctid / 100.  # percent -> fraction for `-c`

    # Name the output once so the command and the return value cannot drift.
    dd = fastafile + ".cdhit"

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd)
    sh(cmd)

    # Callers receive the name of the deduplicated FASTA output.
    return dd
Example #4
0
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    # A single VCF is processed directly; anything else is read as a list
    # file with one VCF path per line.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        stripped = (x.strip() for x in must_open(samples))
        vcffiles = [x for x in stripped if x]  # ignore blank lines

    # Skip inputs whose name marks them as already filtered.
    vcffiles = [x for x in vcffiles if ".filtered." not in x]
    if not vcffiles:
        # Pool(processes=0) raises ValueError, so stop early instead.
        sys.stderr.write("No VCF files to filter.\n")
        return

    # The third tuple member is the store for s3:// inputs, otherwise False.
    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # .get() blocks until every task has finished and re-raises the
        # first worker exception, if any.
        pool.map_async(run_filter, run_args).get()
    finally:
        # Release the worker processes even when a task failed.
        pool.terminate()
        pool.join()
Example #5
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option(
        "--notreds",
        default=False,
        action="store_true",
        help="Remove TREDs from the bed file",
    )
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]  # output directory = FASTA name up to first "."
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Write a copy of the bed file keeping only the first record seen
        # for each `longname`.
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Context managers close both handles even if a row fails to parse;
        # the input handle used to be left open.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                print(r, file=newbed)
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register three targets with the MakeManager; nothing is executed here
    # beyond whatever mm.write() does.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    # NOTE(review): the copy reads `newbedfile` but the declared dependency is
    # `trfbed` - confirm this is intended.
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #6
0
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    from jcvi.formats.sam import get_minibam

    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = [
        "HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2", "FXTAS"
    ]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.add_option("--ref",
                 choices=db_choices,
                 default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (bamfile, ) = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred
    if tred is None:
        # --tred has no default; without it the table lookup below can only
        # fail, so show the usage instead of a traceback.
        sys.exit(not p.print_help())

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # Label-based row lookup. `.ix` was removed from pandas; `.loc` is the
    # label-only equivalent.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        # hg19 uses a suffixed column and a suffixed database name.
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Widen the repeat interval by PAD bases on each side.
    PAD = 1000
    start, end = start_end.split("-")
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    items = parser.items()
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    # Take the first entry without indexing, so this works whether items()
    # returned a list or a dict view.
    _, v = next(iter(items))
    print("{} => {}".format(tred, v.replace(",", "/")), file=sys.stderr)
Example #7
0
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(beagle.__doc__)
    p.set_home("beagle")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # `chrom` rather than `chr`, which would shadow the builtin.
    vcffile, chrom = args
    pf = vcffile.rsplit(".", 1)[0]
    outpf = pf + ".beagle"
    outfile = outpf + ".vcf.gz"

    mm = MakeManager()
    # The value of the beagle "home" option is used verbatim as the command
    # prefix.
    beagle_cmd = opts.beagle_home
    kg = op.join(opts.ref, "1000GP_Phase3")
    cmd = beagle_cmd + " gt={0}".format(vcffile)
    cmd += " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(kg, chrom)
    cmd += " map={0}/plink.chr{1}.GRCh37.map".format(kg, chrom)
    cmd += " out={0}".format(outpf)
    # Honour --cpus (registered by set_cpus above); the thread count used to
    # be hard-coded to 16, which silently ignored the option.
    cmd += " nthreads={0} gprobs=true".format(opts.cpus)
    mm.add(vcffile, outfile, cmd)

    mm.write()
Example #8
0
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text;
    # it used to advertise this command as `snp`.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]  # input name without its last extension
    uniqsam = pf + ".unique.sam"
    # Only rerun the converter when need_update() reports the SAM as out of
    # date with respect to its inputs.
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    # Hand the SAM file to jcvi.formats.sam.index for the final step.
    index([uniqsam])
Example #9
0
File: train.py Project: rrane/jcvi
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    try:
        # Link the FASTA into the work directory; the "../" prefix means
        # `fastafile` is taken relative to the starting directory.
        sh("ln -sf ../{0}".format(fastafile))

        # `keyfile` rather than `license`, which would shadow the builtin.
        keyfile = op.expanduser("~/.gm_key")
        assert op.exists(keyfile), \
            "License key ({0}) not found!".format(keyfile)
        sh("{0}/gm_es.pl {1}".format(mhome, fastafile))
    finally:
        # Restore the caller's working directory even when the licence check
        # or one of the commands raises.
        os.chdir(cwd)

    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(
        gmdir, species))
Example #10
0
File: tgbs.py Project: fw1121/jcvi
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile = args[0]
    scripts_home = opts.eddyyeh_home
    prefix = gsnapfile.rsplit(".", 1)[0]  # input name minus last extension

    # Step 1: GSNAP output -> ".native" file, only when need_update() says so.
    nativefile = prefix + ".native"
    if need_update(gsnapfile, nativefile):
        converter = op.join(scripts_home, "convert2native.pl")
        sh("{0} --gsnap {1} -o {2} -proc {3}".format(
            converter, gsnapfile, nativefile, opts.cpus))

    # Step 2: ".native" file -> ".snp" file, with fixed caller thresholds.
    snpfile = prefix + ".snp"
    if need_update(nativefile, snpfile):
        caller = op.join(scripts_home, "SNPs/SNP_Discovery-short.pl")
        sh("{0} --native {1} -o {2} -a 2 -ac 0.3 -c 0.8".format(
            caller, nativefile, snpfile))
Example #11
0
File: tgbs.py Project: fw1121/jcvi
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text;
    # it used to advertise this command as `snp`.
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    pf = gsnapfile.rsplit(".", 1)[0]  # input name without its last extension
    uniqsam = pf + ".unique.sam"
    # Only rerun the converter when need_update() reports the SAM as out of
    # date with respect to its inputs.
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    # Hand the SAM file to jcvi.formats.sam.index for the final step.
    index([uniqsam])
Example #12
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    # Prefix = first read's file name up to its first dot.
    prefix = op.basename(r1).split(".")[0]

    # Collect the command as tokens and join them with single spaces.
    tokens = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        tokens.append("-suffix")
    # Pass -noindex when need_update() reports this file as current with
    # respect to the reference.
    if not need_update(ref, "{0}.ref.fa.sa".format(prefix)):
        tokens.append("-noindex")
    tokens.append("-threads {0}".format(opts.cpus))
    # -I is added when the guessed quality offset of the first read file is 64.
    if guessoffset([r1]) == 64:
        tokens.append("-I")
    tokens.extend((prefix, ref, r1, r2))
    sh(" ".join(tokens))
Example #13
0
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    # A single VCF is processed directly; anything else is read as a list
    # file with one VCF path per line.
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        stripped = (x.strip() for x in must_open(samples))
        vcffiles = [x for x in stripped if x]  # ignore blank lines

    # Skip inputs whose name marks them as already filtered.
    vcffiles = [x for x in vcffiles if ".filtered." not in x]
    if not vcffiles:
        # Pool(processes=0) raises ValueError, so stop early instead.
        sys.stderr.write("No VCF files to filter.\n")
        return

    # The third tuple member is the store for s3:// inputs, otherwise False.
    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # .get() blocks until every task has finished and re-raises the
        # first worker exception, if any.
        pool.map_async(run_filter, run_args).get()
    finally:
        # Release the worker processes even when a task failed.
        pool.terminate()
        pool.join()
Example #14
0
def htt(args):
    """
    %prog htt bamfile

    Extract HTT region and run lobSTR.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    lhome = opts.lobstr_home

    # The region-only BAM is written to the current directory under the
    # input's file name (everything after the last "/").
    minibamfile = bamfile.split("/")[-1]
    cmd = "samtools view {0} chr4:3070000-3080000 -b".format(bamfile)
    cmd += " -o {0}".format(minibamfile)
    sh(cmd)

    # Drop any index left from an earlier run before re-indexing; -f keeps
    # `rm` from failing when no index exists yet (e.g. on the first run).
    sh("rm -f {0}.bai".format(minibamfile))
    sh("samtools index {0}".format(minibamfile))

    # NOTE(review): assumes allelotype_on_chr returns a single command string
    # in this version of the module - confirm.
    cmd = allelotype_on_chr(minibamfile, 4, lhome, "hg38-named")
    sh(cmd)
Example #15
0
File: str.py Project: Hensonmw/jcvi
def htt(args):
    """
    %prog htt bamfile

    Extract HTT region and run lobSTR.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(htt.__doc__)
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    lhome = opts.lobstr_home

    # The region-only BAM is written to the current directory under the
    # input's file name (everything after the last "/").
    minibamfile = bamfile.split("/")[-1]
    cmd = "samtools view {0} chr4:3070000-3080000 -b".format(bamfile)
    cmd += " -o {0}".format(minibamfile)
    sh(cmd)

    # Drop any index left from an earlier run before re-indexing; -f keeps
    # `rm` from failing when no index exists yet (e.g. on the first run).
    sh("rm -f {0}.bai".format(minibamfile))
    sh("samtools index {0}".format(minibamfile))

    # NOTE(review): assumes allelotype_on_chr returns a single command string
    # in this version of the module - confirm.
    cmd = allelotype_on_chr(minibamfile, 4, lhome, "hg38-named")
    sh(cmd)
Example #16
0
File: train.py Project: rrane/jcvi
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # Use this function's own docstring as usage text; it used to borrow
    # `snap.__doc__`, so --help described a different command.
    p = OptionParser(augustus.__doc__)
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)

    # The commands below run inside `augdir`; the "../" prefixes mean gffile
    # and fastafile are taken relative to the starting directory.
    sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
    sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".format(
        mhome, gffile, fastafile))
    sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
        mhome, species))
    # Raw string: `\S` is not a valid Python escape, so the non-raw literal
    # triggered an invalid-escape warning. The command text is unchanged.
    sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
    sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
        mhome))
    sh("grep -c LOCUS raw.gb training.gb")
    sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".format(
        mhome, species))

    os.chdir(cwd)
    # Copy the species directory from the AUGUSTUS home into ./augustus/.
    sh("cp -r {0}/species/{1} augustus/".format(mhome, species))
Example #17
0
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(snp.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    gsnapfile = args[0]
    scripts_home = opts.eddyyeh_home
    prefix = gsnapfile.rsplit(".", 1)[0]  # input name minus last extension

    # Step 1: GSNAP output -> ".native" file, only when need_update() says so.
    nativefile = prefix + ".native"
    if need_update(gsnapfile, nativefile):
        converter = op.join(scripts_home, "convert2native.pl")
        sh("{0} --gsnap {1} -o {2} -proc {3}".format(
            converter, gsnapfile, nativefile, opts.cpus))

    # Step 2: ".native" file -> ".snp" file, with fixed caller thresholds.
    snpfile = prefix + ".snp"
    if need_update(nativefile, snpfile):
        caller = op.join(scripts_home, "SNPs/SNP_Discovery-short.pl")
        sh("{0} --native {1} -o {2} -a 2 -ac 0.3 -c 0.8".format(
            caller, nativefile, snpfile))
Example #18
0
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(genemark.__doc__)
    p.set_home("gmes")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    species, fastafile = args
    mhome = opts.gmes_home
    gmdir = "genemark"
    mkdir(gmdir)

    cwd = os.getcwd()
    os.chdir(gmdir)
    try:
        # Link the FASTA into the work directory; the "../" prefix means
        # `fastafile` is taken relative to the starting directory.
        sh("ln -sf ../{0}".format(fastafile))

        # `keyfile` rather than `license`, which would shadow the builtin.
        keyfile = op.expanduser("~/.gm_key")
        assert op.exists(keyfile), \
            "License key ({0}) not found!".format(keyfile)
        sh("{0}/gm_es.pl {1}".format(mhome, fastafile))
    finally:
        # Restore the caller's working directory even when the licence check
        # or one of the commands raises.
        os.chdir(cwd)

    logging.debug("GENEMARK matrix written to `{0}/mod/{1}.mod`".format(
        gmdir, species))
Example #19
0
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    from pyfaidx import Fasta

    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    # `chrom` rather than `chr`, which would shadow the builtin.
    vcffile, fastafile, chrom = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chrom, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chrom])
    binsize = 5000000  # 5Mb bins
    # Ceiling division with integer arithmetic: identical on Python 2 and 3
    # (plain `/` yields a float on Python 3, which range() rejects).
    bins = (size + binsize - 1) // binsize
    impute_cmd = op.join(opts.impute_home, "impute2")
    # Reference panel paths are the same for every chunk.
    mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chrom)
    rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chrom)
    chunks = []
    # Exactly `bins` chunks cover [1, size]. The loop used to run bins + 1
    # times, which added a last interval starting beyond the sequence end
    # (chunk_start > chunk_end).
    for x in range(bins):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        # Ensure the target exists after a successful run so the final `cat`
        # always finds every chunk.
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".\
                format(imputefile, fastafile, chrom, vcffile)
    mm.add(imputefile, vcffile, cmd)
    mm.write()
Example #20
0
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        # `samtools faidx` is the subcommand that writes <fasta>.fai;
        # `samtools index` (used previously) indexes alignment files.
        sh("samtools faidx {}".format(chrMfa))

    # A ".bam" argument is a single sample; anything else is read as a list
    # file with one BAM path per line.
    if bamfile.endswith(".bam"):
        bamfiles = [bamfile]
    else:
        with open(bamfile) as fp:  # close the list file once read
            stripped = (x.strip() for x in fp)
            bamfiles = [x for x in stripped if x]  # ignore blank lines

    if store:
        # Sample names (basename up to the first dot) that already have a
        # ".depth" file in the store are skipped.
        computed = set(op.basename(x).split(".")[0] for x in ls_s3(store)
                       if x.endswith(".depth"))
        remaining_samples = [x for x in bamfiles
                             if op.basename(x).split(".")[0] not in computed]

        logging.debug("Already computed on `{}`: {}".\
                        format(store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bam in bamfiles:
        run_mito(chrMfa, bam, opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store, cleanup=cleanup)
Example #21
0
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    from jcvi.formats.sam import get_minibam
    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2",
               "FXTAS"]
    db_choices = ("hg38", "hg19")

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE,
                 help="TRED name")
    p.add_option("--ref", choices=db_choices, default="hg38",
                 help="Reference genome")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    bamfile, = args
    ref = opts.ref
    lhome = opts.lobstr_home
    tred = opts.tred
    if tred is None:
        # --tred has no default; without it the table lookup below can only
        # fail, so show the usage instead of a traceback.
        sys.exit(not p.print_help())

    tredsfile = datafile("TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # Label-based row lookup. `.ix` was removed from pandas; `.loc` is the
    # label-only equivalent.
    row = tf.loc[tred]
    tag = "repeat_location"
    ldb = "TREDs"
    if ref == "hg19":
        # hg19 uses a suffixed column and a suffixed database name.
        tag += "." + ref
        ldb += "-" + ref
    seqid, start_end = row[tag].split(":")

    # Widen the repeat interval by PAD bases on each side.
    PAD = 1000
    start, end = start_end.split('-')
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, ldb)
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    items = parser.items()
    if not items:
        print("No entry found!", file=sys.stderr)
        return

    # Take the first entry without indexing, so this works whether items()
    # returned a list or a dict view.
    _, v = next(iter(items))
    print("{} => {}".format(tred, v.replace(',', '/')), file=sys.stderr)
Example #22
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--notreds", default=False, action="store_true",
                 help="Remove TREDs from the bed file")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    trfbed, fastafile = args
    pf = fastafile.split(".")[0]  # output directory = FASTA name up to first "."
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.notreds:
        # Write a copy of the bed file keeping only the first record seen
        # for each `longname`.
        newbedfile = trfbed + ".new"
        retained = total = 0
        seen = set()
        # Context managers close both handles even if a row fails to parse;
        # the input handle used to be left open.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                r = STRLine(row)
                total += 1
                name = r.longname
                if name in seen:
                    continue
                seen.add(name)
                # Same output as `print >> newbed, r`, but valid on both
                # Python 2 and Python 3.
                newbed.write(str(r) + "\n")
                retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register three targets with the MakeManager; nothing is executed here
    # beyond whatever mm.write() does.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(newbedfile, infofile)
    # NOTE(review): the copy reads `newbedfile` but the declared dependency is
    # `trfbed` - confirm this is intended.
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #23
0
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(augustus.__doc__)
    p.add_option(
        "--autotrain",
        default=False,
        action="store_true",
        help="Run autoAugTrain.pl to iteratively train AUGUSTUS",
    )
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    # Resolve the inputs before changing directory so they stay reachable.
    gffile, fastafile = os.path.abspath(gffile), os.path.abspath(fastafile)
    mhome = opts.augustus_home
    startdir = os.getcwd()

    mkdir("augustus")
    os.chdir("augustus")

    # Species directory inside the AUGUSTUS home; a previous copy is removed
    # so training starts clean.
    target = "{0}/config/species/{1}".format(mhome, species)
    remove_target = "rm -rf {0}".format(target)
    if op.exists(target):
        logging.debug("Removing existing target `{0}`".format(target))
        sh(remove_target)

    # Training pipeline, executed in order inside ./augustus.
    pipeline = [
        "{0}/scripts/new_species.pl --species={1} --AUGUSTUS_CONFIG_PATH={2}".
        format(mhome, species, "{0}/config".format(mhome)),
        "{0}/scripts/gff2gbSmallDNA.pl {1} {2} 1000 raw.gb".format(
            mhome, gffile, fastafile),
        "{0}/bin/etraining --species={1} raw.gb 2> train.err".format(
            mhome, species),
        r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst",
        "{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".format(
            mhome),
        "grep -c LOCUS raw.gb training.gb",
    ]

    # autoAugTrain failed to execute, disable for now
    if opts.autotrain:
        pipeline.append(remove_target)
        pipeline.append(
            "{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}"
            .format(mhome, species))

    for step in pipeline:
        sh(step)

    os.chdir(startdir)
    # Copy the species directory into ./augustus/.
    sh("cp -r {0} augustus/".format(target))
Example #24
0
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq",
                 action="store_true",
                 default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args  # pf: output directory / prefix
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        # Rewrite the bed file from the records that iter_exact_str() yields
        # for each input line, keeping only those that report is_valid().
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Context managers close both handles even if a row fails to parse;
        # the input handle used to be left open.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # Same output as `print >> newbed, ns`, but valid on
                    # both Python 2 and Python 3.
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    # Register three targets with the MakeManager; nothing is executed here
    # beyond whatever mm.write() does.
    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # The info file is a copy of the original (unrewritten) bed file.
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #25
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # NOTE: the docstring above is also handed to OptionParser as usage text.
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    # Four boolean switches, all off unless given on the command line.
    switches = (
        ("--fast", "Place sequence in the first cluster"),
        ("--consensus", "Compute consensus sequences"),
        ("--reads", "Use `cd-hit-454` to deduplicate [default: %default]"),
        ("--samestrand", "Enforce same strand alignment"),
    )
    for flag, description in switches:
        p.add_option(flag, default=False, action="store_true",
                     help=description)
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # fasta() is called with --seqtk and returns a (fasta, qual) pair; only
    # the FASTA name is used from here on.
    fastafile, _ = fasta([args[0], "--seqtk"])

    program = "cd-hit-454" if opts.reads else "cd-hit-est"

    # Gather the command as space-separated pieces and join them once.
    pieces = [
        op.join(opts.cdhit_home, program),
        "-c {0}".format(opts.pctid / 100.),
    ]
    if program == "cd-hit-est":
        pieces.append("-d 0")  # include complete defline
        if opts.samestrand:
            pieces.append("-r 0")
    if not opts.fast:
        pieces.append("-g 1")
    if opts.pctcov != 0:
        # The same coverage fraction is passed to both -aL and -aS.
        pieces.append("-aL {0} -aS {0}".format(opts.pctcov / 100.))

    # The output name embeds the percent identity; the cluster file is
    # expected right next to it.
    dd = "{0}.P{1}.cdhit".format(fastafile, opts.pctid)
    clstr = dd + ".clstr"
    pieces.append("-M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd))
    if need_update(fastafile, (dd, clstr)):
        sh(" ".join(pieces))

    if opts.consensus:
        cons = dd + ".consensus"
        if need_update((clstr, fastafile), cons):
            consensus_bin = op.join(opts.cdhit_home, "cdhit-cluster-consensus")
            sh("{0} clustfile={1} fastafile={2} output={3} maxlen=1".format(
                consensus_bin, clstr, fastafile, cons))

    # Callers receive the name of the deduplicated FASTA output.
    return dd
Example #26
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # `args` is the raw argument list. The single positional argument is
    # either one .vcf/.vcf.gz file or a text file listing one VCF per line.
    # Work happens inside `opts.workdir`; each VCF is handed to run_compile()
    # in a process pool.
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option(
        "--nofilter",
        default=False,
        action="store_true",
        help="Do not filter the variants",
    )
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samples,) = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # We just changed directory, so anchor the input on the original cwd.
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Build the list of STR ids once; later runs reuse the cached file.
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    # Pool() rejects processes < 1, so keep one worker even for an empty list.
    cpus = max(1, min(opts.cpus, len(run_args)))
    pool = Pool(processes=cpus)
    try:
        # Results are discarded; .get() only waits and re-raises worker errors.
        pool.map_async(run_compile, run_args).get()
    finally:
        # Always reap the worker processes, including on failure.
        pool.terminate()
        pool.join()
Example #27
0
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Prints one `lobstr` command line per sample whose hg38 VCF is not yet
    # listed in the output store; nothing is executed here.
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    (samplesfile,) = args
    store = opts.output_path
    computed = ls_s3(store)
    skipped = total = 0
    with open(samplesfile) as fp:
        for row in fp:
            row = row.strip()
            if not row:
                # Blank or trailing lines would otherwise break the unpacking.
                continue
            total += 1
            sample, s3file = row.split(",")[:2]
            # Sample names are expected to look like `<exec_id>_<sample_id>`.
            exec_id, sample_id = sample.split("_")
            bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

            gzfile = sample + ".{0}.vcf.gz".format("hg38")
            if gzfile in computed:
                skipped += 1
                continue

            print(
                opts.sep.join(
                    "python -m jcvi.variation.str lobstr".split()
                    + [
                        "hg38",
                        "--input_bam_path",
                        bamfile,
                        "--output_path",
                        store,
                        "--sample_id",
                        sample_id,
                        "--workflow_execution_id",
                        exec_id,
                        "--lobstr_home",
                        opts.lobstr_home,
                        "--workdir",
                        opts.workdir,
                    ]
                )
            )
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
Example #28
0
File: str.py Project: Hensonmw/jcvi
def lobstrindex(args):
    """
    %prog lobstrindex hg38.trf.bed hg38.upper.fa hg38

    Make lobSTR index. Make sure the FASTA contain only upper case (so use
    fasta.format --upper to convert from UCSC fasta). The bed file is generated
    by str().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Registers three targets with a MakeManager (lobSTR reference, index.tab,
    # index.info) and calls mm.write(); with --fixseq the BED file is first
    # rewritten to `<trfbed>.new`, keeping only valid exact STRs.
    p = OptionParser(lobstrindex.__doc__)
    p.add_option("--fixseq", action="store_true", default=False,
                 help="Scan sequences to extract perfect STRs")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    trfbed, fastafile, pf = args
    lhome = opts.lobstr_home
    mkdir(pf)

    if opts.fixseq:
        genome = pyfasta.Fasta(fastafile)
        newbedfile = trfbed + ".new"
        retained = total = 0
        # Context managers close both handles even if a record fails to parse.
        with open(trfbed) as fp, open(newbedfile, "w") as newbed:
            for row in fp:
                s = STRLine(row)
                total += 1
                for ns in s.iter_exact_str(genome):
                    if not ns.is_valid():
                        continue
                    # Explicit write works on both Python 2 and Python 3.
                    newbed.write(str(ns) + "\n")
                    retained += 1
        logging.debug("Retained: {0}".format(percentage(retained, total)))
    else:
        newbedfile = trfbed

    mm = MakeManager()
    cmd = "python {0}/scripts/lobstr_index.py".format(lhome)
    cmd += " --str {0} --ref {1} --out {2}".format(newbedfile, fastafile, pf)
    mm.add((newbedfile, fastafile), op.join(pf, "lobSTR_ref.fasta.rsa"), cmd)

    tabfile = "{0}/index.tab".format(pf)
    cmd = "python {0}/scripts/GetSTRInfo.py".format(lhome)
    cmd += " {0} {1} > {2}".format(newbedfile, fastafile, tabfile)
    mm.add((newbedfile, fastafile), tabfile, cmd)

    # NOTE(review): index.info is copied from the original BED, not the
    # --fixseq output; confirm that this is intended.
    infofile = "{0}/index.info".format(pf)
    cmd = "cp {0} {1}".format(trfbed, infofile)
    mm.add(trfbed, infofile, cmd)
    mm.write()
Example #29
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # The FASTA is split into `cpus` pieces inside a temporary directory,
    # augustuswrap() runs on each piece, and the per-piece outputs are merged
    # into `<fasta stem>.gff3` (or `.out` with --nogff3).
    parser = OptionParser(augustus.__doc__)
    parser.add_option("--species", default="maize",
                      help="Use species model for prediction")
    parser.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    parser.add_option("--nogff3", default=False, action="store_true",
                      help="Turn --gff3=off")
    parser.set_home("augustus")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    emit_gff3 = not opts.nogff3
    if emit_gff3:
        ext = ".gff3"
    else:
        ext = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    def with_ext(path):
        # Swap the last extension of `path` for the chosen output extension.
        return path.rsplit(".", 1)[0] + ext

    scratch = mkdtemp(dir=".")
    pieces = split([fastafile, scratch, str(opts.cpus)])

    worker = partial(
        augustuswrap,
        species=opts.species,
        gff3=emit_gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    Jobs(worker, pieces.names).run()

    piece_outputs = [with_ext(name) for name in pieces.names]
    outfile = with_ext(fastafile)
    FileMerger(piece_outputs, outfile=outfile).merge()
    shutil.rmtree(scratch)

    if emit_gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus

        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Example #30
0
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Steps: (1) run the PASA training-set script if its outputs need updating,
    # (2) collect the ids on lines containing "complete" in the transdecoder
    # GFF, and (3) write the genome GFF features that belong to those ids to
    # `<...>.transdecoder.genome.complete.gff3`.
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(
            opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    gff = Gff(transcodergenomegff)
    with open(completegff, "w") as fw:
        for g in gff:
            a = g.attributes
            # Child features are keyed by their parent; top-level by own ID.
            model_id = a["Parent"][0] if "Parent" in a else a["ID"][0]
            asmbl_id = model_id.split("|")[0]
            if asmbl_id not in complete:
                continue
            # Explicit write works on both Python 2 and Python 3.
            fw.write(str(g) + "\n")
            if g.type == "gene":
                seen.add(model_id)

    logging.debug("A total of {0} complete models extracted to `{1}`.".\
                    format(len(seen), completegff))
Example #31
0
def batchlobstr(args):
    """
    %prog batchlobstr samples.csv

    Run lobSTR sequentially on list of samples. Each line contains:
    sample-name,s3-location
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Prints one `lobstr` command line per sample whose hg38 VCF is not yet
    # listed in the output store; nothing is executed here.
    p = OptionParser(batchlobstr.__doc__)
    p.add_option("--sep", default=",", help="Separator for building commandline")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samplesfile, = args
    store = opts.output_path
    computed = ls_s3(store)
    skipped = total = 0
    with open(samplesfile) as fp:
        for row in fp:
            row = row.strip()
            if not row:
                # Blank or trailing lines would otherwise break the unpacking.
                continue
            total += 1
            sample, s3file = row.split(",")[:2]
            # Sample names are expected to look like `<exec_id>_<sample_id>`.
            exec_id, sample_id = sample.split("_")
            bamfile = s3file.replace(".gz", "").replace(".vcf", ".bam")

            gzfile = sample + ".{0}.vcf.gz".format("hg38")
            if gzfile in computed:
                skipped += 1
                continue

            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(opts.sep.join(
                "python -m jcvi.variation.str lobstr".split()
                + [
                    "hg38",
                    "--input_bam_path",
                    bamfile,
                    "--output_path",
                    store,
                    "--sample_id",
                    sample_id,
                    "--workflow_execution_id",
                    exec_id,
                    "--lobstr_home",
                    opts.lobstr_home,
                    "--workdir",
                    opts.workdir,
                ]
            ))
    logging.debug("Total skipped: {0}".format(percentage(skipped, total)))
Example #32
0
def dn(args):
    """
    %prog dn folder

    Run Trinity-DN on a folder of reads. When paired-end (--paired) mode is on,
    filenames will be scanned based on whether they contain "_1_" and "_2_".
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Merges the reads found in `folder` into `<folder>_DN/` and writes a
    # `run.sh` there holding the Trinity command; Trinity is not run here.
    p = OptionParser(dn.__doc__)
    p.add_option("--paired", default=False, action="store_true",
                 help="Paired-end mode [default: %default]")
    p.set_home("trinity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    folder, = args
    paired = opts.paired
    thome = opts.trinity_home
    tfolder = folder + "_DN"

    cwd = os.getcwd()
    mkdir(tfolder)
    os.chdir(tfolder)
    try:
        # glob() order is arbitrary; sort so that the left and right files are
        # merged in the same relative order and mates stay in sync.
        flist = sorted(glob("../" + folder + "/*"))
        if paired:
            # Match the pair tag against the file name only, so a folder name
            # that happens to contain "_1_" or ".1." cannot select every file.
            named = [(x, op.basename(x)) for x in flist]
            f1 = [x for x, b in named if "_1_" in b or ".1." in b]
            f2 = [x for x, b in named if "_2_" in b or ".2." in b]
            assert len(f1) == len(f2)
            r1, r2 = "left.fastq", "right.fastq"
            reads = ((f1, r1), (f2, r2))
        else:
            r = "single.fastq"
            reads = ((flist, r), )

        for fl, r in reads:
            fm = FileMerger(fl, r)
            fm.merge(checkexists=True)

        cmd = op.join(thome, "Trinity.pl")
        cmd += " --seqType fq --JM 100G --CPU {0}".format(opts.cpus)
        if paired:
            cmd += " --left {0} --right {1}".format(reads[0][-1], reads[1][-1])
        else:
            cmd += " --single {0}".format(reads[0][-1])

        runfile = "run.sh"
        write_file(runfile, cmd, meta="run script")
    finally:
        # Restore the caller's working directory even if a step above fails.
        os.chdir(cwd)
Example #33
0
File: str.py Project: ascendo/jcvi
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Reads one VCF path per line from the samples file, caches the combined
    # STR ids (`STR.ids`) and a diploid header (`header.ids`) in the work
    # directory, then hands each VCF to run() in a process pool.
    from multiprocessing import Pool

    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38,hg38-named",
                 help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    dbs = opts.db.split(",")
    mkdir(workdir)
    os.chdir(workdir)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # Explicit writes work on both Python 2 and Python 3.
        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

        # Generate two alleles
        dipuids = []
        for uid in uids:
            dipuids.extend([uid + ".1", uid + ".2"])
        with open("header.ids", "w") as fw:
            fw.write(",".join(dipuids) + "\n")

    pool = Pool(processes=opts.cpus)
    run_args = [(x, opts.store, opts.cleanup) for x in vcffiles]
    try:
        # Results are discarded; .get() only waits and re-raises worker errors.
        pool.map_async(run, run_args).get()
    finally:
        # Always reap the worker processes, including on failure.
        pool.terminate()
        pool.join()
Example #34
0
def pasa(args):
    """
    %prog ${pasadb}.assemblies.fasta ${pasadb}.pasa_assemblies.gff3

    Wraps `pasa_asmbls_to_training_set.dbi`.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Steps: (1) run the PASA training-set script if its outputs need updating,
    # (2) collect the ids on lines containing "complete" in the transdecoder
    # GFF, and (3) write the genome GFF features that belong to those ids to
    # `<...>.transdecoder.genome.complete.gff3`.
    from jcvi.formats.base import SetFile
    from jcvi.formats.gff import Gff

    p = OptionParser(pasa.__doc__)
    p.set_home("pasa")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    fastafile, gffile = args
    transcodergff = fastafile + ".transdecoder.gff3"
    transcodergenomegff = fastafile + ".transdecoder.genome.gff3"
    if need_update((fastafile, gffile), (transcodergff, transcodergenomegff)):
        cmd = "{0}/scripts/pasa_asmbls_to_training_set.dbi".format(opts.pasa_home)
        cmd += " --pasa_transcripts_fasta {0} --pasa_transcripts_gff3 {1}".\
                format(fastafile, gffile)
        sh(cmd)

    completeids = fastafile.rsplit(".", 1)[0] + ".complete.ids"
    if need_update(transcodergff, completeids):
        cmd = "grep complete {0} | cut -f1 | sort -u".format(transcodergff)
        sh(cmd, outfile=completeids)

    complete = SetFile(completeids)
    seen = set()
    completegff = transcodergenomegff.rsplit(".", 1)[0] + ".complete.gff3"
    gff = Gff(transcodergenomegff)
    with open(completegff, "w") as fw:
        for g in gff:
            a = g.attributes
            # Child features are keyed by their parent; top-level by own ID.
            model_id = a["Parent"][0] if "Parent" in a else a["ID"][0]
            asmbl_id = model_id.split("|")[0]
            if asmbl_id not in complete:
                continue
            # Explicit write works on both Python 2 and Python 3.
            fw.write(str(g) + "\n")
            if g.type == "gene":
                seen.add(model_id)

    logging.debug("A total of {0} complete models extracted to `{1}`.".\
                    format(len(seen), completegff))
Example #35
0
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # `args` is the raw argument list. The single positional argument is
    # either one .vcf/.vcf.gz file or a text file listing one VCF per line.
    # Work happens inside `opts.workdir`; each VCF is handed to run_compile()
    # in a process pool.
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # We just changed directory, so anchor the input on the original cwd.
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        # Build the list of STR ids once; later runs reuse the cached file.
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # Explicit write works on both Python 2 and Python 3.
        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

    run_args = [(x, filtered, cleanup, store) for x in vcffiles]
    # Pool() rejects processes < 1, so keep one worker even for an empty list.
    cpus = max(1, min(opts.cpus, len(run_args)))
    pool = Pool(processes=cpus)
    try:
        # Results are discarded; .get() only waits and re-raises worker errors.
        pool.map_async(run_compile, run_args).get()
    finally:
        # Always reap the worker processes, including on failure.
        pool.terminate()
        pool.join()
Example #36
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Assembles the `alignextend.pl` command line piece by piece and runs it
    # through sh(). The output prefix is the first dot-separated token of the
    # read-1 file name.
    stages = ["prepare", "align", "filter", "rmdup", "genreads"]
    parser = OptionParser(alignextend.__doc__)
    parser.add_option("--nosuffix", default=False, action="store_true",
                      help="Do not add /1/2 suffix to the read [default: %default]")
    parser.add_option("--rc", default=False, action="store_true",
                      help="Reverse complement the reads before alignment")
    parser.add_option("--len", default=100, type="int",
                      help="Extend to this length")
    parser.add_option("--stage", default="prepare", choices=stages,
                      help="Start from certain stage")
    parser.add_option("--dup", default=10, type="int",
                      help="Filter duplicates with coordinates within this distance")
    parser.add_option("--maxdiff", default=1, type="int",
                      help="Maximum number of differences")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    words = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        words.append("-suffix")
    # -noindex is passed when need_update() reports the index as current.
    if not need_update(ref, "{0}.ref.fa.sa".format(pf)):
        words.append("-noindex")
    words.append("-threads {0}".format(opts.cpus))
    if guessoffset([r1]) == 64:
        words.append("-I")
    if opts.rc:
        words.append("-rc")
    words.append("-allow -len {0} -dup {1}".format(opts.len, opts.dup))
    words.append("-min {0} -max {1}".format(2 * opts.len, 20 * opts.len))
    words.append("-maxdiff {0}".format(opts.maxdiff))
    words.append("-stage {0}".format(opts.stage))
    words.extend((pf, ref, r1, r2))
    sh(" ".join(words))
Example #37
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Assembles the `alignextend.pl` command line piece by piece and runs it
    # through sh(). The output prefix is the first dot-separated token of the
    # read-1 file name.
    stages = ["prepare", "align", "filter", "rmdup", "genreads"]
    parser = OptionParser(alignextend.__doc__)
    parser.add_option("--nosuffix", default=False, action="store_true",
                      help="Do not add /1/2 suffix to the read [default: %default]")
    parser.add_option("--rc", default=False, action="store_true",
                      help="Reverse complement the reads before alignment")
    parser.add_option("--len", default=100, type="int",
                      help="Extend to this length")
    parser.add_option("--stage", default="prepare", choices=stages,
                      help="Start from certain stage")
    parser.add_option("--dup", default=10, type="int",
                      help="Filter duplicates with coordinates within this distance")
    parser.add_option("--maxdiff", default=1, type="int",
                      help="Maximum number of differences")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    words = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        words.append("-suffix")
    # -noindex is passed when need_update() reports the index as current.
    if not need_update(ref, "{0}.ref.fa.sa".format(pf)):
        words.append("-noindex")
    words.append("-threads {0}".format(opts.cpus))
    if guessoffset([r1]) == 64:
        words.append("-I")
    if opts.rc:
        words.append("-rc")
    words.append("-allow -len {0} -dup {1}".format(opts.len, opts.dup))
    words.append("-min {0} -max {1}".format(2 * opts.len, 20 * opts.len))
    words.append("-maxdiff {0}".format(opts.maxdiff))
    words.append("-stage {0}".format(opts.stage))
    words.extend((pf, ref, r1, r2))
    sh(" ".join(words))
Example #38
0
def augustus(args):
    """
    %prog augustus species gffile fastafile

    Train AUGUSTUS model given gffile and fastafile. Whole procedure taken from:
    <http://www.molecularevolution.org/molevolfiles/exercises/augustus/training.html>
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    # The original passed snap.__doc__ here, so `--help` showed SNAP's usage.
    p = OptionParser(augustus.__doc__)
    p.add_option("--autotrain", default=False, action="store_true",
                 help="Run autoAugTrain.pl to iteratively train AUGUSTUS")
    p.set_home("augustus")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.augustus_home
    augdir = "augustus"
    target = "{0}/config/species/{1}".format(mhome, species)

    cwd = os.getcwd()
    mkdir(augdir)
    os.chdir(augdir)
    try:
        if op.exists(target):
            logging.debug("Removing existing target `{0}`".format(target))
            sh("rm -rf {0}".format(target))

        sh("{0}/scripts/new_species.pl --species={1}".format(mhome, species))
        # Inputs are referenced as ../<file> because we are inside `augustus/`.
        sh("{0}/scripts/gff2gbSmallDNA.pl ../{1} ../{2} 1000 raw.gb".\
                format(mhome, gffile, fastafile))
        sh("{0}/bin/etraining --species={1} raw.gb 2> train.err".\
                format(mhome, species))
        # Raw string: `\S` is for perl, not a Python escape sequence.
        sh(r"cat train.err | perl -pe 's/.*in sequence (\S+): .*/$1/' > badgenes.lst")
        sh("{0}/scripts/filterGenes.pl badgenes.lst raw.gb > training.gb".\
                format(mhome))
        sh("grep -c LOCUS raw.gb training.gb")

        # autoAugTrain failed to execute, disable for now
        if opts.autotrain:
            sh("rm -rf {0}".format(target))
            sh("{0}/scripts/autoAugTrain.pl --trainingset=training.gb --species={1}".\
                    format(mhome, species))
    finally:
        # Restore the caller's working directory even if a step above raises.
        os.chdir(cwd)

    sh("cp -r {0} augustus/".format(target))
Example #39
0
def augustus(args):
    """
    %prog augustus fastafile

    Run parallel AUGUSTUS. Final results can be reformatted using
    annotation.reformat.augustus().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # The FASTA is split into `cpus` pieces inside a temporary directory,
    # augustuswrap() runs on each piece, and the per-piece outputs are merged
    # into `<fasta stem>.gff3` (or `.out` with --nogff3).
    parser = OptionParser(augustus.__doc__)
    parser.add_option("--species", default="maize",
                      help="Use species model for prediction")
    parser.add_option("--hintsfile", help="Hint-guided AUGUSTUS")
    parser.add_option("--nogff3", default=False, action="store_true",
                      help="Turn --gff3=off")
    parser.set_home("augustus")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile = args[0]
    emit_gff3 = not opts.nogff3
    if emit_gff3:
        ext = ".gff3"
    else:
        ext = ".out"
    cfgfile = op.join(opts.augustus_home,
                      "config/extrinsic/extrinsic.M.RM.E.W.cfg")

    def with_ext(path):
        # Swap the last extension of `path` for the chosen output extension.
        return path.rsplit(".", 1)[0] + ext

    scratch = mkdtemp(dir=".")
    pieces = split([fastafile, scratch, str(opts.cpus)])

    worker = partial(
        augustuswrap,
        species=opts.species,
        gff3=emit_gff3,
        cfgfile=cfgfile,
        hintsfile=opts.hintsfile,
    )
    Jobs(worker, pieces.names).run()

    piece_outputs = [with_ext(name) for name in pieces.names]
    outfile = with_ext(fastafile)
    FileMerger(piece_outputs, outfile=outfile).merge()
    shutil.rmtree(scratch)

    if emit_gff3:
        from jcvi.annotation.reformat import augustus as reformat_augustus
        reformat_outfile = outfile.replace(".gff3", ".reformat.gff3")
        reformat_augustus([outfile, "--outfile={0}".format(reformat_outfile)])
Example #40
0
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Reads one VCF path per line from the samples file, caches the combined
    # STR ids in `STR.ids` inside the work directory, then hands each VCF to
    # run() in a process pool.
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    cleanup = not opts.nocleanup
    dbs = opts.db.split(",")
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    # We just changed directory, so anchor the input on the original cwd.
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        # Explicit write works on both Python 2 and Python 3.
        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

    pool = Pool(processes=opts.cpus)
    run_args = [(x, store, cleanup) for x in vcffiles]
    try:
        # Results are discarded; .get() only waits and re-raises worker errors.
        pool.map_async(run, run_args).get()
    finally:
        # Always reap the worker processes, including on failure.
        pool.terminate()
        pool.join()
Example #41
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Writes a `merge.sh` helper, runs it once per split directory via GNU
    # parallel, then concatenates the resulting `*.all.gff` files into
    # `output.gff` using jcvi's own GFF merge.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output; sorted so the merge order is reproducible
    gffnames = sorted(glob("*.all.gff"))
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    with open(gfflist, "w") as fw:
        fw.write("\n".join(gffnames) + "\n")

    # Re-read the list through a managed handle instead of leaking open()
    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Example #42
0
def merge(args):
    """
    %prog merge outdir output.gff

    Follow-up command after grid jobs are completed after parallel().
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Writes a `merge.sh` helper, runs it once per split directory via GNU
    # parallel, then concatenates the resulting `*.all.gff` files into
    # `output.gff` using jcvi's own GFF merge.
    from jcvi.formats.gff import merge as gmerge

    p = OptionParser(merge.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    outdir, outputgff = args
    fsnames, suffix = get_fsnames(outdir)
    nfs = len(fsnames)
    cmd = op.join(opts.maker_home, "bin/gff3_merge")

    outfile = "merge.sh"
    write_file(outfile, mergesh.format(suffix, cmd))

    # Generate per split directory
    # Note that gff3_merge write to /tmp, so I limit processes here to avoid
    # filling up disk space
    sh("parallel -j 8 merge.sh {} ::: " + " ".join(fsnames))

    # One final output; sorted so the merge order is reproducible
    gffnames = sorted(glob("*.all.gff"))
    assert len(gffnames) == nfs

    # Again, DO NOT USE gff3_merge to merge with a smallish /tmp/ area
    gfflist = "gfflist"
    with open(gfflist, "w") as fw:
        fw.write("\n".join(gffnames) + "\n")

    # Re-read the list through a managed handle instead of leaking open()
    with open(gfflist) as fp:
        nlines = sum(1 for _ in fp)
    assert nlines == nfs  # Be extra, extra careful to include all results
    gmerge([gfflist, "-o", outputgff])
    logging.debug("Merged GFF file written to `{0}`".format(outputgff))
Example #43
0
def locus(args):
    """
    %prog locus bamfile

    Extract selected locus from a list of TREDs for validation, and run lobSTR.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Looks up the repeat location of the chosen TRED in `TREDs.meta.csv`,
    # pads it by 1000 bp on each side, extracts that region into a mini BAM,
    # runs the allelotype command on it and reports the first parsed call on
    # stderr.
    from jcvi.formats.sam import get_minibam
    # See `Format-lobSTR-database.ipynb` for a list of TREDs for validation
    INCLUDE = ["HD", "SBMA", "SCA1", "SCA2", "SCA8", "SCA17", "DM1", "DM2"]

    p = OptionParser(locus.__doc__)
    p.add_option("--tred", choices=INCLUDE, help="TRED name")
    p.set_home("lobstr")
    opts, args = p.parse_args(args)

    # --tred has no default; without it the table lookup below cannot work.
    if len(args) != 1 or opts.tred is None:
        sys.exit(not p.print_help())

    bamfile, = args
    lhome = opts.lobstr_home
    tred = opts.tred

    tredsfile = op.join(datadir, "TREDs.meta.csv")
    tf = pd.read_csv(tredsfile, index_col=0)
    # Label-based lookup; `.ix` has been removed from current pandas.
    row = tf.loc[tred]
    seqid, start_end = row["repeat_location"].split(":")

    PAD = 1000
    start, end = start_end.split('-')
    start, end = int(start) - PAD, int(end) + PAD
    region = "{}:{}-{}".format(seqid, start, end)

    minibamfile = get_minibam(bamfile, region)
    c = seqid.replace("chr", "")
    cmd, vcf = allelotype_on_chr(minibamfile, c, lhome, "TREDs")
    sh(cmd)

    parser = LobSTRvcf(columnidsfile=None)
    parser.parse(vcf, filtered=False)
    # list(...) keeps this working when items() returns a view (Python 3).
    k, v = list(parser.items())[0]
    sys.stderr.write("{} => {}\n".format(tred, v.replace(',', '/')))
Example #44
0
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # All intermediate files are produced inside a `snap/` sub-directory; the
    # resulting model is `snap/<species>.hmm`.
    parser = OptionParser(snap.__doc__)
    parser.set_home("maker")
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    species = args[0]
    # Absolute paths, because the commands below run from inside `snap/`.
    gffile, fastafile = [os.path.abspath(x) for x in args[1:]]
    mhome = opts.maker_home
    snapdir = "snap"
    mkdir(snapdir)

    cwd = os.getcwd()
    os.chdir(snapdir)

    newgffile = "training.gff3"
    logging.debug("Construct GFF file combined with sequence ...")
    assemble_steps = (
        "cat {0} > {1}".format(gffile, newgffile),
        'echo "##FASTA" >> {0}'.format(newgffile),
        "cat {0} >> {1}".format(fastafile, newgffile),
    )
    for step in assemble_steps:
        sh(step)

    logging.debug("Make models ...")
    training_steps = (
        "{0}/src/bin/maker2zff training.gff3".format(mhome),
        "{0}/exe/snap/fathom -categorize 1000 genome.ann genome.dna".format(mhome),
        "{0}/exe/snap/fathom -export 1000 -plus uni.ann uni.dna".format(mhome),
        "{0}/exe/snap/forge export.ann export.dna".format(mhome),
        "{0}/exe/snap/hmm-assembler.pl {1} . > {1}.hmm".format(mhome, species),
    )
    for step in training_steps:
        sh(step)

    os.chdir(cwd)
    message = "SNAP matrix written to `{0}/{1}.hmm`".format(snapdir, species)
    logging.debug(message)
Example #45
0
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Only writes `run.sh` containing the GapFiller command; nothing is run.
    parser = OptionParser(close.__doc__)
    parser.set_home("gapfiller")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the scaffold FASTA, the rest are read files.
    scaffolds, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner="bwa")

    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, scaffolds, opts.cpus)
    write_file("run.sh", cmd)
Example #46
0
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Only writes `run.sh` containing the GapFiller command; nothing is run.
    parser = OptionParser(close.__doc__)
    parser.set_home("gapfiller")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the scaffold FASTA, the rest are read files.
    scaffolds, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs, aligner="bwa")

    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, scaffolds, opts.cpus)
    write_file("run.sh", cmd)
Example #47
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Only writes `run.sh` containing the SSPACE command; nothing is run.
    parser = OptionParser(scaffold.__doc__)
    parser.set_home("sspace")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if not args:
        sys.exit(not parser.print_help())

    # First positional is the contig FASTA, the rest are read files.
    contigs, fastqs = args[0], args[1:]
    libtxt = write_libraries(fastqs)

    script = op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, contigs, opts.cpus)
    write_file("run.sh", cmd)
Example #48
0
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    # NOTE: the docstring above doubles as the usage text given to OptionParser.
    #
    # Only writes `run.sh` containing the Launch_PASA_pipeline.pl command.
    parser = OptionParser(prepare.__doc__)
    parser.set_home("pasa")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    cfg, est, ref = args
    launcher = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")
    words = [
        launcher,
        "-c {0} --CPU {1}".format(cfg, opts.cpus),
        "-C -R --ALIGNERS blat,gmap",
        "-t {0} -g {1}".format(est, ref),
    ]
    write_file("run.sh", " ".join(words), meta="run script")
Example #49
0
def htt(args):
    """
    %prog htt bamfile chr4:3070000-3080000

    Extract HTT region and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(htt.__doc__)
    parser.set_home("lobstr")
    opts, args = parser.parse_args(args)

    # Both the BAM file and the region string are required.
    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, region = args
    lobstr_home = opts.lobstr_home

    minibam = get_minibam(bamfile, region)

    # Sequence name is the part before ":", with the "chr" text removed,
    # e.g. "chr4:3070000-3080000" -> "4".
    seqname = region.partition(":")[0]
    chrom = seqname.replace("chr", "")

    # Only the command is needed here; the second returned value is not used.
    cmd, _vcf = allelotype_on_chr(minibam, chrom, lobstr_home, "hg38")
    sh(cmd)
Example #50
0
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    # The docstring above is also the usage message given to OptionParser.
    optparser = OptionParser(prepare.__doc__)
    optparser.set_home("pasa")
    optparser.set_cpus()
    opts, args = optparser.parse_args(args)

    nargs = len(args)
    if nargs != 3:
        # Wrong number of positionals: print usage and exit.
        sys.exit(not optparser.print_help())

    cfg, est, ref = args
    pasa_script = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")

    config_flags = " -c {0} --CPU {1}".format(cfg, opts.cpus)
    aligner_flags = " -C -R --ALIGNERS blat,gmap"
    input_flags = " -t {0} -g {1}".format(est, ref)
    cmd = pasa_script + config_flags + aligner_flags + input_flags

    # Write the command into `run.sh` for the user to launch later.
    write_file("run.sh", cmd, meta="run script")
Example #51
0
def htt(args):
    """
    %prog htt bamfile chr4:3070000-3080000

    Extract HTT region and run lobSTR.
    """
    from jcvi.formats.sam import get_minibam

    # The docstring above is also the usage message given to OptionParser.
    optparser = OptionParser(htt.__doc__)
    optparser.set_home("lobstr")
    opts, args = optparser.parse_args(args)

    if len(args) != 2:
        # Expect exactly: bamfile region
        sys.exit(not optparser.print_help())

    bamfile, region = args

    # Restrict the BAM to the requested region before genotyping.
    region_bam = get_minibam(bamfile, region)

    # "chr4:3070000-3080000" -> "chr4" -> "4"
    contig = region.split(":")[0]
    contig = contig.replace("chr", "")

    result = allelotype_on_chr(region_bam, contig, opts.lobstr_home, "hg38")
    # Two values come back; only the command is executed here.
    cmd, _vcf = result
    sh(cmd)
Example #52
0
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(genemark.__doc__)
    p.add_option("--junctions", help="Path to `junctions.bed` from Tophat2")
    p.set_home("gmes")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # `species` (args[0]) is part of the command-line contract but is not
    # used by any of the commands built below.
    fastafile = args[1]
    junctions = opts.junctions
    mhome = opts.gmes_home

    license_key = op.expanduser("~/.gm_key")
    if not op.exists(license_key):
        # Raised explicitly instead of via `assert` so the check is not
        # stripped under `python -O`; AssertionError is kept so the failure
        # type matches the previous behavior.
        raise AssertionError(
            "License key ({0}) not found!".format(license_key))

    cmd = "{0}/gmes_petap.pl --sequence {1}".format(mhome, fastafile)
    cmd += " --cores {0}".format(opts.cpus)
    if junctions:
        # Convert the junctions BED into an introns GFF (only when the GFF
        # needs updating), then train with --ET using that evidence.
        intronsgff = "introns.gff"
        if need_update(junctions, intronsgff):
            jcmd = "{0}/bet_to_gff.pl".format(mhome)
            jcmd += " --bed {0} --gff {1} --label Tophat2".\
                    format(junctions, intronsgff)
            sh(jcmd)
        cmd += " --ET {0} --et_score 10".format(intronsgff)
    else:
        # No junctions supplied: run with --ES.
        cmd += " --ES"
    sh(cmd)

    logging.debug("GENEMARK matrix written to `output/gmhmm.mod`")
Example #53
0
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    # The docstring above is also the usage message given to OptionParser.
    p = OptionParser(genemark.__doc__)
    p.add_option("--junctions", help="Path to `junctions.bed` from Tophat2")
    p.set_home("gmes")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # The first positional (`species`) is accepted for the command-line
    # interface but does not appear in any command built here.
    _species, fastafile = args
    junctions = opts.junctions
    mhome = opts.gmes_home

    # Named `keyfile` rather than `license` to avoid shadowing the builtin.
    keyfile = op.expanduser("~/.gm_key")
    if not op.exists(keyfile):
        # An explicit raise survives `python -O`, unlike an `assert`
        # statement; the exception type is kept as AssertionError so
        # existing handlers keep working.
        raise AssertionError("License key ({0}) not found!".format(keyfile))

    cmd = "{0}/gmes_petap.pl --sequence {1}".format(mhome, fastafile)
    cmd += " --cores {0}".format(opts.cpus)
    if junctions:
        intronsgff = "introns.gff"
        # Regenerate the introns GFF only when it is out of date with
        # respect to the junctions file.
        if need_update(junctions, intronsgff):
            jcmd = "{0}/bet_to_gff.pl".format(mhome)
            jcmd += " --bed {0} --gff {1} --label Tophat2".\
                    format(junctions, intronsgff)
            sh(jcmd)
        cmd += " --ET {0} --et_score 10".format(intronsgff)
    else:
        cmd += " --ES"
    sh(cmd)

    # Closing backtick added so the quoted path is properly delimited.
    logging.debug("GENEMARK matrix written to `output/gmhmm.mod`")
Example #54
0
File: train.py Project: rrane/jcvi
def snap(args):
    """
    %prog snap species gffile fastafile

    Train SNAP model given gffile and fastafile. Whole procedure taken from:
    <http://gmod.org/wiki/MAKER_Tutorial_2012>
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(snap.__doc__)
    p.set_home("maker")
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    species, gffile, fastafile = args
    mhome = opts.maker_home
    snapdir = "snap"
    mkdir(snapdir)

    # Resolve the inputs against the current directory BEFORE changing into
    # `snapdir`. Prefixing "../" (the previous approach) only worked for
    # relative paths and silently broke absolute ones.
    gffile = os.path.abspath(gffile)
    fastafile = os.path.abspath(fastafile)

    cwd = os.getcwd()
    os.chdir(snapdir)
    try:
        newgffile = "training.gff3"
        logging.debug("Construct GFF file combined with sequence ...")
        # training.gff3 = annotations + "##FASTA" separator + sequences
        sh("cat {0} > {1}".format(gffile, newgffile))
        sh('echo "##FASTA" >> {0}'.format(newgffile))
        sh("cat {0} >> {1}".format(fastafile, newgffile))

        logging.debug("Make models ...")
        sh("{0}/bin/maker2zff training.gff3".format(mhome))
        sh("{0}/exe/snap/fathom -categorize 1000 genome.ann genome.dna".format(mhome))
        sh("{0}/exe/snap/fathom -export 1000 -plus uni.ann uni.dna".format(mhome))
        sh("{0}/exe/snap/forge export.ann export.dna".format(mhome))
        sh("{0}/exe/snap/hmm-assembler.pl {1} . > {1}.hmm".format(mhome, species))
    finally:
        # Always return to the starting directory, even if a step raised.
        os.chdir(cwd)

    logging.debug("SNAP matrix written to `{0}/{1}.hmm`".format(snapdir, species))