Example #1
0
File: align.py Project: rrane/jcvi
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filter the BLAST hits. Default is megablast.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    # Returns the name of the BLAST output file.
    parser = OptionParser(blast.__doc__)
    parser.set_align(pctid=None, evalue=.01)
    parser.add_option("--wordsize", type="int",
                      help="Word size [default: %default]")
    parser.add_option("--best", default=1, type="int",
                      help="Only look for best N hits [default: %default]")
    parser.add_option("--task", default="megablast",
                      choices=("blastn", "blastn-short", "dc-megablast",
                               "megablast", "vecscreen"),
                      help="Task of the blastn [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    reffasta, queryfasta = args
    # Output is named <query>.<ref>.blast, where each tag is the part of the
    # basename before its first dot.
    query_tag, ref_tag = [op.basename(x).split(".")[0]
                          for x in (queryfasta, reffasta)]
    blastfile = ".".join((query_tag, ref_tag, "blast"))

    run_megablast(infile=queryfasta, outfile=blastfile, db=reffasta,
                  wordsize=opts.wordsize, pctid=opts.pctid,
                  evalue=opts.evalue, hitlen=None, best=opts.best,
                  task=opts.task, cpus=opts.cpus)

    return blastfile
Example #2
0
def batchoverlap(args):
    """
    %prog batchoverlap pairs.txt outdir

    Check overlaps between pairs of sequences.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required
    # (a file of sequence-name pairs and the folder holding <name>.fa files).
    # Prints one `goldenpath overlap` command per pair to stdout.
    p = OptionParser(batchoverlap.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    pairsfile, outdir = args
    # Read the pairs up front so the handle is closed promptly; blank rows
    # are skipped rather than crashing the tuple unpacking below.
    with open(pairsfile) as fp:
        pairs = [row.split()[:2] for row in fp if row.strip()]

    mkdir("overlaps")
    cmds = []
    for a, b in pairs:
        oa = op.join(outdir, a + ".fa")
        ob = op.join(outdir, b + ".fa")
        cmd = "python -m jcvi.assembly.goldenpath overlap {0} {1}".format(oa, ob)
        cmd += " -o overlaps/{0}_{1}.ov".format(a, b)
        cmds.append(cmd)

    # Single-argument call form prints identically on Python 2 and 3.
    print("\n".join(cmds))
Example #3
0
def filtervcf(args):
    """
    %prog filtervcf NA12878.hg38.vcf.gz

    Filter lobSTR VCF using script shipped in lobSTR. Input file can be a list
    of vcf files.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly one positional is required, either a
    # .vcf/.vcf.gz file or a text file listing one VCF per line.
    p = OptionParser(filtervcf.__doc__)
    p.set_home("lobstr", default="/mnt/software/lobSTR")
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    lhome = opts.lobstr_home
    store = opts.output_path

    if samples.endswith((".vcf", ".vcf.gz")):
        vcffiles = [samples]
    else:
        vcffiles = [x.strip() for x in must_open(samples)]

    # Drop blank list entries and anything that is already a filtered output.
    vcffiles = [x for x in vcffiles if x and ".filtered." not in x]
    if not vcffiles:
        # Nothing to do; Pool(processes=0) would raise ValueError.
        return

    # The store is only passed along for inputs that live on S3.
    run_args = [(x, lhome, x.startswith("s3://") and store) for x in vcffiles]
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # Results are not used; .get() blocks until all jobs finish and
        # re-raises any worker exception.
        pool.map_async(run_filter, run_args).get()
    finally:
        pool.terminate()
        pool.join()
Example #4
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(cufflinks.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    folder, reference = args
    # All globbing and output paths below are relative to `folder`.
    os.chdir(folder)
    for bam in glob("*tophat/accepted_hits.bam"):
        outdir = op.join(op.dirname(bam), "cufflinks")
        if op.exists(outdir):
            # An existing output folder marks this sample as already done.
            logging.debug("Directory {0} found. Skipping.".format(outdir))
            continue

        pieces = ["cufflinks",
                  "-o {0}".format(outdir),
                  "-p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-g {0}".format(opts.gtf))
        pieces.append("--frag-bias-correct {0}".format(reference))
        pieces.append("--multi-read-correct")
        pieces.append("{0}".format(bam))
        sh(" ".join(pieces))
Example #5
0
def blat(args):
    """
    %prog blat ref.fasta query.fasta

    Calls blat and filters BLAST hits.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    # Returns the output file name produced by get_outfile().
    parser = OptionParser(blat.__doc__)
    parser.set_align(pctid=95, hitlen=30)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta, suffix="blat")

    # The query is the input and the reference acts as the database.
    blat_kwargs = dict(infile=queryfasta, outfile=blastfile, db=reffasta,
                       pctid=opts.pctid, hitlen=opts.hitlen, cpus=opts.cpus,
                       overwrite=False)
    run_blat(**blat_kwargs)

    return blastfile
Example #6
0
def filterdata(args):
    """
    %prog filterdata data.bin samples.ids STR.ids allele_freq remove.ids final.ids

    Filter subset of data after dropping remove.ids.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly six positionals are required.
    # Writes final.mask.tsv, filtered.bin and final.data.tsv in the current
    # working directory.
    p = OptionParser(filterdata.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 6:
        sys.exit(not p.print_help())

    binfile, sampleids, strids, af, remove, final = args
    df, m, samples, loci = read_binfile(binfile, sampleids, strids)
    with open(remove) as fp:
        remove = [x.strip() for x in fp]
    removes = set(remove)
    with open(final) as fp:
        final = [x.strip() for x in fp]
    # Every locus must be either dropped or kept.
    assert len(loci) == len(remove) + len(final)

    # Per-locus percentile lookup derived from the allele-frequency file,
    # which has two whitespace-separated fields per row.
    percentiles = {}
    with open(af) as fp:
        for row in fp:
            sname, counts = row.split()
            countsd = af_to_counts(counts)
            percentiles[sname] = counts_to_percentile(countsd)

    # One job per retained locus: (column index, column data, percentile).
    run_args = [(i, m[:, i], percentiles[sname])
                for i, sname in enumerate(loci) if sname not in removes]
    if not run_args:
        # Pool(processes=0) and zip(*[]) would both fail below.
        logging.error("No loci left after dropping remove.ids. Nothing written.")
        return

    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # Sorting restores column order via the leading index of each result.
        res = sorted(pool.map_async(convert_to_percentile, run_args).get())
    finally:
        pool.terminate()
        pool.join()

    # Write mask (P-value) matrix
    ii, pvalues = zip(*res)
    m = np.vstack(pvalues).T
    write_csv("final.mask.tsv", m, samples, final)

    df.drop(remove, inplace=True, axis=1)
    df.columns = final

    # Save a copy of the raw numpy array
    filtered_bin = "filtered.bin"
    # `.values` replaces DataFrame.as_matrix(), which newer pandas removed.
    m = df.values
    m[m < 0] = -1
    m.tofile(filtered_bin)
    logging.debug("Binary matrix written to `{}`".format(filtered_bin))

    # Write data output
    df.to_csv("final.data.tsv", sep="\t", index_label="SampleKey")
Example #7
0
File: cdhit.py Project: rrane/jcvi
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly one positional (the FASTA file).
    # Returns the name of the deduplicated output, <fastafile>.cdhit.
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=98)
    p.add_option("--reads", default=False, action="store_true",
                 help="Use `cd-hit-454` to deduplicate [default: %default]")
    # Help text previously read "[%default: %default]", which rendered as
    # "[False: False]".
    p.add_option("--samestrand", default=False, action="store_true",
                 help="Enforce same strand alignment [default: %default]")
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastafile, = args
    # cd-hit takes identity as a fraction, the option is a percentage.
    identity = opts.pctid / 100.

    cmd = "cd-hit-454" if opts.reads else "cd-hit-est"
    cmd = op.join(opts.cdhit_home, cmd)
    cmd += " -c {0}".format(identity)
    cmd += " -d 0"  # include complete defline
    if opts.samestrand:
        cmd += " -r 0"
    cmd += " -M 0 -T {0} -i {1} -o {1}.cdhit".format(opts.cpus, fastafile)
    sh(cmd)

    dd = fastafile + ".cdhit"
    return dd
Example #8
0
File: sam.py Project: arvin580/jcvi
def count(args):
    """
    %prog count bamfile gtf

    Count the number of reads mapped using `htseq-count`.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(count.__doc__)
    parser.add_option("--type", default="exon",
                      help="Only count feature type")
    parser.set_cpus(cpus=8)
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, gtf = args
    threads = opts.cpus
    # All derived file names share the text before the first dot.
    stem = bamfile.split(".")[0]
    countfile = stem + ".count"
    if not need_update(bamfile, countfile):
        return

    nsorted = stem + "_nsorted"
    nsortedbam = nsorted + ".bam"
    nsortedsam = nsorted + ".sam"
    if need_update(bamfile, nsortedsam):
        # Name-sort the BAM, then dump it (with header) as SAM.
        sh("samtools sort -@ {0} -n {1} {2}".format(threads, bamfile, nsorted))
        sh("samtools view -@ {0} -h {1}".format(threads, nsortedbam),
           outfile=nsortedsam)

    if need_update(nsortedsam, countfile):
        htseq = " ".join(("htseq-count --stranded=no --minaqual=10",
                          "-t {0}".format(opts.type),
                          "{0} {1}".format(nsortedsam, gtf)))
        sh(htseq, outfile=countfile)
Example #9
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly three positionals are required.
    parser = OptionParser(alignextend.__doc__)
    parser.add_option("--nosuffix", default=False, action="store_true",
                      help="Do not add /1/2 suffix to the read [default: %default]")
    parser.set_home("amos")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 3:
        sys.exit(not parser.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    words = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        words.append("-suffix")
    # Skip indexing when the index file is already up to date.
    if not need_update(ref, "{0}.ref.fa.sa".format(pf)):
        words.append("-noindex")
    words.append("-threads {0}".format(opts.cpus))
    # -I is added only when the guessed quality offset is 64.
    if guessoffset([r1]) == 64:
        words.append("-I")
    words.extend((pf, ref, r1, r2))
    sh(" ".join(words))
Example #10
0
File: gmap.py Project: fw1121/jcvi
def align(args):
    """
    %prog align database.fasta read1.fq read2.fq

    Wrapper for `gsnap` single-end or paired-end, depending on the number of
    args.
    """
    from jcvi.formats.fasta import join
    from jcvi.formats.fastq import guessoffset
    from jcvi.projects.tgbs import snp

    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; two (single-end) or three (paired-end)
    # positionals are accepted.
    # Returns the (gsnap output file, log file) pair.
    parser = OptionParser(align.__doc__)
    parser.add_option("--join", default=False, action="store_true",
                      help="Join sequences with padded 50Ns")
    parser.add_option("--rnaseq", default=False, action="store_true",
                      help="Input is RNA-seq reads, turn splicing on")
    parser.add_option("--snp", default=False, action="store_true",
                      help="Call SNPs after GSNAP")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    logging.debug("Single-end alignment" if nargs == 2
                  else "Paired-end alignment")

    dbfile, readfile = args[:2]
    if opts.join:
        dbfile = join([dbfile, "--gapsize=50", "--newid=chr1"])

    assert op.exists(dbfile) and op.exists(readfile)
    prefix = get_prefix(readfile, dbfile)
    logfile = prefix + ".log"
    gsnapfile = prefix + ".gsnap"

    if need_update((dbfile, readfile), gsnapfile):
        dbdir, dbname = check_index(dbfile)
        pieces = ["gsnap -D {0} -d {1}".format(dbdir, dbname),
                  # memory, mismatch, indel penalty, nhits
                  "-B 5 -m 0.1 -i 2 -n 3"]
        if opts.rnaseq:
            pieces.append("-N 1")
        pieces.append("-t {0}".format(opts.cpus))
        pieces.append("--gmap-mode none --nofails")
        if readfile.endswith(".gz"):
            pieces.append("--gunzip")
        try:
            offset = guessoffset([readfile])
        except AssertionError:
            # Best effort: leave the quality protocol flag off entirely.
            pass
        else:
            protocol = "sanger" if offset == 33 else "illumina"
            pieces.append("--quality-protocol {0}".format(protocol))
        # One or two read files, as given on the command line.
        pieces.extend(args[1:])
        sh(" ".join(pieces), outfile=gsnapfile, errfile=logfile)
    else:
        logging.error("`{0}` exists. `gsnap` already run.".format(gsnapfile))

    if opts.snp:
        snp([gsnapfile, "--cpus={0}".format(opts.cpus)])

    return gsnapfile, logfile
Example #11
0
def extract(args):
    """
    %prog extract bamfile contig

    Extract sub-bam for just one contig.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(extract.__doc__)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    bamfile, contig = args
    threads = opts.cpus
    stem = bamfile.split(".")[0]
    # Output is <contig up to first "|">.<bam stem>.bam
    outfile = "{}.{}.bam".format(contig.split("|")[0], stem)
    if op.exists(outfile):
        logging.error("Output name exists: `{}`".format(outfile))
        return

    if need_update(bamfile, outfile):
        view = 'samtools view {} "{}" -@ {}'.format(bamfile, contig, threads)
        sh(view + " -b -o {}".format(outfile))
    index([outfile, "--cpus={}".format(threads)])
Example #12
0
File: cnv.py Project: xuanblo/jcvi
def cib(args):
    """
    %prog cib bamfile samplekey

    Convert BAM to CIB (a binary storage of int8 per base).
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    # One bam_to_cib job is run per reference sequence in the BAM header.
    p = OptionParser(cib.__doc__)
    p.add_option("--prefix", help="Report seqids with this prefix only")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bamfile, samplekey = args
    mkdir(samplekey)
    # Only the header is needed here; the workers receive the file name, so
    # the handle is closed right away.
    bam = pysam.AlignmentFile(bamfile, "rb")
    try:
        refs = list(bam.header["SQ"])
    finally:
        bam.close()

    prefix = opts.prefix
    if prefix:
        refs = [x for x in refs if x["SN"].startswith(prefix)]

    task_args = [(bamfile, r, samplekey) for r in refs]
    if not task_args:
        # Pool(processes=0) would raise ValueError.
        logging.debug("No reference sequences selected. Nothing to do.")
        return

    cpus = min(opts.cpus, len(task_args))
    logging.debug("Use {} cpus".format(cpus))

    pool = Pool(processes=cpus)
    try:
        # Results are discarded; iterating drives the lazy imap to completion.
        for res in pool.imap(bam_to_cib, task_args):
            continue
    finally:
        pool.terminate()
        pool.join()
Example #13
0
File: cnv.py Project: xuanblo/jcvi
def gcn(args):
    """
    %prog gcn gencode.v26.exonunion.bed data/*.vcf.gz

    Compile gene copy number based on CANVAS results.
    """
    # The docstring above is handed to the parser as its usage text (the
    # "njumber" typo it used to show in --help is fixed).
    # args: command-line tokens; an exon BED file followed by one or more
    # VCF files.
    p = OptionParser(gcn.__doc__)
    p.set_cpus()
    p.set_tmpdir(tmpdir="tmp")
    p.set_outfile()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    exonbed = args[0]
    canvasvcfs = args[1:]
    tsvfile = opts.outfile
    tmpdir = opts.tmpdir

    # Make sure the temp folder exists before registering it.
    mkdir(tmpdir)
    set_tempdir(tmpdir)

    df = vcf_to_df(canvasvcfs, exonbed, opts.cpus)
    # One table is written per suffix.
    for suffix in (".avgcn", ".medcn"):
        df_to_tsv(df, tsvfile, suffix)
Example #14
0
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(blat.__doc__)
    parser.add_option("--minscore", default=100, type="int",
                      help="Matches minus mismatches gap penalty [default: %default]")
    parser.add_option("--minid", default=98, type="int",
                      help="Minimum sequence identity [default: %default]")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    oldfasta, newfasta = args
    # Both inputs are converted; only the old one's 2bit is used below.
    oldtwobit, newtwobit = [faToTwoBit(x) for x in args]

    # Prefer the threaded pblat when it is on the PATH.
    if which("pblat"):
        program = "pblat -threads={0}".format(opts.cpus)
    else:
        program = "blat"

    new_tag = op.basename(newfasta).split('.')[0]
    old_tag = op.basename(oldfasta).split('.')[0]
    pslfile = "{0}.{1}.psl".format(new_tag, old_tag)

    cmd = program + " {0} {1}".format(oldtwobit, newfasta)
    cmd += " -tileSize=12 -minScore={0} -minIdentity={1} ".format(
        opts.minscore, opts.minid)
    sh(cmd + pslfile)
Example #15
0
File: tgbs.py Project: fw1121/jcvi
def bam(args):
    """
    %prog bam input.gsnap ref.fasta

    Convert GSNAP output to BAM.
    """
    from jcvi.formats.sizes import Sizes
    from jcvi.formats.sam import index

    # The docstring above is handed to the parser as its usage text; it used
    # to advertise the wrong sub-command name (`snp`).
    # args: command-line tokens; exactly two positionals are required.
    p = OptionParser(bam.__doc__)
    p.set_home("eddyyeh")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    gsnapfile, fastafile = args
    EYHOME = opts.eddyyeh_home
    # Strip only the last extension to build the output prefix.
    pf = gsnapfile.rsplit(".", 1)[0]
    uniqsam = pf + ".unique.sam"
    if need_update((gsnapfile, fastafile), uniqsam):
        cmd = op.join(EYHOME, "gsnap2gff3.pl")
        sizesfile = Sizes(fastafile).filename
        cmd += " --format sam -i {0} -o {1}".format(gsnapfile, uniqsam)
        cmd += " -u -l {0} -p {1}".format(sizesfile, opts.cpus)
        sh(cmd)

    # Hand the SAM to index() from jcvi.formats.sam.
    index([uniqsam])
Example #16
0
File: tgbs.py Project: fw1121/jcvi
def snp(args):
    """
    %prog snp input.gsnap

    Run SNP calling on GSNAP output after apps.gsnap.align().
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly one positional is required.
    parser = OptionParser(snp.__doc__)
    parser.set_home("eddyyeh")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    gsnapfile, = args
    scripts_home = opts.eddyyeh_home
    stem = gsnapfile.rsplit(".", 1)[0]

    # Step 1: gsnap -> native format.
    nativefile = stem + ".native"
    if need_update(gsnapfile, nativefile):
        convert = " ".join((
            op.join(scripts_home, "convert2native.pl"),
            "--gsnap {0} -o {1}".format(gsnapfile, nativefile),
            "-proc {0}".format(opts.cpus),
        ))
        sh(convert)

    # Step 2: native -> SNP calls.
    snpfile = stem + ".snp"
    if need_update(nativefile, snpfile):
        discover = " ".join((
            op.join(scripts_home, "SNPs/SNP_Discovery-short.pl"),
            "--native {0} -o {1}".format(nativefile, snpfile),
            "-a 2 -ac 0.3 -c 0.8",
        ))
        sh(discover)
Example #17
0
File: hic.py Project: xuanblo/jcvi
def density(args):
    """
    %prog density test.clm

    Estimate link density of contigs.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly one positional (the CLM file).
    p = OptionParser(density.__doc__)
    p.add_option("--save", default=False, action="store_true",
                 help="Write log densitites of contigs to file")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clmfile, = args
    clm = CLMFile(clmfile)
    pf = clmfile.rsplit(".", 1)[0]

    if opts.save:
        logdensities = clm.calculate_densities()
        densityfile = pf + ".density"
        # `with` closes the file even if a lookup fails; write() replaces the
        # Python 2-only `print >>` statement with identical output.
        with open(densityfile, "w") as fw:
            for name, logd in logdensities.items():
                s = clm.tig_to_size[name]
                fw.write("\t".join(str(x) for x in (name, s, logd)) + "\n")
        logging.debug("Density written to `{}`".format(densityfile))

    tourfile = pf + ".tour"
    tour = clm.activate(tourfile=tourfile, backuptour=False)
    clm.flip_all(tour)
    clm.flip_whole(tour)
    clm.flip_one(tour)
Example #18
0
def beagle(args):
    """
    %prog beagle input.vcf 1

    Use BEAGLE4.1 to impute vcf on chromosome 1.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(beagle.__doc__)
    parser.set_home("beagle")
    parser.set_ref()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    vcffile, chrom = args
    outpf = vcffile.rsplit(".", 1)[0] + ".beagle"
    outfile = outpf + ".vcf.gz"

    panel = op.join(opts.ref, "1000GP_Phase3")
    # NOTE(review): the thread count is fixed at 16 here; the --cpus option
    # registered above is not consulted.
    cmd = "".join((
        opts.beagle_home,
        " gt={0}".format(vcffile),
        " ref={0}/chr{1}.1kg.phase3.v5a.bref".format(panel, chrom),
        " map={0}/plink.chr{1}.GRCh37.map".format(panel, chrom),
        " out={0}".format(outpf),
        " nthreads=16 gprobs=true",
    ))

    # The command is registered as a make rule and written out, not executed
    # directly from here.
    manager = MakeManager()
    manager.add(vcffile, outfile, cmd)
    manager.write()
Example #19
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; one or more sequence files.
    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int",
                 help="K-mer size [default: %default]")
    p.add_option("--coverage", default=40, type="int",
                 help="Expected sequence coverage [default: %default]")
    p.add_option("--prefix", default="jf",
                 help="Database prefix [default: %default]")
    p.add_option("--nohist", default=False, action="store_true",
                 help="Do not print histogram [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is judged from the first file only.
    gzip = fq.endswith(".gz")

    # Floor division keeps `-s` an integer on Python 3 as well as Python 2.
    hashsize = totalfilesize // coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".
                  format(human_size(totalfilesize,
                                    a_kilobyte_is_1024_bytes=True), hashsize))

    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    # Keep the list for the freshness check; the joined string is only for
    # the shell command.
    fastqlist = " ".join(fastqfiles)

    cmd = "jellyfish count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Stream the decompressed reads through stdin.
        cmd = "gzip -dc {0} | ".format(fastqlist) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqlist

    # Previously the space-joined string was passed here, which is not a
    # file name when several inputs are given.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    cmd = "jellyfish histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)
Example #20
0
def impute(args):
    """
    %prog impute input.vcf hs37d5.fa 1

    Use IMPUTE2 to impute vcf on chromosome 1.
    """
    from pyfaidx import Fasta

    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly three positionals are required.
    # All commands are registered as make rules and written out by
    # MakeManager.write(), not executed directly from here.
    p = OptionParser(impute.__doc__)
    p.set_home("shapeit")
    p.set_home("impute")
    p.set_ref()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    vcffile, fastafile, chrom = args
    mm = MakeManager()
    pf = vcffile.rsplit(".", 1)[0]
    hapsfile = pf + ".haps"
    kg = op.join(opts.ref, "1000GP_Phase3")
    shapeit_phasing(mm, chrom, vcffile, opts)

    fasta = Fasta(fastafile)
    size = len(fasta[chrom])
    binsize = 5000000  # 5Mb bins
    # Number of bins, rounded up so a trailing partial bin is covered.
    bins = size // binsize
    if size % binsize:
        bins += 1
    impute_cmd = op.join(opts.impute_home, "impute2")
    # These do not depend on the chunk, so build them once.
    mapfile = "{0}/genetic_map_chr{1}_combined_b37.txt".format(kg, chrom)
    rpf = "{0}/1000GP_Phase3_chr{1}".format(kg, chrom)
    chunks = []
    # `bins` is already rounded up; iterating bins + 1 times produced one
    # extra chunk whose start lay beyond the end of the chromosome.
    for x in range(bins):
        chunk_start = x * binsize + 1
        chunk_end = min(chunk_start + binsize - 1, size)
        outfile = pf + ".chunk{0:02d}.impute2".format(x)
        cmd = impute_cmd + " -m {0}".format(mapfile)
        cmd += " -known_haps_g {0}".format(hapsfile)
        cmd += " -h {0}.hap.gz -l {0}.legend.gz".format(rpf)
        cmd += " -Ne 20000 -int {0} {1}".format(chunk_start, chunk_end)
        cmd += " -o {0} -allow_large_regions -seed 367946".format(outfile)
        # Touch the target after a successful run.
        cmd += " && touch {0}".format(outfile)
        mm.add(hapsfile, outfile, cmd)
        chunks.append(outfile)

    # Combine all the files
    imputefile = pf + ".impute2"
    cmd = "cat {0} > {1}".format(" ".join(chunks), imputefile)
    mm.add(chunks, imputefile, cmd)

    # Convert to vcf
    vcffile = pf + ".impute2.vcf"
    cmd = "python -m jcvi.formats.vcf fromimpute2 {0} {1} {2} > {3}".\
                format(imputefile, fastafile, chrom, vcffile)
    mm.add(imputefile, vcffile, cmd)
    mm.write()
Example #21
0
def last(args):
    """
    %prog last database.fasta query.fasta

    Run LAST by calling LASTDB and LASTAL. LAST program available:
    <http://last.cbrc.jp>

    Works with LAST-719.
    """
    # The docstring above is handed to the parser as its usage text; the
    # sub-command name was missing from the usage line.
    # args: command-line tokens; exactly two positionals are required.
    p = OptionParser(last.__doc__)
    p.add_option("--path", help="Specify LAST path")
    p.add_option("--mask", default=False, action="store_true", help="Invoke -c in lastdb")
    p.add_option("--format", default="BlastTab",
                 choices=("TAB", "MAF", "BlastTab", "BlastTab+"),
                 help="Output format")
    p.add_option("--minlen", default=0, type="int",
                 help="Filter alignments by how many bases match")
    p.add_option("--minid", default=0, type="int", help="Minimum sequence identity")
    p.set_cpus()
    p.set_params()

    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    subject, query = args
    path = opts.path
    cpus = opts.cpus
    # Resolve binaries under --path when given, else rely on the PATH.
    getpath = lambda x: op.join(path, x) if path else x
    lastdb_bin = getpath("lastdb")
    lastal_bin = getpath("lastal")

    subjectdb = subject.rsplit(".", 1)[0]
    run_lastdb(infile=subject, outfile=subjectdb + ".prj", mask=opts.mask,
               lastdb_bin=lastdb_bin)

    u = 2 if opts.mask else 0
    cmd = "{0} -u {1}".format(lastal_bin, u)
    cmd += " -P {0} -i3G".format(cpus)
    cmd += " -f {0}".format(opts.format)
    cmd += " {0} {1}".format(subjectdb, query)

    minlen = opts.minlen
    minid = opts.minid
    extra = opts.extra
    # minid == 100 would divide by zero below.
    assert minid != 100, "Perfect match not yet supported"
    # Floor division keeps the penalty an integer on Python 3 as well as
    # Python 2.
    mm = minid // (100 - minid)

    if minlen:
        extra += " -e{0}".format(minlen)
    if minid:
        extra += " -r1 -q{0} -a{0} -b{0}".format(mm)
    if extra:
        cmd += " " + extra.strip()

    lastfile = get_outfile(subject, query, suffix="last")
    sh(cmd, outfile=lastfile)
Example #22
0
File: sam.py Project: arvin580/jcvi
def index(args):
    """
    %prog index samfile/bamfile

    If SAM file, convert to BAM, sort and then index, using SAMTOOLS
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly one positional (SAM or BAM file).
    # Returns the name of the sorted BAM file.
    p = OptionParser(index.__doc__)
    p.add_option("--fasta", dest="fasta", default=None,
            help="add @SQ header to the BAM file [default: %default]")
    p.add_option("--unique", default=False, action="store_true",
            help="only retain uniquely mapped reads [default: %default]")
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 1:
        # `not` makes the exit status non-zero on a usage error, matching the
        # other commands.
        sys.exit(not p.print_help())

    samfile, = args
    cpus = opts.cpus
    fastafile = opts.fasta
    if fastafile:
        assert op.exists(fastafile)

    # Swap only the trailing extension; str.replace() would also rewrite a
    # ".sam" found elsewhere in the path (e.g. "x.sample.bam").
    is_sam = samfile.endswith(".sam")
    bamfile = samfile[:-len(".sam")] + ".bam" if is_sam else samfile
    if fastafile:
        faifile = fastafile + ".fai"
        if need_update(fastafile, faifile):
            sh("samtools faidx {0}".format(fastafile))
        cmd = "samtools view -bt {0} {1} -F 4 -o {2}".\
                format(faifile, samfile, bamfile)
    else:
        cmd = "samtools view -bS {0} -F 4 -o {1}".\
                format(samfile, bamfile)

    cmd += " -@ {0}".format(cpus)
    if opts.unique:
        cmd += " -q 1"

    if is_sam and need_update(samfile, bamfile):
        sh(cmd)

    # Already sorted?
    if bamfile.endswith(".sorted.bam"):
        sortedbamfile = bamfile
    else:
        if bamfile.endswith(".bam"):
            prefix = bamfile[:-len(".bam")]
        else:
            prefix = bamfile
        sortedbamfile = prefix + ".sorted.bam"

        # Sorting is attempted only in this branch; `prefix` does not exist
        # for an already-sorted input, which used to risk a NameError.
        if need_update(bamfile, sortedbamfile):
            cmd = "samtools sort {0} {1}.sorted".format(bamfile, prefix)
            cmd += " -@ {0}".format(cpus)
            sh(cmd)

    baifile = sortedbamfile + ".bai"
    if need_update(sortedbamfile, baifile):
        sh("samtools index {0}".format(sortedbamfile))

    return sortedbamfile
Example #23
0
def mito(args):
    """
    %prog mito chrM.fa input.bam

    Identify mitochondrial deletions.
    """
    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required. The
    # second is a .bam file or a text file listing one BAM per line.
    p = OptionParser(mito.__doc__)
    p.set_aws_opts(store="hli-mv-data-science/htang/mito-deletions")
    p.add_option("--realignonly", default=False, action="store_true",
                 help="Realign only")
    p.add_option("--svonly", default=False, action="store_true",
                 help="Run Realign => SV calls only")
    p.add_option("--support", default=1, type="int",
                 help="Minimum number of supporting reads")
    p.set_home("speedseq", default="/mnt/software/speedseq/bin")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    chrMfa, bamfile = args
    store = opts.output_path
    cleanup = not opts.nocleanup

    if not op.exists(chrMfa):
        logging.debug("File `{}` missing. Exiting.".format(chrMfa))
        return

    chrMfai = chrMfa + ".fai"
    if not op.exists(chrMfai):
        # A FASTA .fai index is produced by `faidx`; `samtools index` is for
        # alignment files.
        cmd = "samtools faidx {}".format(chrMfa)
        sh(cmd)

    if bamfile.endswith(".bam"):
        bamfiles = [bamfile]
    else:
        # Read the list, closing the handle and ignoring blank lines.
        with open(bamfile) as fp:
            bamfiles = [x.strip() for x in fp if x.strip()]

    if store:
        # Samples with a .depth file in the store count as already computed.
        computed = set(op.basename(x).split('.')[0] for x in ls_s3(store)
                       if x.endswith(".depth"))
        remaining_samples = [x for x in bamfiles
                             if op.basename(x).split(".")[0] not in computed]

        logging.debug("Already computed on `{}`: {}".
                      format(store, len(bamfiles) - len(remaining_samples)))
        bamfiles = remaining_samples

    logging.debug("Total samples: {}".format(len(bamfiles)))

    for bamfile in bamfiles:
        run_mito(chrMfa, bamfile, opts,
                 realignonly=opts.realignonly,
                 svonly=opts.svonly,
                 store=store, cleanup=cleanup)
Example #24
0
def tophat(args):
    """
    %prog tophat folder reference

    Run tophat on a folder of reads.
    """
    from jcvi.apps.bowtie import check_index
    from jcvi.formats.fastq import guessoffset

    # The docstring above is handed to the parser as its usage text.
    # args: command-line tokens; exactly two positionals are required.
    parser = OptionParser(tophat.__doc__)
    parser.add_option("--gtf", help="Reference annotation [default: %default]")
    parser.add_option("--single", default=False, action="store_true",
                      help="Single end mapping")
    parser.add_option("--intron", default=15000, type="int",
                      help="Max intron size [default: %default]")
    parser.add_option("--dist", default=-50, type="int",
                      help="Mate inner distance [default: %default]")
    parser.add_option("--stdev", default=50, type="int",
                      help="Mate standard deviation [default: %default]")
    parser.set_phred()
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    paired = not opts.single
    folder, reference = args
    reference = check_index(reference)
    for reads, prefix in iter_project(folder, n=2 if paired else 1):
        outdir = "{0}_tophat".format(prefix)
        outfile = op.join(outdir, "accepted_hits.bam")
        if op.exists(outfile):
            # An existing BAM marks this sample as already mapped.
            logging.debug("File `{0}` found. Skipping.".format(outfile))
            continue

        pieces = ["tophat -p {0}".format(opts.cpus)]
        if opts.gtf:
            pieces.append("-G {0}".format(opts.gtf))
        pieces.append("-o {0}".format(outdir))

        if paired:
            # Unpacking enforces exactly two read files.
            first, _second = reads
            pieces.append("--max-intron-length {0}".format(opts.intron))
            pieces.append("--mate-inner-dist {0}".format(opts.dist))
            pieces.append("--mate-std-dev {0}".format(opts.stdev))
        else:
            # Unpacking enforces exactly one read file.
            first, = reads

        # Fall back to guessing the offset from the first read file.
        phred = opts.phred or str(guessoffset([first]))
        if phred == "64":
            pieces.append("--phred64-quals")
        pieces.append("{0} {1}".format(reference, " ".join(reads)))

        sh(" ".join(pieces))
Example #25
0
def prepare(args):
    """
    %prog prepare genomesize *.fastq

    Prepare MERACULOUS configuation file. Genome size should be entered in Mb.
    """
    # The docstring above (plus FastqNamings) is handed to the parser as its
    # usage text.
    # args: command-line tokens; a genome size followed by one or more files.
    # Writes meraculous.config and run.sh.
    parser = OptionParser(prepare.__doc__ + FastqNamings)
    parser.add_option("-K", default=51, type="int", help="K-mer size")
    parser.set_cpus(cpus=32)
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    # The size given on the command line is divided by 1000 for the config.
    genomesize = float(args[0]) / 1000
    fnames = args[1:]
    for fname in fnames:
        assert op.exists(fname), "File `{0}` not found.".format(fname)

    sections = [
        comment_banner("Meraculous params file") + "\n",
        comment_banner("Basic parameters") + "\n",
        "# Describe the libraries ( one line per library )\n",
        "# " + " ".join(header.split()) + "\n",
    ]

    lib_seqs = []
    for lib, fs in get_libs(fnames):
        if lib.size == 0:
            continue
        # Rank counts only the libraries kept so far, starting at 1.
        rank = len(lib_seqs) + 1
        library_name = lib.library_name
        name = library_name.replace("-", "")
        wildcard = "{0}*.1.*,{0}*.2.*".format(library_name)
        # Longest read length among this library's files.
        rl = max(readlen([x]) for x in fs)
        lib_seqs.append(lib.get_lib_seq(wildcard, name, rl, rank))

    sections.append("\n" + "\n".join(load_csv(None, lib_seqs, sep=" ")) + "\n")

    params = [("genome_size", genomesize),
              ("is_diploid", 0),
              ("mer_size", opts.K),
              ("num_prefix_blocks", 1),
              ("no_read_validation", 0),
              ("local_num_procs", opts.cpus)]
    sections.append("\n" + "\n".join(load_csv(None, params, sep=" ")) + "\n")

    cfgfile = "meraculous.config"
    write_file(cfgfile, "".join(sections), tee=True)

    launcher = "~/export/meraculous/bin/run_meraculous.sh -c {0}".format(cfgfile)
    write_file("run.sh", launcher)
Example #26
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    p = OptionParser(deduplicate.__doc__)
    p.set_align(pctid=96, pctcov=0)
    # All four switches are plain boolean flags that default to off
    for flag, helptext in (
        ("--fast", "Place sequence in the first cluster"),
        ("--consensus", "Compute consensus sequences"),
        ("--reads", "Use `cd-hit-454` to deduplicate [default: %default]"),
        ("--samestrand", "Enforce same strand alignment"),
    ):
        p.add_option(flag, default=False, action="store_true", help=helptext)
    p.set_home("cdhit")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    # Percent identity is passed to cd-hit as a fraction
    identity = opts.pctid / 100.
    # Only the FASTA path of the returned pair is used below
    fastafile, _ = fasta([args[0], "--seqtk"])

    program = "cd-hit-454" if opts.reads else "cd-hit-est"
    pieces = [op.join(opts.cdhit_home, program), " -c {0}".format(identity)]
    if not opts.reads:
        pieces.append(" -d 0")  # include complete defline
        if opts.samestrand:
            pieces.append(" -r 0")
    if not opts.fast:
        pieces.append(" -g 1")
    if opts.pctcov != 0:
        pieces.append(" -aL {0} -aS {0}".format(opts.pctcov / 100.))

    dd = fastafile + ".P{0}.cdhit".format(opts.pctid)
    clstr = dd + ".clstr"
    pieces.append(" -M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd))

    # Re-run only when the outputs are out of date with the input
    if need_update(fastafile, (dd, clstr)):
        sh("".join(pieces))

    if opts.consensus:
        cons = dd + ".consensus"
        conscmd = op.join(opts.cdhit_home, "cdhit-cluster-consensus") + \
            " clustfile={0} fastafile={1} output={2} maxlen=1".format(
                clstr, fastafile, cons)
        if need_update((clstr, fastafile), cons):
            sh(conscmd)

    return dd
Example #27
0
def blasr(args):
    """
    %prog blasr ref.fasta fofn

    Run blasr on a set of PacBio reads. This is based on a divide-and-conquer
    strategy described below.
    """
    from jcvi.apps.grid import MakeManager
    from jcvi.utils.iter import grouper

    p = OptionParser(blasr.__doc__)
    p.set_cpus(cpus=8)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, fofn = args
    # Read the file-of-filenames inside a context manager so the handle is
    # released even if reading fails.
    with open(fofn) as fp:
        flist = sorted(x.strip() for x in fp)

    h5list = []
    mm = MakeManager()
    # Input files are aligned three at a time, one pbalign job per chunk
    for i, fl in enumerate(grouper(flist, 3)):
        chunkname = "chunk{0:03d}".format(i)
        fn = chunkname + ".fofn"
        h5 = chunkname + ".cmp.h5"
        # NOTE(review): grouper presumably pads the last group with None when
        # the list length is not a multiple of 3 -- confirm. Padding entries
        # are dropped here so the join cannot fail on them.
        members = [x for x in fl if x is not None]
        with open(fn, "w") as fw:
            fw.write("\n".join(members) + "\n")

        cmd = "pbalign {0} {1} {2}".format(fn, reffasta, h5)
        cmd += " --nproc {0} --forQuiver --tmpDir .".format(opts.cpus)
        mm.add((fn, reffasta), h5, cmd)
        h5list.append(h5)

    # Merge h5, sort and repack
    allh5 = "all.cmp.h5"
    tmph5 = "tmp.cmp.h5"
    cmd_merge = "cmph5tools.py merge --outFile {0}".format(allh5)
    cmd_merge += " " + " ".join(h5list)
    cmd_sort = "cmph5tools.py sort --deep {0} --tmpDir .".format(allh5)
    cmd_repack = "h5repack -f GZIP=1 {0} {1}".format(allh5, tmph5)
    cmd_repack += " && mv {0} {1}".format(tmph5, allh5)
    mm.add(h5list, allh5, [cmd_merge, cmd_sort, cmd_repack])

    # Quiver (its thread count is fixed at 32 here, independent of --cpus)
    pf = reffasta.rsplit(".", 1)[0]
    variantsgff = pf + ".variants.gff"
    consensusfasta = pf + ".consensus.fasta"
    cmd_faidx = "samtools faidx {0}".format(reffasta)
    cmd = "quiver -j 32 {0}".format(allh5)
    cmd += " -r {0} -o {1} -o {2}".format(reffasta, variantsgff, consensusfasta)
    mm.add(allh5, consensusfasta, [cmd_faidx, cmd])

    mm.write()
Example #28
0
File: hic.py Project: xuanblo/jcvi
def optimize(args):
    """
    %prog optimize test.clm

    Optimize the contig order and orientation, based on CLM file.
    """
    p = OptionParser(optimize.__doc__)
    p.add_option("--skiprecover", default=False, action="store_true",
                 help="Do not import 'recover' contigs")
    p.add_option("--startover", default=False, action="store_true",
                 help="Do not resume from existing tour file")
    p.add_option("--skipGA", default=False, action="store_true",
                 help="Skip GA step")
    p.set_outfile(outfile=None)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    clmfile, = args
    startover = opts.startover
    runGA = not opts.skipGA
    cpus = opts.cpus

    # Load contact map
    clm = CLMFile(clmfile, skiprecover=opts.skiprecover)

    # Output path: explicit --outfile, else <clm prefix>.tour
    tourfile = opts.outfile or clmfile.rsplit(".", 1)[0] + ".tour"
    # With --startover no existing tour is handed to activate(), but the
    # output path must survive: previously `tourfile` itself was set to None
    # and the subsequent open(None, "w") raised TypeError.
    tour = clm.activate(tourfile=None if startover else tourfile)

    # The tour log is opened only after activate() had its chance to read it;
    # the context manager closes it even if an optimization step fails.
    with open(tourfile, "w") as fwtour:
        # Store INIT tour
        print_tour(fwtour, clm.tour, "INIT",
                   clm.active_contigs, clm.oo, signs=clm.signs)

        if runGA:
            for phase in range(1, 3):
                tour = optimize_ordering(fwtour, clm, phase, cpus)
                tour = clm.prune_tour(tour, cpus)

        # Flip orientations until a round reports REJECT for both tags
        phase = 1
        while True:
            tag1, tag2 = optimize_orientations(fwtour, clm, phase, cpus)
            if tag1 == REJECT and tag2 == REJECT:
                logging.debug("Terminating ... no more {}".format(ACCEPT))
                break
            phase += 1
Example #29
0
def cluster(args):
    """
    %prog cluster prefix fastqfiles

    Use `vsearch` to remove duplicate reads. This routine is heavily influenced
    by PyRAD: <https://github.com/dereneaton/pyrad>.
    """
    p = OptionParser(cluster.__doc__)
    add_consensus_options(p)
    p.set_align(pctid=95)
    p.set_outdir()
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    prefix, fastqfiles = args[0], args[1:]
    cpus, pctid = opts.cpus, opts.pctid
    mindepth, minlength = opts.mindepth, opts.minlength

    # Convert the reads to FASTA; only the FASTA path of the pair is used
    conversion_args = fastqfiles + [
        "--seqtk",
        "--outdir={0}".format(opts.outdir),
        "--outfile={0}".format(prefix + ".fasta"),
    ]
    fastafile, _ = fasta(conversion_args)

    # Every later product lives in the output folder; those that depend on
    # the identity cutoff carry a .P<pctid> tag.
    prefix = op.join(opts.outdir, prefix)
    tagged = prefix + ".P{0}".format(pctid)
    derepfile = prefix + ".derep"
    userfile = tagged + ".u"
    notmatchedfile = tagged + ".notmatched"
    clustfile = tagged + ".clust"
    clustSfile = tagged + ".clustS"
    statsfile = tagged + ".stats"

    # Each stage is skipped when its output is already up to date
    if need_update(fastafile, derepfile):
        derep(fastafile, derepfile, minlength, cpus)

    if need_update(derepfile, userfile):
        cluster_smallmem(derepfile, userfile, notmatchedfile,
                         minlength, pctid, cpus)

    if need_update((derepfile, userfile, notmatchedfile), clustfile):
        makeclust(derepfile, userfile, notmatchedfile, clustfile,
                  mindepth=mindepth)

    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus)

    if need_update(clustSfile, statsfile):
        makestats(clustSfile, statsfile, mindepth=mindepth)
Example #30
0
def cufflinks(args):
    """
    %prog cufflinks folder reference

    Run cufflinks on a folder containing tophat results.
    """
    p = OptionParser(cufflinks.__doc__)
    p.add_option("--gtf", help="Reference annotation [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    folder, reference = args
    cpus, gtf = opts.cpus, opts.gtf
    transcripts = "transcripts.gtf"
    # Optional annotation switch shared by the cufflinks and cuffmerge calls
    annotation = [" -g {0}".format(gtf)] if gtf else []

    mm = MakeManager()
    gtfs = []
    # One cufflinks job per BAM, each writing into <sample>_cufflinks/
    for bam in iglob(folder, "*.bam"):
        sample = op.basename(bam).split(".")[0]
        outdir = sample + "_cufflinks"
        pieces = ["cufflinks",
                  " -o {0}".format(outdir),
                  " -p {0}".format(cpus)]
        pieces += annotation
        pieces += [" --frag-bias-correct {0}".format(reference),
                   " --multi-read-correct",
                   " {0}".format(bam)]
        cgtf = op.join(outdir, transcripts)
        mm.add(bam, cgtf, "".join(pieces))
        gtfs.append(cgtf)

    # Collect the per-sample assemblies into a list file
    assemblylist = "assembly_list.txt"
    mm.add(gtfs, assemblylist,
           'find . -name "{0}" > {1}'.format(transcripts, assemblylist))

    # Merge all assemblies
    mergedgtf = "merged/merged.gtf"
    pieces = ["cuffmerge", " -o merged", " -p {0}".format(cpus)]
    pieces += annotation
    pieces += [" -s {0}".format(reference), " {0}".format(assemblylist)]
    mm.add(assemblylist, mergedgtf, "".join(pieces))

    mm.write()
Example #31
0
def mcluster(args):
    """
    %prog mcluster *.consensus

    Cluster across samples using consensus sequences.
    """
    p = OptionParser(mcluster.__doc__)
    add_consensus_options(p)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    consensusfiles = args
    minlength, cpus = opts.minlength, opts.cpus
    pctid = find_pctid(consensusfiles)
    pf = opts.prefix + ".P{0}".format(pctid)

    consensusfile = pf + ".consensus.fasta"
    userfile = pf + ".u"
    notmatchedfile = pf + ".notmatched"
    clustfile = pf + ".clust"
    clustSfile = pf + ".clustS"

    # Pool all per-sample consensus records into one FASTA, prefixing each
    # record name with the part of the file name before the first dot.
    if need_update(consensusfiles, consensusfile):
        fw_cons = must_open(consensusfile, "w")
        totalseqs = 0
        for cf in consensusfiles:
            sample = op.basename(cf).split(".")[0]
            nseqs = 0
            for name, seq in parse_fasta(cf):
                fw_cons.write(
                    ">{0}\n{1}\n".format(".".join((sample, name)), seq))
                nseqs += 1
            logging.debug("Read `{0}`: {1} seqs".format(cf, nseqs))
            totalseqs += nseqs
        logging.debug("Total: {0} seqs".format(totalseqs))
        fw_cons.close()

    # Remaining stages each run only when their output is stale
    if need_update(consensusfile, userfile):
        cluster_smallmem(consensusfile, userfile, notmatchedfile, minlength,
                         pctid, cpus)

    if need_update((consensusfile, userfile, notmatchedfile), clustfile):
        makeclust(consensusfile, userfile, notmatchedfile, clustfile)

    if need_update(clustfile, clustSfile):
        parallel_musclewrap(clustfile, cpus, minsamp=opts.minsamp)
Example #32
0
File: str.py Project: qiao-xin/jcvi
def compilevcf(args):
    """
    %prog compilevcf samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compilevcf.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.add_option("--stutter", default=False, action="store_true",
                 help="Count stutter reads on chrY")
    p.add_option("--nofilter", default=False, action="store_true",
                 help="Do not filter the variants")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    samples, = args
    workdir = opts.workdir
    store = opts.output_path
    stutter = opts.stutter
    cleanup = not opts.nocleanup
    filtered = not opts.nofilter
    dbs = opts.db.split(",")
    # Remember the launch directory: the sample list path is resolved
    # against it after changing into the work directory.
    cwd = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(cwd, samples)

    stridsfile = "STR.ids"
    vcffiles = [x.strip() for x in must_open(samples)]
    # Build the combined, de-duplicated id list once; reuse it if present
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        with open(stridsfile, "w") as fw:
            fw.write("\n".join(uids) + "\n")

    run_args = [(x, filtered, cleanup, store, stutter) for x in vcffiles]
    if not run_args:
        # An empty list would make the worker count 0, which Pool rejects
        # with ValueError; report the real problem instead.
        logging.error("No VCF files listed in `{}`".format(samples))
        return

    # Never start more workers than there are files to process
    cpus = min(opts.cpus, len(run_args))
    pool = Pool(processes=cpus)
    try:
        # map_async(...).get() blocks until every file is processed and
        # re-raises any worker exception.
        pool.map_async(run_compile, run_args).get()
    finally:
        # Release the worker processes whether or not the run succeeded
        pool.terminate()
        pool.join()
Example #33
0
def align(args):
    """
    %prog align clustfile

    Align clustfile to clustSfile. Useful for benchmarking aligners.
    """
    parser = OptionParser(align.__doc__)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    # Exactly one positional argument is accepted; otherwise show usage
    nargs = len(args)
    if nargs != 1:
        sys.exit(not parser.print_help())

    # Hand the cluster file to the parallel aligner wrapper
    parallel_musclewrap(args[0], opts.cpus)
Example #34
0
def mappability(args):
    """
    %prog mappability reference.fasta

    Generate 50mer mappability for reference genome. Commands are based on gem
    mapper. See instructions:
    <https://github.com/xuefzhao/Reference.Mappability>
    """
    p = OptionParser(mappability.__doc__)
    p.add_option("--mer", default=50, type="int", help="User mer size")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    ref = args[0]
    K = opts.mer
    pf = ref.rsplit(".", 1)[0]

    # All intermediate file names derive from the reference prefix and K
    gem = pf + ".gem"
    mer = pf + ".{}mer".format(K)
    mapb = mer + ".mappability"
    wig = mer + ".wig"
    bw = mer + ".bw"
    bg = mer + ".bedGraph"
    merged = mer + ".filtered-1.merge.bed"

    # (source, target, command) for each step, in dependency order
    pipeline = [
        (ref, gem,
         "gem-indexer -i {} -o {}".format(ref, pf)),
        (gem, mapb,
         "gem-mappability -I {} -l {} -o {} -T {}".format(
             gem, K, mer, opts.cpus)),
        (mapb, wig,
         "gem-2-wig -I {} -i {} -o {}".format(gem, mapb, mer)),
        (wig, bw,
         "wigToBigWig {} {}.sizes {}".format(wig, mer, bw)),
        (bw, bg,
         "bigWigToBedGraph {} {}".format(bw, bg)),
        (bg, merged,
         "python -m jcvi.formats.bed filterbedgraph {} 1".format(bg)),
    ]

    mm = MakeManager()
    for source, target, command in pipeline:
        mm.add(source, target, command)
    mm.write()
Example #35
0
def layout(args):
    """
    %prog layout query.subject.simple query.seqids subject.seqids

    Compute optimal seqids order in a second genome, based on seqids on one
    genome, given the pairwise blocks in .simple format.
    """
    from jcvi.algorithms.ec import GA_setup, GA_run

    p = OptionParser(layout.__doc__)
    p.set_beds()
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    simplefile, qseqids, sseqids = args
    qbed, sbed, qorder, sorder, is_self = check_beds(simplefile, p, opts)

    # Both seqid arguments are comma-separated lists; index each id by its
    # position in the list.
    qseqids = qseqids.strip().split(",")
    sseqids = sseqids.strip().split(",")
    qseqids_ii = dict((s, i) for i, s in enumerate(qseqids))
    sseqids_ii = dict((s, i) for i, s in enumerate(sseqids))

    # Sum block scores per (subject seqid position, query seqid) pair
    blocks = SimpleFile(simplefile).blocks
    scores = defaultdict(int)
    for a, b, c, d, score, orientation, hl in blocks:
        qi, q = qorder[a]
        si, s = sorder[c]
        qseqid, sseqid = q.seqid, s.seqid
        # Constant-time dict lookup; it has the same keys as the sseqids list
        # that was previously scanned linearly for every block.
        if sseqid not in sseqids_ii:
            continue
        scores[sseqids_ii[sseqid], qseqid] += score

    # Keep only pairs whose query seqid was requested, ordered by subject
    # position, as (query position, score) records.
    data = [(qseqids_ii[b], score)
            for (a, b), score in sorted(scores.items())
            if b in qseqids_ii]

    # Start from the identity ordering; materialized as a list so the tour
    # is a real sequence on Python 3 as well.
    tour = list(range(len(qseqids)))
    toolbox = GA_setup(tour)
    toolbox.register("evaluate", colinear_evaluate_weights, data=data)
    tour, fitness = GA_run(toolbox, ngen=100, npop=100, cpus=opts.cpus)
    tour = [qseqids[x] for x in tour]

    # Single-argument call form prints identically on Python 2 and 3
    print(",".join(tour))
Example #36
0
def bes(args):
    """
    %prog bes bacfasta clonename

    Use the clone name to download BES gss sequences from Genbank, map and then
    visualize.
    """
    from jcvi.apps.align import run_blat

    p = OptionParser(bes.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    bacfasta, clonename = args

    # Download the end sequences, then map them onto the BAC
    entrez([clonename, "--database=nucgss", "--skipcheck"])
    besfasta = clonename + ".fasta"
    blatfile = clonename + ".bes.blat"
    run_blat(infile=besfasta, outfile=blatfile, db=bacfasta,
             pctid=95, hitlen=100, cpus=opts.cpus)

    # Only the first (id, size) record of the BAC FASTA is used. The next()
    # builtin works on Python 2.6+ and 3, unlike the .next() method.
    aid, asize = next(Fasta(bacfasta).itersizes())

    # Ruler line standing in for the whole BAC
    width = 50
    sys.stderr.write("=" * width + "  " + aid + "\n")

    ratio = width * 1. / asize

    def scale(x):
        # Project a base coordinate onto the text ruler
        return int(round(x * ratio, 0))

    # Read all hits up front so the file handle is closed deterministically
    with open(blatfile) as fp:
        blasts = [BlastLine(x) for x in fp]

    for b in blasts:
        forward = b.orientation == '+'
        if forward:
            msg = " " * scale(b.sstart) + "->"
        else:
            msg = " " * (scale(b.sstop) - 2) + "<-"
        # Pad so that the read name lands to the right of the ruler
        msg += " " * (width - len(msg) + 2)
        msg += b.query
        # Overhang: bases between the hit and the end of the BAC it faces
        if forward:
            msg += " (hang={0})".format(b.sstart - 1)
        else:
            msg += " (hang={0})".format(asize - b.sstop)

        sys.stderr.write(msg + "\n")
Example #37
0
def alignextend(args):
    """
    %prog alignextend ref.fasta read.1.fastq read.2.fastq

    Wrapper around AMOS alignextend.
    """
    stages = "prepare,align,filter,rmdup,genreads".split(",")
    p = OptionParser(alignextend.__doc__)
    p.add_option("--nosuffix", default=False, action="store_true",
                 help="Do not add /1/2 suffix to the read [default: %default]")
    p.add_option("--rc", default=False, action="store_true",
                 help="Reverse complement the reads before alignment")
    p.add_option("--len", default=100, type="int",
                 help="Extend to this length")
    p.add_option("--stage", default="prepare", choices=stages,
                 help="Start from certain stage")
    p.add_option("--dup", default=10, type="int",
                 help="Filter duplicates with coordinates within this distance")
    p.add_option("--maxdiff", default=1, type="int",
                 help="Maximum number of differences")
    p.set_home("amos")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    ref, r1, r2 = args
    pf = op.basename(r1).split(".")[0]

    # The command line is collected piece by piece and joined once at the end
    pieces = [op.join(opts.amos_home, "src/Experimental/alignextend.pl")]
    if not opts.nosuffix:
        pieces.append(" -suffix")
    # Skip indexing when the index file is already newer than the reference
    bwa_idx = "{0}.ref.fa.sa".format(pf)
    if not need_update(ref, bwa_idx):
        pieces.append(" -noindex")
    pieces.append(" -threads {0}".format(opts.cpus))
    # -I is added when the first read file is detected as offset 64
    if guessoffset([r1]) == 64:
        pieces.append(" -I")
    if opts.rc:
        pieces.append(" -rc")
    pieces.append(" -allow -len {0} -dup {1}".format(opts.len, opts.dup))
    # The -min/-max bounds are 2x and 20x the extension length
    pieces.append(" -min {0} -max {1}".format(2 * opts.len, 20 * opts.len))
    pieces.append(" -maxdiff {0}".format(opts.maxdiff))
    pieces.append(" -stage {0}".format(opts.stage))
    pieces.extend(" " + positional for positional in (pf, ref, r1, r2))
    sh("".join(pieces))
Example #38
0
def gmap(args):
    """
    %prog gmap database.fasta fastafile

    Wrapper for `gmap`.
    """
    p = OptionParser(gmap.__doc__)
    p.add_option("--cross", default=False, action="store_true",
                 help="Cross-species alignment")
    p.add_option("--npaths", default=0, type="int",
                 help="Maximum number of paths to show."
                 " If set to 0, prints two paths if chimera"
                 " detected, else one.")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    dbfile, fastafile = args
    assert op.exists(dbfile) and op.exists(fastafile)
    prefix = get_prefix(fastafile, dbfile)
    logfile = prefix + ".log"
    gmapfile = prefix + ".gmap.gff3"

    # Nothing to do when the output is already newer than both inputs; the
    # expected file names are still returned.
    if not need_update((dbfile, fastafile), gmapfile):
        logging.error("`{0}` exists. `gmap` already run.".format(gmapfile))
        return gmapfile, logfile

    dbdir, dbname = check_index(dbfile)
    pieces = [
        "gmap -D {0} -d {1}".format(dbdir, dbname),
        " -f 2 --intronlength=100000",  # Output format 2
        " -t {0}".format(opts.cpus),
        " --npaths {0}".format(opts.npaths),
    ]
    if opts.cross:
        pieces.append(" --cross-species")
    pieces.append(" " + fastafile)

    # stdout is captured as the GFF3, stderr as the log
    sh("".join(pieces), outfile=gmapfile, errfile=logfile)

    return gmapfile, logfile
Example #39
0
def clean(args):
    """
    %prog clean 1.fastq 2.fastq [insertsize]

    Clean and dedup paired FASTQ files.
    """
    p = OptionParser(clean.__doc__)
    p.add_option("-a", default=0, type="int",
                 help="Trim length at 5' end [default: %default]")
    p.add_option("-b", default=50, type="int",
                 help="Trim length at 3' end [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not p.print_help())

    p1, p2 = args[:2]
    # Insert size: explicit third argument, otherwise looked up from the
    # first read file
    size = int(args[2]) if nargs == 3 else get_size(p1)

    pf = p1.split(".")[0]
    cpus = opts.cpus

    offset = guessoffset([p1])
    a, b = opts.a, opts.b

    p1_clean = p1 + ".clean"
    p2_clean = p2 + ".clean"
    # Freshness is judged on the .gz names, while the command below is
    # given the un-suffixed .clean names
    gzipped = [p1_clean + ".gz", p2_clean + ".gz"]
    if need_update([p1, p2], gzipped):
        pieces = [
            "SOAPfilter_v2.0 -t {0} -m 2000000 -p -y -z -g".format(cpus),
            " -q {0} -w 10 -B 50 -f 0".format(offset),
            # the same 5'/3' trim lengths go to -a/-b and -c/-d
            " -l {0} -a {1} -b {2} -c {1} -d {2}".format(size, a, b),
            " {0} {1} {2}.clean.stat {3} {4}".format(
                p1, p2, pf, p1_clean, p2_clean),
        ]
        sh("".join(pieces))
Example #40
0
def mergebam(args):
    """
    %prog mergebam dir1 dir2 homo_outdir
    or
    %prog mergebam dir1 dir2/20.bam het_outdir

    Merge sets of BAMs to make diploid. Two modes:
    - Homozygous mode: pair-up the bams in the two folders and merge
    - Heterozygous mode: pair the bams in first folder with a particular bam
    """
    p = OptionParser(mergebam.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    idir1, idir2, outdir = args
    # Each input is either a single .bam path or a folder to glob for bams
    dir1 = [idir1] if idir1.endswith(".bam") else iglob(idir1, "*.bam")
    dir2 = [idir2] if idir2.endswith(".bam") else iglob(idir2, "*.bam")
    # Make sure more or the same number of bams in first pile
    if len(dir1) < len(dir2):
        dir1, dir2 = dir2, dir1
    # Counts are taken AFTER the swap. Previously they were taken before it,
    # so a swapped input skipped both branches below and always failed the
    # final length assertion.
    nbams1 = len(dir1)
    nbams2 = len(dir2)
    if nbams1 == nbams2:
        logging.debug("Homozygous mode")
    else:
        assert nbams2 == 1, "Second pile must contain a single bam"
        # Replicate the one bam actually found in the smaller pile. Using
        # the raw `idir2` argument was wrong when it named a folder, and
        # after a swap it named the wrong pile.
        dir2 = [dir2[0]] * nbams1

    assert len(dir1) == len(dir2), "Two piles must contain same number of bams"
    cmd = "samtools merge {} {} {} && samtools index {}"
    cmds = []
    mkdir(outdir)
    for a, b in zip(dir1, dir2):
        # Output is named after the parts of both inputs before the first dot
        ia = op.basename(a).split(".")[0]
        ib = op.basename(b).split(".")[0]
        outfile = op.join(outdir, "{}_{}.bam".format(ia, ib))
        cmds.append(cmd.format(outfile, a, b, outfile))

    jobs = Parallel(cmds, cpus=opts.cpus)
    jobs.run()
Example #41
0
def fill(args):
    """
    %prog fill frag_reads_corr.fastb

    Run FillFragments on `frag_reads_corr.fastb`.
    """
    p = OptionParser(fill.__doc__)
    p.add_option("--stretch", default=3, type="int",
                 help="MAX_STRETCH to pass to FillFragments")
    p.set_cpus()

    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    fastb = args[0]
    # Only this exact file name is accepted
    assert fastb == "frag_reads_corr.fastb"

    nthreads = " NUM_THREADS={0}".format(opts.cpus)
    maxstretch = " MAX_STRETCH={0}".format(opts.stretch)

    pcfile = "frag_reads_corr.k28.pc.info"
    filledfastb = "filled_reads.fastb"
    filledfasta = "filled_reads.fasta"

    # (input, output, command) per stage; a stage runs only when its output
    # is out of date relative to its input
    stages = (
        (fastb, pcfile,
         "PathReads READS_IN=frag_reads_corr" + nthreads),
        (pcfile, filledfastb,
         "FillFragments PAIRS_OUT=frag_reads_corr_cpd"
         " PRECORRECT_LIBSTATS=True" + maxstretch + nthreads),
        (filledfastb, filledfasta,
         "Fastb2Fasta IN=filled_reads.fastb OUT=filled_reads.fasta"),
    )
    for source, target, command in stages:
        if need_update(source, target):
            sh(command)
Example #42
0
def cp(args):
    """
    %prog cp "s3://hli-mv-data-science/htang/str/*.csv" .

    Copy files to folder. Accepts list of s3 addresses as input.
    """
    p = OptionParser(cp.__doc__)
    p.add_option("--force", default=False, action="store_true",
                 help="Force overwrite if exists")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    store, folder = args
    force, cpus = opts.force, opts.cpus

    # A local file is read as a comma-separated listing (one entry per
    # line); anything else is expanded as an S3 glob.
    if op.exists(store):
        with open(store) as listing:
            contents = [row.strip().split(",") for row in listing]
    else:
        contents = glob_s3(store)

    tasks = []
    for entry in contents:
        if isinstance(entry, basestring):
            # Plain address: copy into `folder` under its base name
            source = entry
            target = op.join(folder, op.basename(entry))
        elif len(entry) == 2:
            # Listing row with an explicit target
            source, target = entry
        else:
            # Single-column listing row: target is the bare base name
            source, = entry
            target = op.basename(source)
        tasks.append((source, target, force))

    worker_pool = Pool(cpus)
    worker_pool.map(worker, tasks)
    worker_pool.close()
    worker_pool.join()
Example #43
0
def nucmer(args):
    """
    %prog nucmer ref.fasta query.fasta

    Run NUCMER using query against reference. Parallel implementation derived
    from: <https://github.com/fritzsedlazeck/sge_mummer>
    """
    from itertools import product

    from jcvi.apps.grid import MakeManager
    from jcvi.formats.base import split

    p = OptionParser(nucmer.__doc__)
    p.add_option("--chunks", type="int",
                 help="Split both query and subject into chunks")
    p.set_params(prog="nucmer", params="-g 5000 -l 24 -c 500")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    ref, query = args
    cpus = opts.cpus
    # Both inputs are cut into the same number of pieces: --chunks if
    # given, else the integer square root of the CPU count
    nchunks = opts.chunks or int(cpus**.5)
    refdir = ref.split(".")[0] + "-outdir"
    querydir = query.split(".")[0] + "-outdir"
    reflist = split([ref, refdir, str(nchunks)]).names
    querylist = split([query, querydir, str(nchunks)]).names

    mm = MakeManager()
    # One nucmer job for every (reference piece, query piece) combination
    for idx, (rpiece, qpiece) in enumerate(product(reflist, querylist)):
        tag = "{0:04d}".format(idx)
        cmd = "".join((
            "nucmer -maxmatch",
            " {0}".format(opts.extra),
            " {0} {1} -p {2}".format(rpiece, qpiece, tag),
        ))
        mm.add((rpiece, qpiece), tag + ".delta", cmd)
        print(cmd)

    mm.write()
Example #44
0
def blast(args):
    """
    %prog blast ref.fasta query.fasta

    Calls blast and then filter the BLAST hits. Default is megablast.
    """
    task_choices = (
        "blastn", "blastn-short", "dc-megablast", "megablast", "vecscreen",
    )
    p = OptionParser(blast.__doc__)
    p.set_align(pctid=0, evalue=.01)
    p.add_option("--wordsize", type="int",
                 help="Word size [default: %default]")
    p.add_option("--best", default=1, type="int",
                 help="Only look for best N hits [default: %default]")
    p.add_option("--task", default="megablast", choices=task_choices,
                 help="Task of the blastn [default: %default]")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta)

    # The query is searched against the reference; every tunable comes
    # straight from the parsed options.
    settings = dict(
        infile=queryfasta,
        outfile=blastfile,
        db=reffasta,
        wordsize=opts.wordsize,
        pctid=opts.pctid,
        evalue=opts.evalue,
        hitlen=None,
        best=opts.best,
        task=opts.task,
        cpus=opts.cpus,
    )
    run_megablast(**settings)

    return blastfile
Example #45
0
def compile(args):
    """
    %prog compile samples.csv

    Compile vcf results into master spreadsheet.
    """
    p = OptionParser(compile.__doc__)
    p.add_option("--db", default="hg38", help="Use these lobSTR db")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) != 1:
        sys.exit(not p.print_help())

    workdir = opts.workdir
    dbs = opts.db.split(",")
    # The sample list is resolved against the launch directory, because
    # the process moves into the work directory before reading it.
    launchdir = os.getcwd()
    mkdir(workdir)
    os.chdir(workdir)
    samples = op.join(launchdir, args[0])

    stridsfile = "STR.ids"
    vcffiles = [line.strip() for line in must_open(samples)]
    # The combined, de-duplicated id list is written once and then reused
    if not op.exists(stridsfile):
        ids = []
        for db in dbs:
            ids.extend(STRFile(opts.lobstr_home, db=db).ids)
        uids = uniqify(ids)
        logging.debug("Combined: {} Unique: {}".format(len(ids), len(uids)))

        fw = open(stridsfile, "w")
        fw.write("\n".join(uids) + "\n")
        fw.close()

    # Fan out one job per VCF and block until all of them finish; the
    # results themselves are not used.
    p = Pool(processes=opts.cpus)
    jobs = [(vcf, opts.store, opts.cleanup) for vcf in vcffiles]
    p.map_async(run, jobs).get()
Example #46
0
def close(args):
    """
    %prog close scaffolds.fasta PE*.fastq

    Run GapFiller to fill gaps.
    """
    p = OptionParser(close.__doc__)
    p.set_home("gapfiller")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First positional is the scaffold FASTA, the rest are read files
    scaffolds, reads = args[0], args[1:]
    libtxt = write_libraries(reads, aligner="bwa")

    # The command is not run here; it is saved to run.sh
    script = op.join(opts.gapfiller_home, "GapFiller.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, scaffolds, opts.cpus)
    runsh = "run.sh"
    write_file(runsh, cmd)
Example #47
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    p = OptionParser(scaffold.__doc__)
    p.set_home("sspace")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    # First positional is the contig FASTA, the rest are read files
    contigs, reads = args[0], args[1:]
    libtxt = write_libraries(reads)

    # The command is not run here; it is saved to run.sh
    script = op.join(opts.sspace_home, "SSPACE_Basic_v2.0.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, contigs, opts.cpus)
    runsh = "run.sh"
    write_file(runsh, cmd)
Example #48
0
def blat(args):
    """
    %prog blat old.fasta new.fasta

    Generate psl file using blat.
    """
    p = OptionParser(blat.__doc__)
    p.add_option("--minscore", default=100, type="int",
                 help="Matches minus mismatches gap penalty")
    p.add_option("--minid", default=98, type="int",
                 help="Minimum sequence identity")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    oldfasta, newfasta = args
    # Both inputs are converted to 2bit, old first; only the old one is
    # referenced by the command below.
    oldtwobit, newtwobit = [faToTwoBit(fastafile) for fastafile in args]

    # Use the threaded pblat when it is on the PATH, else plain blat
    if which("pblat"):
        program = "pblat -threads={0}".format(opts.cpus)
    else:
        program = "blat"

    # Output is <new>.<old>.psl, from the names before the first dot
    newbase = op.basename(newfasta).split(".")[0]
    oldbase = op.basename(oldfasta).split(".")[0]
    pslfile = "{0}.{1}.psl".format(newbase, oldbase)

    cmd = "".join((
        program,
        " {0} {1}".format(oldtwobit, newfasta),
        " -tileSize=12 -minScore={0} -minIdentity={1} ".format(
            opts.minscore, opts.minid),
        pslfile,
    ))
    sh(cmd)
Example #49
0
def prepare(args):
    """
    %prog prepare alignAssembly.config est.fasta ref.fasta

    Generate PASA run script.
    """
    p = OptionParser(prepare.__doc__)
    p.set_home("pasa")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) != 3:
        sys.exit(not p.print_help())

    cfg, est, ref = args
    launcher = op.join(opts.pasa_home, "scripts/Launch_PASA_pipeline.pl")
    # The command is not run here; it is saved as a run script
    cmd = "".join((
        launcher,
        " -c {0} --CPU {1}".format(cfg, opts.cpus),
        " -C -R --ALIGNERS blat,gmap",
        " -t {0} -g {1}".format(est, ref),
    ))
    runfile = "run.sh"
    write_file(runfile, cmd, meta="run script")
Example #50
0
def correct(args):
    """
    %prog correct *.fastq

    Correct reads using ErrorCorrection. Only PE will be used to build the K-mer
    table.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(correct.__doc__)
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    lstfile = "reads2cor.lst"
    # First pass: list only the files whose name starts with "PE"; this list
    # feeds the k-mer frequency step. `with` guarantees the handle is closed
    # even if the write fails.
    with open(lstfile, "w") as fw:
        print("\n".join(x for x in args if x.startswith("PE")), file=fw)

    # The quality offset is guessed from the first input file only.
    offset = guessoffset([args[0]])
    cpus = opts.cpus

    freq = "output.freq.cz"
    freqlen = freq + ".len"
    # Rebuild the frequency table only when need_update() says so.
    if need_update(args, (freq, freqlen)):
        cmd = "KmerFreq_AR_v2.0 -k 17 -c -1 -q {0}".format(offset)
        cmd += " -m 1 -t {0}".format(cpus)
        cmd += " -p output {0}".format(lstfile)
        sh(cmd)

    # Second pass: overwrite the list with ALL inputs, so the corrector runs
    # on every file, not just the PE ones.
    with open(lstfile, "w") as fw:
        print("\n".join(args), file=fw)

    cmd = "Corrector_AR_v2.0 -k 17 -l 3 -m 5 -c 5 -a 0 -e 1 -w 0 -r 45"
    cmd += " -Q {0} -q 30 -x 8 -t {1} -o 1 ".format(offset, cpus)
    cmd += " {0} {1} {2}".format(freq, freqlen, lstfile)
    sh(cmd)
Example #51
0
def blat(args):
    """
    %prog blat ref.fasta query.fasta

    Calls blat and filters BLAST hits.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(blat.__doc__)
    parser.set_align(pctid=95, hitlen=30)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 2:
        sys.exit(not parser.print_help())

    reffasta, queryfasta = args
    blastfile = get_outfile(reffasta, queryfasta, suffix="blat")

    # The query is the input and the reference is the database; run_blat is
    # called with overwrite=False.
    blat_kwargs = dict(
        infile=queryfasta,
        outfile=blastfile,
        db=reffasta,
        pctid=opts.pctid,
        hitlen=opts.hitlen,
        cpus=opts.cpus,
        overwrite=False,
    )
    run_blat(**blat_kwargs)

    # Return the output path to the caller.
    return blastfile
Example #52
0
def kmc(args):
    """
    %prog kmc folder

    Run kmc3 on Illumina reads.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(kmc.__doc__)
    parser.add_option("-k", default=21, type="int", help="Kmer size")
    parser.add_option("--ci", default=2, type="int",
                      help="Minimum value of a counter")
    parser.add_option("--cs", default=2, type="int",
                      help="Maximal value of a counter")
    parser.add_option("--single", default=False, action="store_true",
                      help="Input is single-end data, only one FASTQ")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    folder, = args
    K = opts.k
    # One file per sample for single-end data, two otherwise.
    n = 1 if opts.single else 2
    mm = MakeManager()
    # `reads` (formerly `p`) no longer shadows the option parser.
    for reads, pf in iter_project(folder, n=n, commonprefix=False):
        pf = pf.split("_")[0] + ".ms{}".format(K)
        infiles = pf + ".infiles"
        # kmc reads its inputs from an @listfile, one path per line.
        # fw.write() emits the same bytes as the old `print >> fw` statement
        # but also works under Python 3; `with` closes the file on error.
        with open(infiles, "w") as fw:
            fw.write("\n".join(reads) + "\n")

        cmd = "kmc -k{} -m64 -t{}".format(K, opts.cpus)
        cmd += " -ci{} -cs{}".format(opts.ci, opts.cs)
        cmd += " @{} {} .".format(infiles, pf)
        outfile = pf + ".kmc_suf"
        mm.add(reads, outfile, cmd)

    mm.write()
Example #53
0
def genemark(args):
    """
    %prog genemark species fastafile

    Train GENEMARK model given fastafile. GENEMARK self-trains so no training
    model gff file is needed.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    p = OptionParser(genemark.__doc__)
    p.add_option("--junctions", help="Path to `junctions.bed` from Tophat2")
    p.set_home("gmes")
    p.set_cpus(cpus=32)
    opts, args = p.parse_args(args)

    if len(args) != 2:
        sys.exit(not p.print_help())

    # `species` is accepted for the command-line contract but is not used in
    # the commands built below.
    species, fastafile = args
    junctions = opts.junctions
    mhome = opts.gmes_home

    # Renamed from `license` to avoid shadowing the interactive builtin.
    # The assert (and its AssertionError) is kept so callers see the same
    # failure type as before.
    license_key = op.expanduser("~/.gm_key")
    assert op.exists(license_key), \
        "License key ({0}) not found!".format(license_key)

    cmd = "{0}/gmes_petap.pl --sequence {1}".format(mhome, fastafile)
    cmd += " --cores {0}".format(opts.cpus)
    if junctions:
        # With junctions: convert the BED file to an intron GFF (only when
        # need_update() says so) and run with --ET.
        intronsgff = "introns.gff"
        if need_update(junctions, intronsgff):
            jcmd = "{0}/bet_to_gff.pl".format(mhome)
            jcmd += " --bed {0} --gff {1} --label Tophat2".format(
                junctions, intronsgff)
            sh(jcmd)
        cmd += " --ET {0} --et_score 10".format(intronsgff)
    else:
        # Without junctions: run with --ES.
        cmd += " --ES"
    sh(cmd)

    # Closing backtick added; the original message left it unterminated.
    logging.debug("GENEMARK matrix written to `output/gmhmm.mod`")
Example #54
0
def deduplicate(args):
    """
    %prog deduplicate fastafile

    Wraps `cd-hit-est` to remove duplicate sequences.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(deduplicate.__doc__)
    parser.set_align(pctid=98)
    parser.add_option(
        "--reads", default=False, action="store_true",
        help="Use `cd-hit-454` to deduplicate [default: %default]")
    parser.add_option(
        "--samestrand", default=False, action="store_true",
        help="Enforce same strand alignment")
    parser.set_home("cdhit")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    fastafile, = args
    # cd-hit takes identity as a fraction; the option is a percentage.
    identity = opts.pctid / 100.
    dd = fastafile + ".cdhit"

    program = "cd-hit-454" if opts.reads else "cd-hit-est"
    parts = [
        op.join(opts.cdhit_home, program),
        "-c {0}".format(identity),
        "-d 0",  # include complete defline
    ]
    if opts.samestrand:
        parts.append("-r 0")
    parts.append("-M 0 -T {0} -i {1} -o {2}".format(opts.cpus, fastafile, dd))
    sh(" ".join(parts))

    # Return the path of the deduplicated output.
    return dd
Example #55
0
def meryl(args):
    """
    %prog meryl folder

    Run meryl on Illumina reads.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(meryl.__doc__)
    parser.add_option("-k", default=19, type="int", help="Kmer size")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    (folder, ) = args
    K = opts.k
    cpus = opts.cpus
    mm = MakeManager()
    for reads, pf in iter_project(folder):
        cmds = []
        mss = []
        # One meryl database per read file, numbered from 1.
        for idx, readfile in enumerate(reads, start=1):
            ms = "{}{}.ms{}".format(pf, idx, K)
            mss.append(ms)
            cmds.append("meryl -B -C -m {} -threads {} -s {} -o {}".format(
                K, cpus, readfile, ms))

        # Exactly two databases are expected per sample; unpacking raises
        # otherwise.
        ams, bms = mss
        pms = "{}.ms{}".format(pf, K)
        # Merge the two databases, then remove the per-file intermediates.
        cmds.append("meryl -M add -s {} -s {} -o {}".format(ams, bms, pms))
        cmds.append("rm -f {0}.mcdat {0}.mcidx {1}.mcdat {1}.mcidx".format(
            ams, bms))
        mm.add(reads, pms + ".mcdat", cmds)

    mm.write()
Example #56
0
def compare(args):
    """
    %prog compare NA12878_array_hg38.bed *.seg

    Compare cnv output to known ground truths.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(compare.__doc__)
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) < 2:
        sys.exit(not parser.print_help())

    truths = args[0]
    cnvoutputs = args[1:]
    # Never start more workers than there are files to compare.
    cpus = min(len(cnvoutputs), opts.cpus)
    files = [(x, truths) for x in cnvoutputs]

    results = []
    pool = Pool(processes=cpus)
    try:
        # The callback fires once with the full list of worker results, so
        # `results` ends up holding a single list.
        r = pool.map_async(compare_worker, files, callback=results.append)
        r.wait()
    finally:
        # Previously the pool was never shut down, leaking worker processes.
        # By the time wait() returns the callback has already run, so
        # terminating here loses nothing.
        pool.terminate()
        pool.join()

    for res in results:
        print("\n".join(res))
Example #57
0
def scaffold(args):
    """
    %prog scaffold contigs.fasta MP*.fastq

    Run SSPACE scaffolding.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(scaffold.__doc__)
    parser.set_aligner(aligner="bwa")
    parser.set_home("sspace")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) < 1:
        sys.exit(not parser.print_help())

    # First argument is the contigs file; the rest go to write_libraries().
    contigs, libraries = args[0], args[1:]
    libtxt = write_libraries(libraries, aligner=opts.aligner)
    # Requires getopts.pl which may be missing
    download("http://web.vims.edu/bridge/bridge2/aw/lib/getopts.pl")

    script = op.join(opts.sspace_home, "SSPACE_Standard_v3.0.pl")
    cmd = "perl {0} -l {1} -s {2} -T {3}".format(
        script, libtxt, contigs, opts.cpus)
    # The command is only written to run.sh here; it is not executed.
    write_file("run.sh", cmd)
Example #58
0
def batchlobstr(args):
    """
    %prog batchlobstr bamlist

    Run lobSTR on a list of BAMs. The corresponding batch command for TREDPARSE:
    $ tred.py --toy bamlist --haploid CHR4 --workdir tredparse_results
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(batchlobstr.__doc__)
    parser.add_option("--haploid",
                      default="chrY,chrM",
                      help="Use haploid model for these chromosomes")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    bamlist, = args
    # Build a per-BAM command template. Only the --haploid placeholder is
    # filled here; the --input_bam_path `{}` stays open and is filled once
    # per BAM below.
    template = "python -m jcvi.variation.str lobstr TOY"
    template += " --input_bam_path {}"
    template += " --haploid {}".format(opts.haploid)

    # Read the list with a context manager so the handle is closed, and skip
    # blank lines, which would otherwise yield a command with an empty
    # --input_bam_path.
    with open(bamlist) as fp:
        bams = [row.strip() for row in fp]
    cmds = [template.format(bam) for bam in bams if bam]

    runner = Parallel(cmds, cpus=opts.cpus)
    runner.run()
Example #59
0
def density(args):
    """
    %prog density test.clm

    Estimate link density of contigs.
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    parser = OptionParser(density.__doc__)
    parser.add_option("--save",
                      default=False,
                      action="store_true",
                      help="Write log densitites of contigs to file")
    parser.set_cpus()
    opts, args = parser.parse_args(args)

    if len(args) != 1:
        sys.exit(not parser.print_help())

    clmfile, = args
    clm = CLMFile(clmfile)
    # Prefix shared by the .density and .tour files: the input name without
    # its last extension.
    pf = clmfile.rsplit(".", 1)[0]

    if opts.save:
        logdensities = clm.calculate_densities()
        densityfile = pf + ".density"
        # One tab-separated row per contig: name, size, log density.
        # fw.write() emits the same bytes as the old `print >> fw` statement
        # but also works under Python 3; `with` closes the file on error.
        with open(densityfile, "w") as fw:
            for name, logd in logdensities.items():
                s = clm.tig_to_size[name]
                fw.write("\t".join(str(x) for x in (name, s, logd)) + "\n")
        logging.debug("Density written to `{}`".format(densityfile))

    # Reuse the prefix computed above instead of splitting the name again.
    tourfile = pf + ".tour"
    tour = clm.activate(tourfile=tourfile, backuptour=False)
    clm.flip_all(tour)
    clm.flip_whole(tour)
    clm.flip_one(tour)
Example #60
0
def jellyfish(args):
    """
    %prog jellyfish [*.fastq|*.fasta]

    Run jellyfish to dump histogram to be used in kmer.histogram().
    """
    # The docstring above doubles as the usage text handed to OptionParser.
    from jcvi.apps.base import getfilesize
    from jcvi.utils.cbook import human_size

    p = OptionParser(jellyfish.__doc__)
    p.add_option("-K", default=23, type="int", help="K-mer size")
    p.add_option(
        "--coverage",
        default=40,
        type="int",
        help="Expected sequence coverage",
    )
    p.add_option("--prefix", default="jf", help="Database prefix")
    p.add_option(
        "--nohist",
        default=False,
        action="store_true",
        help="Do not print histogram",
    )
    p.set_home("jellyfish")
    p.set_cpus()
    opts, args = p.parse_args(args)

    if len(args) < 1:
        sys.exit(not p.print_help())

    fastqfiles = args
    K = opts.K
    coverage = opts.coverage

    totalfilesize = sum(getfilesize(x) for x in fastqfiles)
    fq = fastqfiles[0]
    pf = opts.prefix
    # Compression is detected from the FIRST input file only.
    gzip = fq.endswith(".gz")

    # Value passed to `jellyfish count -s`: total input size divided by the
    # expected coverage.
    # NOTE(review): under Python 3 `/` yields a float here — confirm that
    # jellyfish accepts a non-integer -s value.
    hashsize = totalfilesize / coverage
    logging.debug("Total file size: {0}, hashsize (-s): {1}".format(
        human_size(totalfilesize, a_kilobyte_is_1024_bytes=True), hashsize))

    # The database name is <prefix>-K<K>; jfdb is the same string as jfpf.
    jfpf = "{0}-K{1}".format(pf, K)
    jfdb = jfpf
    # From here on fastqfiles is a single space-joined string, not a list.
    fastqfiles = " ".join(fastqfiles)

    jfcmd = op.join(opts.jellyfish_home, "jellyfish")
    cmd = jfcmd
    cmd += " count -t {0} -C -o {1}".format(opts.cpus, jfpf)
    cmd += " -s {0} -m {1}".format(hashsize, K)
    if gzip:
        # Decompress all inputs into a pipe and have jellyfish read stdin.
        cmd = "gzip -dc {0} | ".format(fastqfiles) + cmd + " /dev/fd/0"
    else:
        cmd += " " + fastqfiles

    # NOTE(review): need_update() receives the space-joined string here, not
    # the individual paths — confirm it handles that for multiple inputs.
    if need_update(fastqfiles, jfdb):
        sh(cmd)

    if opts.nohist:
        return

    jfhisto = jfpf + ".histogram"
    # The histo step uses a fixed `-t 64`, not opts.cpus.
    cmd = jfcmd + " histo -t 64 {0} -o {1}".format(jfdb, jfhisto)

    if need_update(jfdb, jfhisto):
        sh(cmd)