Exemple #1
0
def write_filtered(vcffile, lhome, store=None):
    """
    Run the lobSTR VCF filter script on `vcffile` and return the output name.

    An `s3://` input is first fetched with pull_from_s3(). The output file is
    named `<basename>.filtered.vcf` and is handed to sh() as `outfile`. When
    `store` is truthy the filtered file is also pushed with push_to_s3().
    """
    is_remote = vcffile.startswith("s3://")
    local_vcf = pull_from_s3(vcffile) if is_remote else vcffile
    filteredvcf = op.basename(local_vcf).replace(".vcf", ".filtered.vcf")

    # Only the locus coverage / score filters are passed; the other filter
    # options (--loc-call-rate, --loc-max-ref-length, --call-*) are not used.
    pieces = [
        "python {}/scripts/lobSTR_filter_vcf.py".format(lhome),
        " --vcf {}".format(local_vcf),
        " --loc-cov 5 --loc-log-score 0.8",
    ]
    sh("".join(pieces), outfile=filteredvcf)

    if store:
        push_to_s3(store, filteredvcf)
    return filteredvcf
Exemple #2
0
def write_filtered(vcffile, lhome, store=None):
    """
    Filter a lobSTR VCF with `lobSTR_filter_vcf.py` found under `lhome`.

    Returns the name of the filtered VCF (`<basename>.filtered.vcf`). The
    input is pulled from S3 first when it starts with `s3://`, and the result
    is pushed to `store` when one is supplied.
    """
    source_vcf = vcffile
    if source_vcf.startswith("s3://"):
        source_vcf = pull_from_s3(source_vcf)

    script = "python {}/scripts/lobSTR_filter_vcf.py".format(lhome)
    vcf_arg = " --vcf {}".format(source_vcf)
    # Locus-level thresholds only; no call-level filters are passed.
    thresholds = " --loc-cov 5 --loc-log-score 0.8"

    filteredvcf = op.basename(source_vcf).replace(".vcf", ".filtered.vcf")
    sh(script + vcf_arg + thresholds, outfile=filteredvcf)

    if not store:
        return filteredvcf

    push_to_s3(store, filteredvcf)
    return filteredvcf
Exemple #3
0
def write_csv_ev(filename, store, cleanup):
    """
    Parse a lobSTR VCF and write its `.csv` and `.ev` one-line summaries.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); the two output files
            are named after its basename.
        store: destination passed to push_to_s3() for both outputs.
        cleanup: forwarded unchanged to LobSTRvcf.parse().
    """
    lv = LobSTRvcf()
    lv.parse(filename, cleanup=cleanup)

    # `with` guarantees the handle is closed even if the write raises.
    # fw.write() emits the same single line as the old `print >> fw`
    # statement, but also works on Python 3.
    csvfile = op.basename(filename) + ".csv"
    with open(csvfile, "w") as fw:
        fw.write("{}\n".format(lv.csvline))

    evfile = op.basename(filename) + ".ev"
    with open(evfile, "w") as fw:
        fw.write("{}\n".format(lv.evline))

    # Save to s3
    push_to_s3(store, csvfile)
    push_to_s3(store, evfile)
Exemple #4
0
def write_csv_ev(filename, store, cleanup):
    """
    Write the `.csv` and `.ev` summaries of a lobSTR VCF and push them to S3.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); outputs are
            `<basename>.csv` and `<basename>.ev`.
        store: destination passed to push_to_s3().
        cleanup: forwarded unchanged to LobSTRvcf.parse().
    """
    lv = LobSTRvcf()
    lv.parse(filename, cleanup=cleanup)

    csvfile = op.basename(filename) + ".csv"
    # Context manager closes the file on error; write() replaces the
    # Python 2-only `print >> fw` form and produces the same line.
    with open(csvfile, "w") as fw:
        fw.write("{}\n".format(lv.csvline))

    evfile = op.basename(filename) + ".ev"
    with open(evfile, "w") as fw:
        fw.write("{}\n".format(lv.evline))

    # Save to s3
    push_to_s3(store, csvfile)
    push_to_s3(store, evfile)
Exemple #5
0
def write_csv_ev(filename, filtered, cleanup, store=None):
    """
    Parse a lobSTR VCF and write its `.csv` and `.ev` one-line summaries.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); outputs are named
            after its basename.
        filtered: forwarded unchanged to LobSTRvcf.parse().
        cleanup: forwarded unchanged to LobSTRvcf.parse().
        store: optional destination; when truthy both files are pushed with
            push_to_s3().
    """
    lv = LobSTRvcf()
    lv.parse(filename, filtered=filtered, cleanup=cleanup)
    csvfile = op.basename(filename) + ".csv"
    evfile = op.basename(filename) + ".ev"

    # Context managers close the handles even if printing raises.
    with open(csvfile, "w") as fw:
        print(lv.csvline, file=fw)

    with open(evfile, "w") as fw:
        print(lv.evline, file=fw)

    # Save to s3
    if store:
        push_to_s3(store, csvfile)
        push_to_s3(store, evfile)
Exemple #6
0
def write_csv_ev(filename, filtered, cleanup, store=None):
    """
    Write the `.csv` and `.ev` summaries of a lobSTR VCF, optionally to S3.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); outputs are
            `<basename>.csv` and `<basename>.ev`.
        filtered: forwarded unchanged to LobSTRvcf.parse().
        cleanup: forwarded unchanged to LobSTRvcf.parse().
        store: optional push_to_s3() destination; skipped when falsy.
    """
    lv = LobSTRvcf()
    lv.parse(filename, filtered=filtered, cleanup=cleanup)
    csvfile = op.basename(filename) + ".csv"
    evfile = op.basename(filename) + ".ev"

    # Use `with` so a failing print cannot leave a file handle open.
    with open(csvfile, "w") as fw:
        print(lv.csvline, file=fw)

    with open(evfile, "w") as fw:
        print(lv.evline, file=fw)

    # Save to s3
    if store:
        push_to_s3(store, csvfile)
        push_to_s3(store, evfile)
Exemple #7
0
def write_csv_ev(filename, filtered, cleanup, store=None, stutter=False):
    """
    Parse a lobSTR VCF and write its `.csv` and `.ev` one-line summaries.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); outputs are named
            after its basename.
        filtered: forwarded unchanged to LobSTRvcf.parse().
        cleanup: forwarded unchanged to LobSTRvcf.parse().
        store: optional push_to_s3() destination for both files.
        stutter: forwarded to LobSTRvcf.parse(); when true only the `.ev`
            file is written and nothing is pushed to S3.
    """
    lv = LobSTRvcf()
    lv.parse(filename, filtered=filtered, cleanup=cleanup, stutter=stutter)
    csvfile = op.basename(filename) + ".csv"
    evfile = op.basename(filename) + ".ev"

    # `with` closes each handle even if the write raises; fw.write() emits
    # the same single line as the Python 2-only `print >> fw` statement.
    if not stutter:
        with open(csvfile, "w") as fw:
            fw.write("{}\n".format(lv.csvline))

    with open(evfile, "w") as fw:
        fw.write("{}\n".format(lv.evline))

    # Stutter mode stops after the .ev file: no csv, no upload.
    if stutter:
        return

    # Save to s3
    if store:
        push_to_s3(store, csvfile)
        push_to_s3(store, evfile)
Exemple #8
0
def write_csv_ev(filename, filtered, cleanup, store=None, stutter=False):
    """
    Write the `.csv` / `.ev` summaries of a lobSTR VCF, optionally to S3.

    Args:
        filename: VCF path given to LobSTRvcf.parse(); outputs are
            `<basename>.csv` and `<basename>.ev`.
        filtered: forwarded unchanged to LobSTRvcf.parse().
        cleanup: forwarded unchanged to LobSTRvcf.parse().
        store: optional push_to_s3() destination; skipped when falsy.
        stutter: forwarded to LobSTRvcf.parse(); when true the function
            writes only the `.ev` file and returns without uploading.
    """
    lv = LobSTRvcf()
    lv.parse(filename, filtered=filtered, cleanup=cleanup, stutter=stutter)
    csvfile = op.basename(filename) + ".csv"
    evfile = op.basename(filename) + ".ev"

    # Files are written through context managers so they are closed on
    # error, and with write() rather than the Python 2-only print statement.
    if not stutter:
        with open(csvfile, "w") as fw:
            fw.write("{}\n".format(lv.csvline))

    with open(evfile, "w") as fw:
        fw.write("{}\n".format(lv.evline))

    if stutter:
        return

    # Save to s3
    if store:
        push_to_s3(store, csvfile)
        push_to_s3(store, evfile)
Exemple #9
0
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid",
                 default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation",
                 default=False,
                 action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        # One fixed chromosome/index run; the VCF is moved into
        # lobstr_results/ and the allelotype stats file is removed.
        cmd, vcf_file = allelotype_on_chr(bamfile,
                                          "chr4",
                                          "/mnt/software/lobSTR/",
                                          "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: execution/sample ids when given, else the BAM base name.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Nothing to do when the result for the last index is already stored.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug(
                "Object `{0}` exists. Computation skipped.".format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai`, then `x.bai`. rsplit(".", 1) strips only the
            # final extension; without maxsplit the name was cut at the first
            # dot anywhere in the S3 path.
            candidates = (bamfile + ".bai",
                          bamfile.rsplit(".", 1)[0] + ".bai")
            for remotebaifile in candidates:
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                    break
            else:
                logging.debug("BAM index cannot be found in S3!")
                sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) is required on Python 3, where range objects cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for seqid in chrs:
            # Per-chromosome allelotyping followed by a filtering step.
            cmd, vcffile = allelotype_on_chr(bamfile,
                                             seqid,
                                             lhome,
                                             lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # NOTE(review): split(".")[0] cuts at the first dot of the whole
        # path - confirm this is intended for local BAM paths with dots.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # `mm` is the manager of the last index processed.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Exemple #10
0
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # The docstring is handed to OptionParser as the usage text.
    parser = OptionParser(cn.__doc__)
    parser.add_option(
        "--binsize", default=1000, type="int", help="Window size along chromosome"
    )
    parser.add_option(
        "--cleanup",
        default=False,
        action="store_true",
        help="Clean up downloaded s3 folder",
    )
    parser.add_option(
        "--hmm",
        default=False,
        action="store_true",
        help="Run HMM caller after computing CN",
    )
    parser.add_option(
        "--upload",
        default="s3://hli-mv-data-science/htang/ccn",
        help="Upload cn and seg results to s3",
    )
    parser.add_option(
        "--rebuildgc", help="Rebuild GC directory rather than pulling from S3"
    )
    opts, args = parser.parse_args(args)

    # Two or three positionals; the S3 folder is optional.
    nargs = len(args)
    if nargs not in (2, 3):
        sys.exit(not parser.print_help())
    workdir, sample_key = args[0], args[1]
    s3dir = args[2] if nargs == 3 else None

    binsize = opts.binsize
    fastafile = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), "Directory {} doesn't exist!".format(sampledir)

    # An existing output folder means this sample was already processed.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    gcdir = "gc"
    if fastafile:
        build_gc_array(fastafile=fastafile, n=binsize, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc", target_dir=gcdir)

    # Build GC correction table: cib values grouped by the GC value of
    # their bin, collected from autosomes only.
    cib_by_gc = defaultdict(list)
    half_median = {}
    per_seq = []

    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, binsize))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc_values = np.fromfile(gcfile, dtype=np.uint8)
        cib = load_cib(op.join(sampledir, "{}.{}.cib".format(sample_key, seqid)))
        print(seqid, gc_values.shape[0], cib.shape[0], file=sys.stderr)
        if seqid in autosomes:
            for gc_value, depth in zip(gc_values, cib):
                cib_by_gc[gc_value].append(depth)
        per_seq.append((seqid, gc_values, cib))

    # Each GC value maps to half the median of its non-zero cib values.
    for gc_value, depths in cib_by_gc.items():
        nonzero = [d for d in depths if d]
        med = np.median(nonzero) / 2
        half_median[gc_value] = med
        print(gc_value, len(nonzero), med, file=sys.stderr)

    mkdir(cndir)
    lookup = np.vectorize(half_median.get)
    # Apply the GC correction over coverage
    for seqid, gc_values, cib in per_seq:
        # The GC array is trimmed to the length of the cib array.
        divisor = lookup(gc_values[: cib.shape[0]])
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        (cib / divisor).tofile(cnfile)

    # Run HMM caller if asked
    segfile = None
    if opts.hmm:
        segfile = hmm([workdir, sample_key])

    destination = opts.upload
    if destination:
        push_to_s3(destination, cndir)
        if segfile:
            push_to_s3(destination, segfile)

    if opts.cleanup:
        import shutil

        for folder in (sampledir, cndir):
            shutil.rmtree(folder)
Exemple #11
0
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    # Output prefix: execution/sample ids when given, else the BAM base name.
    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        # Nothing to do when the result for the last index is already stored.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug(
                "Object `{0}` exists. Computation skipped.".format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai`, then `x.bai`. rsplit(".", 1) strips only the
            # final extension; without maxsplit the name was cut at the first
            # dot anywhere in the S3 path.
            candidates = (bamfile + ".bai",
                          bamfile.rsplit(".", 1)[0] + ".bai")
            for remotebaifile in candidates:
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                    break
            else:
                logging.debug("BAM index cannot be found in S3!")
                sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    # list(...) is required on Python 3, where range objects cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for seqid in chrs:
            # Per-chromosome allelotyping followed by a filtering step.
            cmd, vcffile = allelotype_on_chr(bamfile, seqid, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        # NOTE(review): split(".")[0] cuts at the first dot of the whole
        # path - confirm this is intended for local BAM paths with dots.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        # `mm` is the manager of the last index processed.
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Exemple #12
0
def cn(args):
    """
    %prog cn workdir 102340_NA12878 \
        s3://hli-bix-us-west-2/kubernetes/wf-root-test/102340_NA12878/lpierce-ccn_gcn-v2/

    Download CCN output folder and convert cib to copy number per 1Kb.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(cn.__doc__)
    p.add_option("--binsize", default=1000, type="int",
                 help="Window size along chromosome")
    p.add_option("--cleanup", default=False, action="store_true",
                 help="Clean up downloaded s3 folder")
    p.add_option("--hmm", default=False, action="store_true",
                 help="Run HMM caller after computing CN")
    p.add_option("--upload", default="s3://hli-mv-data-science/htang/ccn",
                 help="Upload cn and seg results to s3")
    p.add_option("--rebuildgc",
                 help="Rebuild GC directory rather than pulling from S3")
    opts, args = p.parse_args(args)

    # Two or three positionals; the S3 folder is optional.
    if len(args) == 2:
        workdir, sample_key = args
        s3dir = None
    elif len(args) == 3:
        workdir, sample_key, s3dir = args
    else:
        sys.exit(not p.print_help())

    n = opts.binsize
    rebuildgc = opts.rebuildgc
    mkdir(workdir)
    sampledir = op.join(workdir, sample_key)
    if s3dir:
        sync_from_s3(s3dir, target_dir=sampledir)

    assert op.exists(sampledir), \
        "Directory {} doesn't exist!".format(sampledir)

    # An existing output folder means this sample was already processed.
    cndir = op.join(workdir, sample_key + "-cn")
    if op.exists(cndir):
        logging.debug("Directory {} exists. Skipped.".format(cndir))
        return

    gcdir = "gc"
    if rebuildgc:
        build_gc_array(fastafile=rebuildgc, n=n, gcdir=gcdir)
    if not op.exists(gcdir):
        sync_from_s3("s3://hli-mv-data-science/htang/ccn/gc",
                     target_dir=gcdir)

    # Build GC correction table
    gc_bin = defaultdict(list)
    gc_med = {}
    coverage = []

    for seqid in allsomes:
        gcfile = op.join(gcdir, "{}.{}.gc".format(seqid, n))
        if not op.exists(gcfile):
            logging.error("File {} not found. Continue anyway.".format(gcfile))
            continue
        gc = np.fromfile(gcfile, dtype=np.uint8)
        cibfile = op.join(sampledir, "{}.{}.cib".format(sample_key, seqid))
        cib = load_cib(cibfile)
        # sys.stderr.write() replaces the Python 2-only `print >> sys.stderr`
        # statement (a TypeError on Python 3) and emits the same line.
        sys.stderr.write(
            "{} {} {}\n".format(seqid, gc.shape[0], cib.shape[0]))
        # Only autosomes contribute to the correction table.
        if seqid in autosomes:
            for gci, k in zip(gc, cib):
                gc_bin[gci].append(k)
        coverage.append((seqid, gc, cib))

    # Each GC value maps to half the median of its non-zero cib values.
    for gci, k in gc_bin.items():
        nonzero_k = [x for x in k if x]
        gc_med[gci] = med = np.median(nonzero_k) / 2
        sys.stderr.write("{} {} {}\n".format(gci, len(nonzero_k), med))

    mkdir(cndir)
    apply_fun = np.vectorize(gc_med.get)
    # Apply the GC correction over coverage
    for seqid, gc, cib in coverage:
        # The GC array is trimmed to the length of the cib array.
        nitems = cib.shape[0]
        beta = apply_fun(gc[:nitems])
        beta_cn = cib / beta
        cnfile = op.join(cndir, "{}.{}.cn".format(sample_key, seqid))
        beta_cn.tofile(cnfile)

    # Run HMM caller if asked
    segfile = hmm([workdir, sample_key]) if opts.hmm else None

    upload = opts.upload
    if upload:
        push_to_s3(upload, cndir)
        if segfile:
            push_to_s3(upload, segfile)

    if opts.cleanup:
        import shutil
        shutil.rmtree(sampledir)
        shutil.rmtree(cndir)
Exemple #13
0
def run_mito(
    chrMfa, bamfile, opts, realignonly=False, svonly=False, store=None, cleanup=False
):
    """
    Extract chrM reads from `bamfile` and run the realign / depth / SV /
    piledriver steps on them.

    Steps run in order; each is skipped when need_update() reports its output
    as current. `realignonly` stops after realignment, `svonly` stops after
    the SV call. Depth, SV and piledriver outputs are pushed to `store` when
    it is truthy. `cleanup` calls do_cleanup() on the intermediate BAMs at
    the `svonly` exit and at the end. `opts` supplies `speedseq_home`, `cpus`
    and `support`.
    """
    from jcvi.formats.sam import get_minibam

    region = "chrM"
    minibam = op.basename(bamfile).replace(".bam", ".{}.bam".format(region))
    if op.exists(minibam):
        logging.debug("{} found. Skipped.".format(minibam))
    else:
        get_minibam(bamfile, region)

    speedseq = op.join(opts.speedseq_home, "speedseq")
    realign = minibam.rsplit(".", 1)[0] + ".realign"
    realignbam = realign + ".bam"
    # Arguments shared by the `realign` and `sv` speedseq subcommands.
    shared = " -v -t {} -o {}".format(opts.cpus, realign)

    # Step 1: realignment against the chrM reference.
    if need_update(minibam, realignbam):
        sh(speedseq + " realign" + shared + " {} {}".format(chrMfa, minibam))
    else:
        logging.debug("{} found. Skipped.".format(realignbam))

    if realignonly:
        return

    # Step 2: depth of coverage.
    depthfile = realign + ".depth"
    if need_update(realignbam, depthfile):
        depth_args = [chrMfa, realignbam, "--nosort", "--format=coverage"]
        depth_args.append("--outfile={}".format(depthfile))
        coverage(depth_args)
    if store:
        push_to_s3(store, depthfile)

    # Step 3: structural variant calling.
    vcffile = realign + ".sv.vcf.gz"
    if need_update(realignbam, vcffile):
        discordants = realign + ".discordants.bam"
        splitters = realign + ".splitters.bam"
        sv_cmd = "".join(
            [
                speedseq + " sv",
                shared,
                " -R {}".format(chrMfa),
                " -m {}".format(opts.support),
                " -B {} -D {} -S {}".format(realignbam, discordants, splitters),
            ]
        )
        sh(sv_cmd)
    else:
        logging.debug("{} found. Skipped.".format(vcffile))
    if store:
        push_to_s3(store, vcffile)

    if svonly:
        if cleanup:
            do_cleanup(minibam, realignbam)
        return

    # Step 4: piledriver output captured through sh(outfile=...).
    piledriver = realign + ".piledriver"
    if need_update(realignbam, piledriver):
        pile_cmd = "bamtools piledriver -fasta {}".format(chrMfa) + " -in {}".format(
            realignbam
        )
        sh(pile_cmd, outfile=piledriver)
    if store:
        push_to_s3(store, piledriver)

    if cleanup:
        do_cleanup(minibam, realignbam)
Exemple #14
0
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # Output prefix: --prefix when given, else the BAM base name.
    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        # Nothing to do when the result for the last index is already stored.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug(
                "Object `{0}` exists. Computation skipped.".format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Try `x.bam.bai`, then `x.bai`. rsplit(".", 1) strips only the
            # final extension; without maxsplit the name was cut at the first
            # dot anywhere in the S3 path.
            candidates = (bamfile + ".bai",
                          bamfile.rsplit(".", 1)[0] + ".bai")
            for remotebaifile in candidates:
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                    break
            else:
                logging.debug("BAM index cannot be found in S3!")
                sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # list(...) is required on Python 3, where range objects cannot be
    # concatenated with a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for seqid in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, seqid, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        # NOTE(review): split(".")[0] cuts at the first dot of the whole
        # path - confirm this is intended for local BAM paths with dots.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        # Runs inside workdir (see os.chdir above) and removes every file there.
        sh("rm -f *")
Exemple #15
0
def run_mito(chrMfa, bamfile, opts, realignonly=False, svonly=False,
             store=None, cleanup=False):
    """
    Run the chrM pipeline on `bamfile`: mini-BAM extraction, speedseq
    realign, depth, speedseq sv and bamtools piledriver.

    Each step is skipped when need_update() says its output is current.
    `realignonly` returns after realignment; `svonly` returns after the SV
    call. Outputs are pushed to `store` when given, and `cleanup` triggers
    do_cleanup() on the mini-BAM and the realigned BAM.
    """
    from jcvi.formats.sam import get_minibam

    def _push(path):
        # Upload only when a store was supplied.
        if store:
            push_to_s3(store, path)

    def _tidy():
        # Remove intermediate BAMs only when asked to.
        if cleanup:
            do_cleanup(minibam, realignbam)

    region = "chrM"
    minibam = op.basename(bamfile).replace(".bam", ".{}.bam".format(region))
    if op.exists(minibam):
        logging.debug("{} found. Skipped.".format(minibam))
    else:
        get_minibam(bamfile, region)

    speedseq_bin = op.join(opts.speedseq_home, "speedseq")

    realign = minibam.rsplit(".", 1)[0] + ".realign"
    realignbam = realign + ".bam"
    # Options common to both speedseq invocations below.
    margs = " -v -t {} -o {}".format(opts.cpus, realign)

    if not need_update(minibam, realignbam):
        logging.debug("{} found. Skipped.".format(realignbam))
    else:
        realign_cmd = speedseq_bin + " realign" + margs
        realign_cmd += " {} {}".format(chrMfa, minibam)
        sh(realign_cmd)

    if realignonly:
        return

    depthfile = realign + ".depth"
    if need_update(realignbam, depthfile):
        coverage([chrMfa, realignbam, "--nosort", "--format=coverage",
                  "--outfile={}".format(depthfile)])
    _push(depthfile)

    vcffile = realign + ".sv.vcf.gz"
    if not need_update(realignbam, vcffile):
        logging.debug("{} found. Skipped.".format(vcffile))
    else:
        sv_cmd = speedseq_bin + " sv" + margs
        sv_cmd += " -R {}".format(chrMfa)
        sv_cmd += " -m {}".format(opts.support)
        sv_cmd += " -B {} -D {} -S {}".format(
            realignbam, realign + ".discordants.bam", realign + ".splitters.bam")
        sh(sv_cmd)
    _push(vcffile)

    if svonly:
        _tidy()
        return

    piledriver = realign + ".piledriver"
    if need_update(realignbam, piledriver):
        pile_cmd = "bamtools piledriver -fasta {}".format(chrMfa)
        pile_cmd += " -in {}".format(realignbam)
        sh(pile_cmd, outfile=piledriver)
    _push(piledriver)

    _tidy()
Exemple #16
0
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    # NOTE: the docstring above is also the usage text given to OptionParser.
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    # Output prefix: --prefix when given, else the BAM base name.
    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        # Nothing to do when the result for the last index is already stored.
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug(
                "Object `{0}` exists. Computation skipped.".format(remotegzfile))
            return

        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)

        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            # Look for the index under both naming schemes before building
            # it locally. rsplit(".", 1) removes just the last extension;
            # the previous rsplit(".") cut at the first dot of the S3 path.
            remote_candidates = [
                bamfile + ".bai",
                bamfile.rsplit(".", 1)[0] + ".bai",
            ]
            found = False
            for remotebaifile in remote_candidates:
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                    found = True
                    break
            if not found:
                logging.debug("BAM index cannot be found in S3!")
                sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    # Wrap range() in list(): on Python 3 a range cannot be added to a list.
    chrs = [opts.chr] if opts.chr else (list(range(1, 23)) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for seqid in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, seqid, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        # NOTE(review): split(".")[0] cuts at the first dot of the whole
        # path - confirm this is intended for local BAM paths with dots.
        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        # Runs inside workdir (see os.chdir above) and removes every file there.
        sh("rm -f *")