Ejemplo n.º 1
0
def write_filtered(vcffile, lhome, store=None):
    if vcffile.startswith("s3://"):
        vcffile = pull_from_s3(vcffile)

    filteredvcf = op.basename(vcffile).replace(".vcf", ".filtered.vcf")
    cmd = "python {}/scripts/lobSTR_filter_vcf.py".format(lhome)
    cmd += " --vcf {}".format(vcffile)
    cmd += " --loc-cov 5 --loc-log-score 0.8"
    #cmd += " --loc-call-rate 0.8 --loc-max-ref-length 80"
    #cmd += " --call-cov 5 --call-log-score 0.8 --call-dist-end 20"
    sh(cmd, outfile=filteredvcf)

    if store:
        push_to_s3(store, filteredvcf)

    return filteredvcf
Ejemplo n.º 2
0
def write_filtered(vcffile, lhome, store=None):
    if vcffile.startswith("s3://"):
        vcffile = pull_from_s3(vcffile)

    filteredvcf = op.basename(vcffile).replace(".vcf", ".filtered.vcf")
    cmd = "python {}/scripts/lobSTR_filter_vcf.py".format(lhome)
    cmd += " --vcf {}".format(vcffile)
    cmd += " --loc-cov 5 --loc-log-score 0.8"
    #cmd += " --loc-call-rate 0.8 --loc-max-ref-length 80"
    #cmd += " --call-cov 5 --call-log-score 0.8 --call-dist-end 20"
    sh(cmd, outfile=filteredvcf)

    if store:
        push_to_s3(store, filteredvcf)

    return filteredvcf
Ejemplo n.º 3
0
def must_open(filename,
              mode="r",
              checkexists=False,
              skipcheck=False,
              oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith((".gz", ".bz2")):
            filename = " ".join(
                filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput

            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3

        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile

        fp = NamedTemporaryFile(mode=mode, delete=False)

    elif filename.endswith(".gz"):
        import gzip

        if "r" in mode:
            fp = gzip.open(filename, mode + "t")
        elif "w" in mode:
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif "w" in mode:
            import bz2

            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = ((not op.exists(filename))
                         if skipcheck else check_exists(filename, oappend))
            if overwrite:
                if oappend:
                    fp = open(filename, "a")
                else:
                    fp = open(filename, "w")
            else:
                logging.debug(
                    "File `{0}` already exists. Skipped.".format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Ejemplo n.º 4
0
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--haploid",
                 default="chrY,chrM",
                 help="Use haploid model for these chromosomes")
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--simulation",
                 default=False,
                 action="store_true",
                 help="Simulation mode")
    p.set_home("lobstr",
               default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    if opts.simulation:  # Simulation mode
        cmd, vcf_file = allelotype_on_chr(bamfile,
                                          "chr4",
                                          "/mnt/software/lobSTR/",
                                          "TREDs",
                                          haploid=opts.haploid)
        stats_file = vcf_file.rsplit(".", 1)[0] + ".allelotype.stats"
        results_dir = "lobstr_results"
        mkdir(results_dir)
        sh(cmd)
        sh("mv {} {}/ && rm {}".format(vcf_file, results_dir, stats_file))
        return

    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile,
                                             chr,
                                             lhome,
                                             lbidx,
                                             haploid=opts.haploid)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Ejemplo n.º 5
0
def lobstr(args):
    """
    %prog lobstr lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices. In
    addition, bamfile can be S3 location and --lobstr_home can be S3 location
    (e.g. s3://hli-mv-data-science/htang/str-build/lobSTR/)
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.set_home("lobstr", default="s3://hli-mv-data-science/htang/str-build/lobSTR/")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str-data")
    opts, args = p.parse_args(args)
    bamfile = opts.input_bam_path

    if len(args) < 1 or bamfile is None:
        sys.exit(not p.print_help())

    lbindices = args
    s3mode = bamfile.startswith("s3")
    store = opts.output_path
    cleanup = not opts.nocleanup
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    lhome = opts.lobstr_home
    if lhome.startswith("s3://"):
        lhome = pull_from_s3(lhome, overwrite=False)

    exec_id, sample_id = opts.workflow_execution_id, opts.sample_id
    prefix = [x for x in (exec_id, sample_id) if x]
    if prefix:
        pf = "_".join(prefix)
    else:
        pf = bamfile.split("/")[-1].split(".")[0]

    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        makefile = "makefile.{0}".format(lbidx)
        mm = MakeManager(filename=makefile)
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            filteredvcffile = vcffile.replace(".vcf", ".filtered.vcf")
            cmd = "python -m jcvi.variation.str filtervcf {}".format(vcffile)
            cmd += " --lobstr_home {}".format(lhome)
            mm.add(vcffile, filteredvcffile, cmd)
            vcffiles.append(filteredvcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)

        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if cleanup:
        mm.clean()
        sh("rm -f {} {} *.bai *.stats".format(bamfile, mm.makefile))
Ejemplo n.º 6
0
def must_open(filename, mode="r", checkexists=False, skipcheck=False, \
            oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith(".gz") or filename[0].endswith(".bz2"):
            filename = " ".join(filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3
        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile
        fp = NamedTemporaryFile(delete=False)

    elif filename.endswith(".gz"):
        if 'r' in mode:
            cmd = "zcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import gzip
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if 'r' in mode:
            cmd = "bzcat {0}".format(filename)
            fp = popen(cmd, debug=False)
        elif 'w' in mode:
            import bz2
            fp = bz2.BZ2File(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = (not op.exists(filename)) if skipcheck \
                        else check_exists(filename, oappend)
            if overwrite:
                if oappend:
                    fp = open(filename, "a")
                else:
                    fp = open(filename, "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."\
                        .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Ejemplo n.º 7
0
Archivo: str.py Proyecto: Hensonmw/jcvi
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        sh("rm -f *")
Ejemplo n.º 8
0
def must_open(filename, mode="r", checkexists=False, skipcheck=False, \
            oappend=False):
    """
    Accepts filename and returns filehandle.

    Checks on multiple files, stdin/stdout/stderr, .gz or .bz2 file.
    """
    if isinstance(filename, list):
        assert "r" in mode

        if filename[0].endswith((".gz", ".bz2")):
            filename = " ".join(
                filename)  # allow opening multiple gz/bz2 files
        else:
            import fileinput
            return fileinput.input(filename)

    if filename.startswith("s3://"):
        from jcvi.utils.aws import pull_from_s3
        filename = pull_from_s3(filename)

    if filename in ("-", "stdin"):
        assert "r" in mode
        fp = sys.stdin

    elif filename == "stdout":
        assert "w" in mode
        fp = sys.stdout

    elif filename == "stderr":
        assert "w" in mode
        fp = sys.stderr

    elif filename == "tmp" and mode == "w":
        from tempfile import NamedTemporaryFile
        fp = NamedTemporaryFile(delete=False)

    elif filename.endswith(".gz"):
        if "r" in mode:
            cmd = "gunzip -c %s" % filename
            from subprocess import Popen, PIPE
            fp = Popen(cmd,
                       bufsize=1,
                       stdout=PIPE,
                       shell=True,
                       universal_newlines=True).stdout
        elif "w" in mode:
            import gzip
            fp = gzip.open(filename, mode)

    elif filename.endswith(".bz2"):
        if "r" in mode:
            cmd = "bzcat -c %s" % filename
            from subprocess import Popen, PIPE
            fp = Popen(cmd,
                       bufsize=1,
                       stdout=PIPE,
                       shell=True,
                       universal_newlines=True).stdout
        elif "w" in mode:
            import bz2
            fp = bz2.open(filename, mode)

    else:
        if checkexists:
            assert mode == "w"
            overwrite = (not op.exists(filename)) if skipcheck \
                        else check_exists(filename, oappend)
            if overwrite:
                if oappend:
                    fp = open(filename, "a")
                else:
                    fp = open(filename, "w")
            else:
                logging.debug("File `{0}` already exists. Skipped."\
                        .format(filename))
                return None
        else:
            fp = open(filename, mode)

    return fp
Ejemplo n.º 9
0
def lobstr(args):
    """
    %prog lobstr bamfile lobstr_index1 lobstr_index2 ...

    Run lobSTR on a big BAM file. There can be multiple lobSTR indices.
    """
    p = OptionParser(lobstr.__doc__)
    p.add_option("--chr", help="Run only this chromosome")
    p.add_option("--prefix", help="Use prefix file name")
    p.set_home("lobstr")
    p.set_cpus()
    p.set_aws_opts(store="hli-mv-data-science/htang/str")
    opts, args = p.parse_args(args)

    if len(args) < 2:
        sys.exit(not p.print_help())

    bamfile = args[0]
    lbindices = args[1:]
    s3mode = bamfile.startswith("s3")
    store = opts.store
    workdir = opts.workdir
    mkdir(workdir)
    os.chdir(workdir)

    pf = opts.prefix or bamfile.split("/")[-1].split(".")[0]
    if s3mode:
        gzfile = pf + ".{0}.vcf.gz".format(lbindices[-1])
        remotegzfile = "s3://{0}/{1}".format(store, gzfile)
        if check_exists_s3(remotegzfile):
            logging.debug("Object `{0}` exists. Computation skipped."\
                            .format(remotegzfile))
            return
        localbamfile = pf + ".bam"
        localbaifile = localbamfile + ".bai"
        if op.exists(localbamfile):
            logging.debug("BAM file already downloaded.")
        else:
            pull_from_s3(bamfile, localbamfile)
        if op.exists(localbaifile):
            logging.debug("BAM index file already downloaded.")
        else:
            remotebaifile = bamfile + ".bai"
            if check_exists_s3(remotebaifile):
                pull_from_s3(remotebaifile, localbaifile)
            else:
                remotebaifile = bamfile.rsplit(".")[0] + ".bai"
                if check_exists_s3(remotebaifile):
                    pull_from_s3(remotebaifile, localbaifile)
                else:
                    logging.debug("BAM index cannot be found in S3!")
                    sh("samtools index {0}".format(localbamfile))
        bamfile = localbamfile

    lhome = opts.lobstr_home
    chrs = [opts.chr] if opts.chr else (range(1, 23) + ["X", "Y"])
    for lbidx in lbindices:
        mm = MakeManager(filename="makefile.{0}".format(lbidx))
        vcffiles = []
        for chr in chrs:
            cmd, vcffile = allelotype_on_chr(bamfile, chr, lhome, lbidx)
            mm.add(bamfile, vcffile, cmd)
            vcffiles.append(vcffile)

        gzfile = bamfile.split(".")[0] + ".{0}.vcf.gz".format(lbidx)
        cmd = "vcf-concat {0} | vcf-sort".format(" ".join(vcffiles))
        cmd += " | bgzip -c > {0}".format(gzfile)
        mm.add(vcffiles, gzfile, cmd)
        mm.run(cpus=opts.cpus)

        if s3mode:
            push_to_s3(store, gzfile)

    if opts.cleanup:
        sh("rm -f *")