Example #1
0
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam)
            and not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()),
                   "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file,
                                             in_bam) else alt_index_file
Example #2
0
def _run_scalpel_paired(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    This is used for paired tumor / normal samples.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-paired-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            paired = get_paired_bams(align_bams, items)
            if not paired.normal_bam:
                ann_file = _run_scalpel_caller(align_bams, items, ref_file,
                                               assoc_files, region, out_file)
                return ann_file
            vcfstreamsort = config_utils.get_program("vcfstreamsort", config)
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            db_file = os.path.join(tmp_path, "main", "somatic.db")
            if not os.path.exists(db_file + ".dir"):
                if os.path.exists(tmp_path):
                    utils.remove_safe(tmp_path)
                opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
                opts += " --ref {}".format(ref_file)
                opts += " --dir %s" % tmp_path
                # caling
                cl = ("{perl_exports} && "
                      "scalpel-discovery --somatic {opts} --tumor {paired.tumor_bam} --normal {paired.normal_bam}")
                do.run(cl.format(**locals()), "Genotyping paired variants with Scalpel", {})
            # filtering to adjust input parameters
            bed_opts = " ".join(_scalpel_bed_file_opts(items, config, out_file, region, tmp_path))
            use_defaults = True
            if use_defaults:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic.indel.vcf")
            # Uses default filters but can tweak min-alt-count-tumor and min-phred-fisher
            # to swap precision for sensitivity
            else:
                scalpel_tmp_file = os.path.join(tmp_path, "main/somatic-indel-filter.vcf.gz")
                with file_transaction(config, scalpel_tmp_file) as tx_indel_file:
                    cmd = ("{perl_exports} && "
                           "scalpel-export --somatic {bed_opts} --ref {ref_file} --db {db_file} "
                           "--min-alt-count-tumor 5 --min-phred-fisher 10 --min-vaf-tumor 0.1 "
                           "| bgzip -c > {tx_indel_file}")
                    do.run(cmd.format(**locals()), "Scalpel somatic indel filter", {})
            scalpel_tmp_file = bgzip_and_index(scalpel_tmp_file, config)
            scalpel_tmp_file_common = bgzip_and_index(os.path.join(tmp_path, "main/common.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            bcftools_cmd_common = get_scalpel_bcftools_filter_expression("reject", config)
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("vcfcat <({bcftools_cmd_chi2} {scalpel_tmp_file}) "
                   "<({bcftools_cmd_common} {scalpel_tmp_file_common}) | "
                   " {fix_ambig} | {vcfstreamsort} {compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})

    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"), ref_file,
                                               config)
    return ann_file
Example #3
0
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(cmd.format(**locals()), "Index BAM file with sambamba: %s" % os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(cmd.format(**locals()), "Backup single thread index of BAM file with samtools: %s"
                       % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #4
0
def index(in_bam, config):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if (not utils.file_uptodate(index_file, in_bam) and
          not utils.file_uptodate(alt_index_file, in_bam)):
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            if sambamba:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
            else:
                cmd = "{samtools} index {tx_bam_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_uptodate(index_file, in_bam) else alt_index_file
def _move_file_with_sizecheck(tx_file, final_file):
    """Move transaction file to final location,
       with size checks avoiding failed transfers.

       Creates an empty file with '.bcbiotmp' extention in the destination
       location, which serves as a flag. If a file like that is present,
       it means that transaction didn't finish successfully.
    """

    logger.debug("Moving %s to %s" % (tx_file, final_file))

    tmp_file = final_file + ".bcbiotmp"
    open(tmp_file, 'wb').close()

    want_size = utils.get_size(tx_file)
    shutil.move(tx_file, final_file)
    transfer_size = utils.get_size(final_file)

    assert want_size == transfer_size, (
        'distributed.transaction.file_transaction: File copy error: '
        'file or directory on temporary storage ({}) size {} bytes '
        'does not equal size of file or directory after transfer to '
        'shared storage ({}) size {} bytes'.format(tx_file, want_size,
                                                   final_file, transfer_size))
    utils.remove_safe(tmp_file)
Example #6
0
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if not os.path.exists(db):
            if not os.path.exists(compress):
                subprocess.check_call(
                    ["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            shutil.move(os.path.join(kraken, "minikraken_20140330"),
                        os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
    else:
        raise argparse.ArgumentTypeError(
            "kraken not installed in tooldir %s." %
            os.path.join(tooldir, "bin", "kraken"))
Example #7
0
def _move_file_with_sizecheck(tx_file, final_file):
    """Move transaction file to final location,
       with size checks avoiding failed transfers.

       Creates an empty file with '.bcbiotmp' extention in the destination
       location, which serves as a flag. If a file like that is present,
       it means that transaction didn't finish successfully.
    """

    #logger.debug("Moving %s to %s" % (tx_file, final_file))

    tmp_file = final_file + ".bcbiotmp"
    open(tmp_file, 'wb').close()

    want_size = utils.get_size(tx_file)
    shutil.move(tx_file, final_file)
    transfer_size = utils.get_size(final_file)

    assert want_size == transfer_size, (
        'distributed.transaction.file_transaction: File copy error: '
        'file or directory on temporary storage ({}) size {} bytes '
        'does not equal size of file or directory after transfer to '
        'shared storage ({}) size {} bytes'.format(
            tx_file, want_size, final_file, transfer_size)
    )
    utils.remove_safe(tmp_file)
Example #8
0
def _run_workflow(data, workflow_file, work_dir):
    """Run Strelka2 analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data), "--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(os.path.join(work_dir, "workspace"))
Example #9
0
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.urlopen(url).info().getheader("Last-Modified")
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace("minikraken", "old"))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]), os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print "You have the latest version %s." % last_mod
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." % os.path.join(tooldir, "bin", "kraken"))
Example #10
0
def _run_workflow(data, workflow_file, work_dir):
    """Run Strelka2 analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data), "--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(os.path.join(work_dir, "workspace"))
Example #11
0
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(
            index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(
            alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            cmd = "{samtools} index -@ {num_cores} {in_bam} {tx_index_file}"
            do.run(cmd.format(**locals()),
                   "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #12
0
def _run_workflow(data, workflow_file, work_dir):
    """Run Strelka2 analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    cmd = [utils.get_program_python("configureStrelkaGermlineWorkflow.py"),
           workflow_file, "-m", "local", "-j", dd.get_num_cores(data), "--quiet"]
    do.run(cmd, "Run Strelka2: %s" % dd.get_sample_name(data))
    utils.remove_safe(os.path.join(work_dir, "workspace"))
Example #13
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    data = paired.tumor_data if paired else items[0]
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data)]
    do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(os.path.join(work_dir, "workspace"))
Example #14
0
def remove(in_bam):
    """
    remove bam file and the index if exists
    """
    if utils.file_exists(in_bam):
        utils.remove_safe(in_bam)
    if utils.file_exists(in_bam + ".bai"):
        utils.remove_safe(in_bam + ".bai")
Example #15
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    data = paired.tumor_data if paired else items[0]
    cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data)]
    do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(os.path.join(work_dir, "workspace"))
Example #16
0
def remove(in_bam):
    """
    remove bam file and the index if exists
    """
    if utils.file_exists(in_bam):
        utils.remove_safe(in_bam)
    if utils.file_exists(in_bam + ".bai"):
        utils.remove_safe(in_bam + ".bai")
Example #17
0
def _run_scalpel_caller(align_bams,
                        items,
                        ref_file,
                        assoc_files,
                        region=None,
                        out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(
                tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(
                _scalpel_options_from_config(items, config, out_file, region,
                                             tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = (
                "{perl_exports} && "
                "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} "
            )
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(
                os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression(
                "chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = (
                "{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example #18
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    data = paired.tumor_data if paired else items[0]
    out_file = os.path.join(work_dir, "results", "variants",
                            "somaticSV.vcf.gz" if paired and paired.normal_bam else "diploidSV.vcf.gz")
    if not utils.file_exists(out_file):
        utils.remove_safe(os.path.join(work_dir, "workspace"))
        cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data),
               "--quiet"]
        do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    return out_file
Example #19
0
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = in_file + ".gbi"
    if tz.get_in(["algorithm", "align_split_size"], config) is not False:
        if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
            utils.remove_safe(gbi_file)
            with file_transaction(data, gbi_file) as tx_gbi_file:
                tx_in_file = os.path.splitext(tx_gbi_file)[0]
                utils.symlink_plus(in_file, tx_in_file)
                do.run([grabix, "index", tx_in_file], "Index input with grabix: %s" % os.path.basename(in_file))
        assert utils.file_exists(gbi_file)
    return [gbi_file]
Example #20
0
def tabix_index(in_file, config, preset=None, tabix_args=None):
    """Index a file using tabix.
    """
    in_file = os.path.abspath(in_file)
    out_file = in_file + ".tbi"
    if not utils.file_exists(out_file) or not utils.file_uptodate(out_file, in_file):
        # Remove old index files to prevent linking into tx directory
        utils.remove_safe(out_file)
        with file_transaction(config, out_file) as tx_out_file:
            tabix = tools.get_tabix_cmd(config)
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            if tabix_args:
                cmd = "{tabix} -f {tabix_args} {tx_in_file}"
            else:
                preset = _guess_preset(in_file) if preset is None else preset
                cmd = "{tabix} -f -p {preset} {tx_in_file}"
            do.run(cmd.format(**locals()), "tabix index %s" % os.path.basename(in_file))
    return out_file
Example #21
0
def _run_workflow(items, paired, workflow_file, work_dir):
    """Run manta analysis inside prepared workflow directory.
    """
    data = paired.tumor_data if paired else items[0]
    if paired:
        if paired.normal_bam:
            base_file = "somaticSV.vcf.gz"
        else:
            base_file = "tumorSV.vcf.gz"
    else:
        base_file = "diploidSV.vcf.gz"
    out_file = os.path.join(work_dir, "results", "variants", base_file)
    if not utils.file_exists(out_file):
        utils.remove_safe(os.path.join(work_dir, "workspace"))
        cmd = [sys.executable, workflow_file, "-m", "local", "-j", dd.get_num_cores(data),
               ]
        do.run(cmd, "Run manta SV analysis")
    utils.remove_safe(os.path.join(work_dir, "workspace"))
    return out_file
Example #22
0
def _grabix_index(data):
    """Create grabix index of bgzip input file.

    grabix does not allow specification of output file, so symlink the original
    file into a transactional directory.
    """
    in_file = data["bgzip_file"]
    config = data["config"]
    grabix = config_utils.get_program("grabix", config)
    gbi_file = in_file + ".gbi"
    # We always build grabix input so we can use it for counting reads and doing downsampling
    if not utils.file_exists(gbi_file) or _is_partial_index(gbi_file):
        utils.remove_safe(gbi_file)
        with file_transaction(data, gbi_file) as tx_gbi_file:
            tx_in_file = os.path.splitext(tx_gbi_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            do.run([grabix, "index", tx_in_file], "Index input with grabix: %s" % os.path.basename(in_file))
    assert utils.file_exists(gbi_file)
    return [gbi_file]
Example #23
0
def _run_scalpel_caller(align_bams, items, ref_file, assoc_files,
                          region=None, out_file=None):
    """Detect indels with Scalpel.

    Single sample mode.
    """
    config = items[0]["config"]
    if out_file is None:
        out_file = "%s-variants.vcf.gz" % os.path.splitext(align_bams[0])[0]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            if len(align_bams) > 1:
                message = ("Scalpel does not currently support batch calling!")
                raise ValueError(message)
            input_bams = " ".join("%s" % x for x in align_bams)
            tmp_path = "%s-scalpel-work" % utils.splitext_plus(out_file)[0]
            tx_tmp_path = "%s-scalpel-work" % utils.splitext_plus(tx_out_file)[0]
            if os.path.exists(tmp_path):
                utils.remove_safe(tmp_path)
            opts = " ".join(_scalpel_options_from_config(items, config, out_file, region, tmp_path))
            opts += " --dir %s" % tx_tmp_path
            min_cov = "3"  # minimum coverage
            opts += " --mincov %s" % min_cov
            perl_exports = utils.get_perl_exports(os.path.dirname(tx_out_file))
            cmd = ("{perl_exports} && "
                   "scalpel-discovery --single {opts} --ref {ref_file} --bam {input_bams} ")
            do.run(cmd.format(**locals()), "Genotyping with Scalpel", {})
            shutil.move(tx_tmp_path, tmp_path)
            # parse produced variant file further
            scalpel_tmp_file = bgzip_and_index(os.path.join(tmp_path, "variants.indel.vcf"), config)
            compress_cmd = "| bgzip -c" if out_file.endswith("gz") else ""
            bcftools_cmd_chi2 = get_scalpel_bcftools_filter_expression("chi2", config)
            sample_name_str = items[0]["name"][1]
            fix_ambig = vcfutils.fix_ambiguous_cl()
            cl2 = ("{bcftools_cmd_chi2} {scalpel_tmp_file} | "
                   r"sed 's/FORMAT\tsample\(_name\)\{{0,1\}}/FORMAT\t{sample_name_str}/g' "
                   "| {fix_ambig} | vcfallelicprimitives -t DECOMPOSED --keep-geno | vcffixup - | vcfstreamsort "
                   "{compress_cmd} > {tx_out_file}")
            do.run(cl2.format(**locals()), "Finalising Scalpel variants", {})
    ann_file = annotation.annotate_nongatk_vcf(out_file, align_bams,
                                               assoc_files.get("dbsnp"),
                                               ref_file, config)
    return ann_file
Example #24
0
def tabix_index(in_file, config, preset=None, tabix_args=None):
    """Index a file using tabix.
    """
    in_file = os.path.abspath(in_file)
    out_file = in_file + ".tbi"
    if not utils.file_exists(out_file) or not utils.file_uptodate(out_file, in_file):
        # Remove old index files to prevent linking into tx directory
        utils.remove_safe(out_file)
        with file_transaction(config, out_file) as tx_out_file:
            tabix = tools.get_tabix_cmd(config)
            tx_in_file = os.path.splitext(tx_out_file)[0]
            utils.symlink_plus(in_file, tx_in_file)
            if tabix_args:
                cmd = "{tabix} -f {tabix_args} {tx_in_file}"
            else:
                preset = _guess_preset(in_file) if preset is None else preset
                cmd = "{tabix} -f -p {preset} {tx_in_file}"
            do.run(cmd.format(**locals()), "tabix index %s" % os.path.basename(in_file))
    return out_file
Example #25
0
def _rename_allelic_fraction_field(orig_file,config):
    """Rename allelic fraction field in mutect output
       from FA to FREQ to standarize with other tools
    """
    tmp_file = orig_file.replace(".vcf.gz","-fix.vcf")
    with file_transaction(tmp_file) as tx_in_file:       
        with open(tx_in_file,'w') as out_handle:
            with open_gzipsafe(orig_file) as handle:
                for line in handle:
                    if line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA","=FREQ")
                    if not line.startswith("#"):
                        line = line.replace("FA","FREQ")
                    out_handle.write(line)
    out_file = orig_file.replace(".gz","")
    remove_safe(orig_file)
    shutil.move(tmp_file, out_file )
    with open(tmp_file, "w") as out_handle:
        out_handle.write("Moved to {0}".format(out_file))
    out_file = bgzip_and_index(out_file, config)
Example #26
0
def _rename_allelic_fraction_field(orig_file, config):
    """Rename allelic fraction field in mutect output
       from FA to FREQ to standarize with other tools
    """
    tmp_file = orig_file.replace(".vcf.gz", "-fix.vcf")
    with file_transaction(config, tmp_file) as tx_in_file:
        with open(tx_in_file, 'w') as out_handle:
            with open_gzipsafe(orig_file) as handle:
                for line in handle:
                    if line.startswith("##FORMAT=<ID=FA"):
                        line = line.replace("=FA", "=FREQ")
                    if not line.startswith("#"):
                        line = line.replace("FA", "FREQ")
                    out_handle.write(line)
    out_file = orig_file.replace(".gz", "")
    remove_safe(orig_file)
    shutil.move(tmp_file, out_file)
    with open(tmp_file, "w") as out_handle:
        out_handle.write("Moved to {0}".format(out_file))
    out_file = bgzip_and_index(out_file, config)
Example #27
0
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    import requests
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    requests.packages.urllib3.disable_warnings()
    last_mod = urllib.request.urlopen(url).info().get('Last-Modified')
    last_mod = dateutil.parser.parse(last_mod).astimezone(dateutil.tz.tzutc())
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(db):
            is_new_version = True
        else:
            cur_file = glob.glob(os.path.join(kraken, "minikraken_*"))[0]
            cur_version = datetime.datetime.utcfromtimestamp(
                os.path.getmtime(cur_file))
            is_new_version = last_mod.date() > cur_version.date()
            if is_new_version:
                shutil.move(cur_file, cur_file.replace('minikraken', 'old'))
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if is_new_version:
            if not os.path.exists(compress):
                subprocess.check_call(
                    ["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            last_version = glob.glob(os.path.join(kraken, "minikraken_*"))
            utils.symlink_plus(os.path.join(kraken, last_version[0]),
                               os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
        else:
            print("You have the latest version %s." % last_mod)
    else:
        raise argparse.ArgumentTypeError(
            "kraken not installed in tooldir %s." %
            os.path.join(tooldir, "bin", "kraken"))
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or /tmp/bcbiotx.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being process or a
    configuration dictionary.
    """
    base_dir = base_dir or os.getcwd()
    tmpdir_base = utils.get_abspath(_get_base_tmpdir(data, base_dir))
    utils.safe_makedir(tmpdir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmpdir_base)
    logger.debug("Created tmp dir %s " % tmp_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            utils.remove_safe(tmp_dir)
Example #29
0
def _install_kraken_db(datadir, args):
    """Install kraken minimal DB in genome folder.
    """
    kraken = os.path.join(datadir, "genomes/kraken")
    url = "https://ccb.jhu.edu/software/kraken/dl/minikraken.tgz"
    compress = os.path.join(kraken, os.path.basename(url))
    base, ext = utils.splitext_plus(os.path.basename(url))
    db = os.path.join(kraken, base)
    tooldir = args.tooldir or get_defaults()["tooldir"]
    if os.path.exists(os.path.join(tooldir, "bin", "kraken")):
        if not os.path.exists(kraken):
            utils.safe_makedir(kraken)
        if not os.path.exists(db):
            if not os.path.exists(compress):
                subprocess.check_call(["wget", "-O", compress, url, "--no-check-certificate"])
            cmd = ["tar", "-xzvf", compress, "-C", kraken]
            subprocess.check_call(cmd)
            shutil.move(os.path.join(kraken, "minikraken_20140330"), os.path.join(kraken, "minikraken"))
            utils.remove_safe(compress)
    else:
        raise argparse.ArgumentTypeError("kraken not installed in tooldir %s." %
                                         os.path.join(tooldir, "bin", "kraken"))
def _move_tmp_files(safe, orig):
    exts = {
        ".vcf": ".idx",
        ".bam": ".bai",
        ".vcf.gz": ".tbi",
        ".bed.gz": ".tbi"
    }

    utils.safe_makedir(os.path.dirname(orig))
    # If we are rolling back a directory and it already exists
    # this will avoid making a nested set of directories
    if os.path.isdir(orig) and os.path.isdir(safe):
        utils.remove_safe(orig)

    _move_file_with_sizecheck(safe, orig)
    # Move additional, associated files in the same manner
    for check_ext, check_idx in exts.iteritems():
        if not safe.endswith(check_ext):
            continue
        safe_idx = safe + check_idx
        if os.path.exists(safe_idx):
            _move_file_with_sizecheck(safe_idx, orig + check_idx)
Example #31
0
def tx_tmpdir(data=None, base_dir=None, remove=True):
    """Context manager to create and remove a transactional temporary directory.

    Handles creating a transactional directory for running commands in. Will
    use either the current directory or /tmp/bcbiotx.

    Creates an intermediary location and time specific directory for global
    temporary directories to prevent collisions.

    data can be the full world information object being process or a
    configuration dictionary.
    """
    base_dir = base_dir or os.getcwd()
    tmpdir_base = utils.get_abspath(_get_base_tmpdir(data, base_dir))
    utils.safe_makedir(tmpdir_base)
    tmp_dir = tempfile.mkdtemp(dir=tmpdir_base)
    #logger.debug("Created tmp dir %s " % tmp_dir)
    try:
        yield tmp_dir
    finally:
        if remove:
            utils.remove_safe(tmp_dir)
Example #32
0
def _move_tmp_files(safe, orig):
    exts = {
        ".vcf": ".idx",
        ".bam": ".bai",
        ".vcf.gz": ".tbi",
        ".bed.gz": ".tbi"
    }

    utils.safe_makedir(os.path.dirname(orig))
    # If we are rolling back a directory and it already exists
    # this will avoid making a nested set of directories
    if os.path.isdir(orig) and os.path.isdir(safe):
        utils.remove_safe(orig)

    _move_file_with_sizecheck(safe, orig)
    # Move additional, associated files in the same manner
    for check_ext, check_idx in exts.items():
        if not safe.endswith(check_ext):
            continue
        safe_idx = safe + check_idx
        if os.path.exists(safe_idx):
            _move_file_with_sizecheck(safe_idx, orig + check_idx)
Example #33
0
def run(name, chip_bam, rep_bam, input_bam, gtf_file, out_dir, rlength, rpair, config):
    """
    Run rmats for muatant and control samples avoiding
    errors due to samples.
    """
    # output file name need to have the caller name
    MATS_output = os.path.join(out_dir, name + "_MATS_output")
    MATS_dir = os.path.join(out_dir, "MATS_output")
    rmats_file = os.path.join(out_dir, "summary.txt")
    out_file = os.path.join(out_dir, name + "_summary.txt")
    '''myCmd = 'samtools view '+chip_bam+' | head -n 1'
    status,output=commands.getstatusoutput(myCmd)
    rlength=len(output.strip().split('\t')[9])'''
    libType = _get_stranded_flag(config)
    if rep_bam != "":
            chip_bam = chip_bam + "," + rep_bam
    if utils.file_exists(out_file):
        return out_file
    rmats = config_utils.get_program("rmats", config)
    options = " ".join(config_utils.get_resources("rmats", config).get("options", ""))
    with utils.chdir(out_dir):
        cmd = _rmats_cmd()
        try:
            do.run(cmd.format(**locals()), "rmats for %s" % name)
            utils.move_safe(rmats_file, out_file)
            utils.move_safe(MATS_dir, MATS_output)
            repdir_dir = os.path.join(out_dir,"SAMPLE_1")
            utils.remove_safe(repdir_dir)
            repdir_dir = os.path.join(out_dir,"SAMPLE_2")
            utils.remove_safe(repdir_dir)
            print repdir_dir
        except subprocess.CalledProcessError:
            raise RuntimeWarning("rMATS terminated with an error.\n"
                                 "Please, check the message and report "
                                 "error if it is related to bcbio.\n"
                                 "You can add specific options for the sample "
                                 "setting resources as explained in docs: "
                                 "https://bcbio-nextgen.readthedocs.org/en/latest/contents/configuration.html#sample-specific-resources")
    return (out_file)
Example #34
0
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            cmd = "{samtools} index -@ {num_cores} {in_bam} {tx_index_file}"
            do.run(cmd.format(**locals()), "Index BAM file: %s" % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file
Example #35
0
def index(in_bam, config, check_timestamp=True):
    """Index a BAM file, skipping if index present.

    Centralizes BAM indexing providing ability to switch indexing approaches.
    """
    assert is_bam(in_bam), "%s in not a BAM file" % in_bam
    index_file = "%s.bai" % in_bam
    alt_index_file = "%s.bai" % os.path.splitext(in_bam)[0]
    if check_timestamp:
        bai_exists = utils.file_uptodate(
            index_file, in_bam) or utils.file_uptodate(alt_index_file, in_bam)
    else:
        bai_exists = utils.file_exists(index_file) or utils.file_exists(
            alt_index_file)
    if not bai_exists:
        # Remove old index files and re-run to prevent linking into tx directory
        for fname in [index_file, alt_index_file]:
            utils.remove_safe(fname)
        sambamba = _get_sambamba(config)
        assert sambamba, "Did not find sambamba for indexing"
        samtools = config_utils.get_program("samtools", config)
        num_cores = config["algorithm"].get("num_cores", 1)
        with file_transaction(config, index_file) as tx_index_file:
            assert tx_index_file.find(".bam.bai") > 0
            tx_bam_file = tx_index_file.replace(".bam.bai", ".bam")
            utils.symlink_plus(in_bam, tx_bam_file)
            try:
                cmd = "{sambamba} index -t {num_cores} {tx_bam_file}"
                do.run(
                    cmd.format(**locals()),
                    "Index BAM file with sambamba: %s" %
                    os.path.basename(in_bam))
            except subprocess.CalledProcessError:
                cmd = "{samtools} index {in_bam} {tx_index_file}"
                do.run(
                    cmd.format(**locals()),
                    "Backup single thread index of BAM file with samtools: %s"
                    % os.path.basename(in_bam))
    return index_file if utils.file_exists(index_file) else alt_index_file