Example 1
def detect_fusions(data):
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning(
            "``fusion_mode`` is deprecated in favor of turning on "
            "callers with ``fusion_caller``. It will run pizzly and "
            "oncofuse for now, but will eventually have support "
            "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            data["fusion"] = {
                "fasta":
                os.path.join(pizzly_dir,
                             "%s.fusions.fasta" % dd.get_sample_name(data)),
                "json":
                os.path.join(pizzly_dir, "%s.json" % dd.get_sample_name(data))
            }
    if "ericscript" in fusion_caller:
        ericscript_dir = ericscript.run(data)
    return [[data]]
Example 2
def _find_mirge(data):
    try:
        mirge = config_utils.get_program("miRge2.0", data)
        return mirge
    except config_utils.CmdNotFound:
        logger.warning("miRge2.0 is not found. Install it first, and try again.")
    return None
Example 3
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff  --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) \
                                    else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
                        else os.path.join(out_dir, out_gff_fn)
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
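The command above is assembled with the `cmd.format(**locals())` idiom used throughout these examples: placeholders in the template string are filled by name from local variables. A minimal, self-contained sketch of that pattern with invented values (not the real mirtop invocation):

import os

def build_cmd(input_fn, out_dir):
    # Toy stand-ins for the values _mirtop computes; the paths are illustrative only.
    sps = "hsa"
    hairpin = os.path.join("/path/to/db", "hairpin.fa")
    gtf = os.path.join("/path/to/db", "mirbase.gff3")
    out_tx = out_dir
    # Each {name} placeholder is resolved from locals() by variable name.
    cmd = ("mirtop gff --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    return cmd.format(**locals())

print(build_cmd("sample.mirna", "out"))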
Example 4
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"]["variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(data)
        clean_cov_bed = clean_file(dd.get_coverage(data), data, prefix="cov-", simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning("Can't run Seq2C without a svregions or variant_regions BED file")
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
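merge_overlaps above collapses overlapping BED segments once up front so later parallel steps do not have to. A minimal sketch of what merging overlapping intervals means, in plain Python rather than bcbio's BED utilities (the interval coordinates are made up):

def merge_intervals(intervals):
    # intervals: (start, end) pairs on one chromosome, possibly overlapping
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlaps the previous interval: extend it.
            merged[-1] = (merged[-1][0], max(merged[-1][1], end))
        else:
            merged.append((start, end))
    return merged

print(merge_intervals([(10, 50), (40, 80), (100, 120)]))  # [(10, 80), (100, 120)]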
Example 5
def clean_inputs(data):
    """Clean BED input files to avoid overlapping segments that cause downstream issues.

    Pre-merges inputs to avoid needing to call multiple times during later parallel steps.
    """
    if not utils.get_in(data, ("config", "algorithm", "variant_regions_orig")):
        data["config"]["algorithm"][
            "variant_regions_orig"] = dd.get_variant_regions(data)
    clean_vr = clean_file(dd.get_variant_regions(data), data)
    merged_vr = merge_overlaps(clean_vr, data)
    data["config"]["algorithm"]["variant_regions"] = clean_vr
    data["config"]["algorithm"]["variant_regions_merged"] = merged_vr

    if dd.get_coverage(data):
        if not utils.get_in(data, ("config", "algorithm", "coverage_orig")):
            data["config"]["algorithm"]["coverage_orig"] = dd.get_coverage(
                data)
        clean_cov_bed = clean_file(dd.get_coverage(data),
                                   data,
                                   prefix="cov-",
                                   simple=True)
        merged_cov_bed = merge_overlaps(clean_cov_bed, data)
        data["config"]["algorithm"]["coverage"] = clean_cov_bed
        data["config"]["algorithm"]["coverage_merged"] = merged_cov_bed

    if 'seq2c' in get_svcallers(data):
        seq2c_ready_bed = prep_seq2c_bed(data)
        if not seq2c_ready_bed:
            logger.warning(
                "Can't run Seq2C without a svregions or variant_regions BED file"
            )
        else:
            data["config"]["algorithm"]["seq2c_bed_ready"] = seq2c_ready_bed
    return data
Example 6
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar(
            "bcbio.coverage",
            config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning(
            "No coverage calculations: Did not find bcbio.coverage jar from system config"
        )
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file),
                                              "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + [
        "-jar", bc_jar, "multicompare", config_file, out_file, "-c",
        str(config["algorithm"]["num_cores"])
    ]
    do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 7
def _get_samples_to_process(fn, out_dir, config, force_single):
    """parse csv file with one line per file. It will merge
    all files that have the same description name"""
    out_dir = os.path.abspath(out_dir)
    samples = defaultdict(list)
    with open(fn) as handle:
        for l in handle:
            cols = l.strip().split(",")
            if len(cols) > 0:
                if len(cols) < 2:
                    raise ValueError("Line needs 2 values: file and name.")
                if utils.file_exists(cols[0]) or is_gsm(cols[0]):
                    if cols[0].find(" ") > -1:
                        new_name = os.path.abspath(cols[0].replace(" ", "_"))
                        logger.warning("Space finds in %s. Linked to %s." % (cols[0], new_name))
                        logger.warning("Please, avoid names with spaces in the future.")
                        utils.symlink_plus(os.path.abspath(cols[0]), new_name)
                        cols[0] = new_name
                    samples[cols[1]].append(cols)
                else:
                    logger.info("skipping %s, File doesn't exist." % cols[0])
    for sample, items in samples.items():
        if is_fastq(items[0][0], True):
            fn = "fq_merge"
            ext = ".fastq.gz"
        elif is_bam(items[0][0]):
            fn = "bam_merge"
            ext = ".bam"
        elif is_gsm(items[0][0]):
            fn = "query_gsm"
            ext = ".fastq.gz"
        files = [os.path.abspath(fn_file[0]) if not is_gsm(fn_file[0]) else fn_file[0] for fn_file in items]
        samples[sample] = [{'files': _check_paired(files, force_single),
                            'out_file': os.path.join(out_dir, sample + ext),
                            'fn': fn,
                            'anno': items[0][2:],
                            'config': config,
                            'name': sample,
                            'out_dir': out_dir}]
    return [samples[sample] for sample in samples]
Example 8
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage", config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {"direction": "increase",
                                            "magnitude": config["algorithm"].get("num_cores", 1)}
    jvm_opts = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = ["-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"]
            cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare", config_file,
                                                     tx_out_file, "-c", str(config["algorithm"].get("num_cores", 1))]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 9
def detect_fusions(samples):
    """Run fusion with a standalone tool, specified in config
    as fusion_caller.
    If fusion_mode is True, and no fusion_caller is specified,
    or fusion_caller == 'aligner', it is assumed that gene fusion
    detection was run on the alignment step.
    """
    fusion_mode = dd.get_in_samples(samples, dd.get_fusion_mode)
    if not fusion_mode:
        return samples

    caller = dd.get_in_samples(samples, dd.get_fusion_caller)
    if not caller or caller == 'aligner':
        logger.info("No standalone fusion caller specified in the config.")
        return samples

    STANDALONE_CALLERS = {
        'ericscript': ericscript.run,
    }
    caller_fn = STANDALONE_CALLERS.get(caller)
    if not caller_fn:
        logger.warning("Gene fusion detection with %s is not supported."
                       "Supported callers:\n%s" %
                       ', '.join(STANDALONE_CALLERS.keys()))
        return samples

    logger.info("Running gene fusion detection with  %s" % caller)
    return [[caller_fn(s)] for s in dd.sample_data_iterator(samples)]
Example 10
def run(bam_file, data, out_dir):
    out_base = os.path.join(utils.safe_makedir(out_dir),
                            "%s-verifybamid" % (dd.get_sample_name(data)))
    out_file = out_base + ".selfSM"
    failed_file = out_base + ".failed"
    exts = [".out"]
    out = {}
    if not utils.file_exists(out_file) and not utils.file_exists(failed_file):
        with file_transaction(data, out_base) as tx_out_base:
            cmd = [
                "verifybamid2", "1000g.phase3", "100k",
                "b38" if dd.get_genome_build(data) == "hg38" else "b37",
                "--Reference",
                dd.get_ref_file(data), "--Output", tx_out_base,
                "--DisableSanityCheck"
            ]
            cmd += _get_input_args(bam_file, data, out_base)
            try:
                do.run(cmd, "VerifyBamID contamination checks")
            except subprocess.CalledProcessError as msg:

                def allowed_errors(l):
                    return (
                        l.find("Insufficient Available markers") >= 0
                        or l.find("No reads found in any of the regions") >= 0)

                if any([allowed_errors(l) for l in str(msg).split("\n")]):
                    logger.info(
                        "Skipping VerifyBamID, not enough overlapping markers found: %s"
                        % dd.get_sample_name(data))
                    with open(failed_file, "w") as out_handle:
                        out_handle.write(str(msg))
                else:
                    logger.warning(str(msg))
                    raise
            else:
                # Fix any sample name problems, for pileups
                shutil.move(tx_out_base + ".selfSM",
                            tx_out_base + ".selfSM.orig")
                with open(tx_out_base + ".selfSM.orig") as in_handle:
                    with open(tx_out_base + ".selfSM", "w") as out_handle:
                        sample_name = None
                        for line in in_handle:
                            if line.startswith("DefaultSampleName"):
                                line = line.replace("DefaultSampleName",
                                                    dd.get_sample_name(data))
                            # work around bug in finding SM from BAM RG at end of line
                            if len(line.strip().split("\t")) == 1:
                                sample_name = line.strip()
                                line = None
                            elif sample_name:
                                parts = line.split("\t")
                                parts[0] = sample_name
                                line = "\t".join(parts)
                                sample_name = None
                            if line:
                                out_handle.write(line)
                for e in exts + [".selfSM"]:
                    if os.path.exists(tx_out_base + e):
                        shutil.copy(tx_out_base + e, out_base + e)
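The rewrite loop near the end of this example replaces VerifyBamID's DefaultSampleName with the sample name and works around the SM field from the BAM read group spilling onto its own line. A toy reproduction of just that line-fixing logic, on invented input lines:

def fix_selfsm_lines(lines, sample_name):
    fixed, pending_name = [], None
    for line in lines:
        if line.startswith("DefaultSampleName"):
            line = line.replace("DefaultSampleName", sample_name)
        if len(line.strip().split("\t")) == 1:
            # A line with a single field is a sample name stranded on its own line.
            pending_name = line.strip()
            line = None
        elif pending_name:
            parts = line.split("\t")
            parts[0] = pending_name
            line = "\t".join(parts)
            pending_name = None
        if line:
            fixed.append(line)
    return fixed

print(fix_selfsm_lines(["NA12878", "\t0.99\t0.01"], "NA12878"))  # ['NA12878\t0.99\t0.01']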
Example 11
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = clean_ATAC(data)
    # for ATAC-seq, this will be the NF BAM
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example 12
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {
        "dataset": "1000g.phase3",
        "nvars": "100k",
        "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"
    }
    with file_transaction(data, out_base) as tx_out_base:
        cmd = [
            "verifybamid2", background["dataset"], background["nvars"],
            background["build"], "--Reference",
            dd.get_ref_file(data), "--Output", tx_out_base
        ]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:

            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0
                        or l.find("No reads found in any of the regions") >= 0)

            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info(
                    "Skipping VerifyBamID, not enough overlapping markers found: %s"
                    % dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                raise
        else:
Example 13
def _find_mirge(data):
    try:
        mirge = config_utils.get_program("miRge2.0", data)
        return mirge
    except config_utils.CmdNotFound:
        logger.warning("miRge2.0 is not found. Install it first, and try again.")
    return None
Example 14
def _mirtop(input_fn, sps, db, out_dir, config):
    """
    Convert to GFF3 standard format
    """
    hairpin = os.path.join(db, "hairpin.fa")
    gtf = os.path.join(db, "mirbase.gff3")
    if not file_exists(hairpin) or not file_exists(gtf):
        logger.warning("%s or %s are not installed. Skipping." % (hairpin, gtf))
        return None
    out_gtf_fn = "%s.gtf" % utils.splitext_plus(os.path.basename(input_fn))[0]
    out_gff_fn = "%s.gff" % utils.splitext_plus(os.path.basename(input_fn))[0]
    export = _get_env()
    cmd = ("{export} mirtop gff  --sps {sps} --hairpin {hairpin} "
           "--gtf {gtf} --format seqbuster -o {out_tx} {input_fn}")
    if not file_exists(os.path.join(out_dir, out_gtf_fn)) and \
       not file_exists(os.path.join(out_dir, out_gff_fn)):
        with tx_tmpdir() as out_tx:
            do.run(cmd.format(**locals()), "Do miRNA annotation for %s" % input_fn)
            with utils.chdir(out_tx):
                out_fn = out_gtf_fn if utils.file_exists(out_gtf_fn) \
                                    else out_gff_fn
                if utils.file_exists(out_fn):
                    shutil.move(os.path.join(out_tx, out_fn),
                                os.path.join(out_dir, out_fn))
    out_fn = out_gtf_fn if utils.file_exists(os.path.join(out_dir, out_gtf_fn)) \
                        else os.path.join(out_dir, out_gff_fn)
    if utils.file_exists(os.path.join(out_dir, out_fn)):
        return os.path.join(out_dir, out_fn)
Example 15
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = ("{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res").format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(rfam_file):
            try:
                do.run(cmd.format(**locals()), "Running mirdeep2.")
            except:
                logger.warning("mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues.")
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example 16
def _find_lib(data):
    """Find mirge libs"""
    options = " ".join(data.get('resources', {}).get('mirge', {}).get("options", ""))
    if options.find("-lib") > -1 and utils.file_exists(options.split()[1]):
        return options
    if not options:
        logger.warning("miRge libraries not found. Follow these instructions to install them:")
    return None
Example 17
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {
        "dataset": "1000g.phase3",
        "nvars": "100k",
        "build": "b38" if dd.get_genome_build(data) == "hg38" else "b37"
    }
    with file_transaction(data, out_base) as tx_out_base:
        num_cores = dd.get_num_cores(data)
        cmd = [
            "verifybamid2", background["dataset"], background["nvars"],
            background["build"], "--Reference",
            dd.get_ref_file(data), "--Output", tx_out_base, "--NumThread",
            num_cores
        ]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:

            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0
                        or l.find("No reads found in any of the regions") >= 0)

            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info(
                    "Skipping VerifyBamID, not enough overlapping markers found: %s"
                    % dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                # don't escalate, it breaks some terminals on AWS Ubuntu
                # raise
        else:
            # Fix any sample name problems, for pileups
            shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
            with open(tx_out_base + ".selfSM.orig") as in_handle:
                with open(tx_out_base + ".selfSM", "w") as out_handle:
                    sample_name = None
                    for line in in_handle:
                        if line.startswith("DefaultSampleName"):
                            line = line.replace("DefaultSampleName",
                                                dd.get_sample_name(data))
                        # work around bug in finding SM from BAM RG at end of line
                        if len(line.strip().split("\t")) == 1:
                            sample_name = line.strip()
                            line = None
                        elif sample_name:
                            parts = line.split("\t")
                            parts[0] = sample_name
                            line = "\t".join(parts)
                            sample_name = None
                        if line:
                            out_handle.write(line)
            for e in exts + [".selfSM"]:
                if os.path.exists(tx_out_base + e):
                    shutil.copy(tx_out_base + e, out_base + e)
Example 18
def run_peddy(samples, out_dir=None):
    vcf_file = None
    for d in samples:
        vcinfo = variant.get_active_vcinfo(d)
        if vcinfo and vcinfo.get("vrn_file") and utils.file_exists(vcinfo["vrn_file"]):
            if vcinfo["vrn_file"] and dd.get_sample_name(d) in vcfutils.get_samples(vcinfo["vrn_file"]):
                vcf_file = vcinfo["vrn_file"]
                break
    data = samples[0]
    peddy = config_utils.get_program("peddy", data) if config_utils.program_installed("peddy", data) else None
    if not peddy or not vcf_file or not is_human(data):
        logger.info("peddy is not installed, not human or sample VCFs don't match, skipping correspondence checking "
                    "for %s." % vcf_file)
        return samples
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"
    peddyfiles = expected_peddy_files(peddy_report, batch)
    if file_exists(peddy_report):
        return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    num_cores = dd.get_num_cores(data)

    with tx_tmpdir(data) as tx_dir:
        peddy_prefix_tx = os.path.join(tx_dir, os.path.basename(peddy_prefix))
        # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
        stderr_log = os.path.join(tx_dir, "run-stderr.log")
        cmd = "{peddy} -p {num_cores} --plot --prefix {peddy_prefix_tx} {vcf_file} {ped_file} 2> {stderr_log}"
        message = "Running peddy on {vcf_file} against {ped_file}."
        try:
            do.run(cmd.format(**locals()), message.format(**locals()))
        except:
            to_show = collections.deque(maxlen=100)
            with open(stderr_log) as in_handle:
                for line in in_handle:
                    to_show.append(line)
            if any([l.find("IndexError") >=0 and l.find("is out of bounds for axis") >= 0
                    for l in to_show]):
                logger.info("Skipping peddy because no variants overlap with checks: %s" % batch)
                with open(peddy_prefix + "-failed.log", "w") as out_handle:
                    out_handle.write("peddy did not find overlaps with 1kg sites in VCF, skipping")
                return samples
            else:
                logger.warning("".join(to_show))
                raise
        for ext in PEDDY_OUT_EXTENSIONS:
            if os.path.exists(peddy_prefix_tx + ext):
                shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
Example 19
def _download_srx(url, out_dir):
    cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
    out_dir = os.path.abspath(utils.safe_makedir(out_dir))
    with utils.chdir(out_dir):
        try:
            do.run(cmd, "Download %s" % url )
        except:
            logger.warning("Sample path not found in database. Skipping.")
            traceback.print_exc()
            return None
    return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]
Example 20
def _download_srx(srxid, url, out_dir):
    cmd = "wget -N -r -nH -nd -np -nv {0}".format(url)
    out_dir = os.path.abspath(utils.safe_makedir(out_dir))
    with utils.chdir(out_dir):
        try:
            do.run(cmd, "Download %s" % url)
        except:
            logger.warning("Sample path not found in database. Skipping.")
            traceback.print_exc()
            return None
    return [os.path.join(out_dir, fn) for fn in os.listdir(out_dir)]
Example 21
def summary(*samples):
    """Summarize all quality metrics together"""
    samples = utils.unpack_worlds(samples)
    work_dir = dd.get_work_dir(samples[0])
    multiqc = config_utils.get_program("multiqc", samples[0]["config"])
    if not multiqc:
        logger.debug("multiqc not found. Update bcbio_nextgen.py tools to fix this issue.")
    folders = []
    opts = ""
    out_dir = os.path.join(work_dir, "multiqc")
    out_data = os.path.join(work_dir, "multiqc", "multiqc_data")
    out_file = os.path.join(out_dir, "multiqc_report.html")
    samples = _report_summary(samples, os.path.join(out_dir, "report"))
    for data in samples:
        for program, pfiles in tz.get_in(["summary", "qc"], data, {}).iteritems():
            if isinstance(pfiles, dict):
                pfiles = pfiles["base"]
            folders.append(os.path.dirname(pfiles))
    # XXX temporary workaround until we can handle larger inputs through MultiQC
    folders = list(set(folders))
    if len(folders) > 250:
        logger.warning("Too many samples for MultiQC, only using first 250 entries.")
        folders = folders[:250]
        opts = "--flat"
    # Back compatible -- to migrate to explicit specifications in input YAML
    folders += ["trimmed", "htseq-count/*summary"]
    if not utils.file_exists(out_file):
        with utils.chdir(work_dir):
            input_dir = " ".join([_check_multiqc_input(d) for d in folders])
            export_tmp = ""
            if dd.get_tmp_dir(samples[0]):
                export_tmp = "export TMPDIR=%s &&" % dd.get_tmp_dir(samples[0])
            if input_dir.strip():
                cmd = "{export_tmp} {multiqc} -f {input_dir} -o {tx_out} {opts}"
                with tx_tmpdir(data, work_dir) as tx_out:
                    do.run(cmd.format(**locals()), "Run multiqc")
                    if utils.file_exists(os.path.join(tx_out, "multiqc_report.html")):
                        shutil.move(os.path.join(tx_out, "multiqc_report.html"), out_file)
                        shutil.move(os.path.join(tx_out, "multiqc_data"), out_data)
    out = []
    for i, data in enumerate(samples):
        if i == 0:
            if utils.file_exists(out_file):
                data_files = glob.glob(os.path.join(out_dir, "multiqc_data", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.bed"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.txt"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*", "*.tsv"))
                data_files += glob.glob(os.path.join(out_dir, "report", "*.R*"))
                if "summary" not in data:
                    data["summary"] = {}
                data["summary"]["multiqc"] = {"base": out_file, "secondary": data_files}
        out.append(data)
    return [[d] for d in out]
Example 22
def chipqc(bam_file, sample, out_dir):
    """Attempt code to run ChIPQC bioconductor packate in one sample"""
    work_dir = dd.get_work_dir(sample)
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        # local_sitelib = utils.R_sitelib()
        rscript = utils.Rscript_cmd()
        do.run([rscript, rcode], "ChIPQC in %s" % sample_name, log_error=False)
        shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
Example 23
def chipqc(bam_file, sample, out_dir):
    """Attempt code to run ChIPQC bioconductor packate in one sample"""
    sample_name = dd.get_sample_name(sample)
    logger.warning("ChIPQC is unstable right now, if it breaks, turn off the tool.")
    if utils.file_exists(out_dir):
        return _get_output(out_dir)
    with tx_tmpdir() as tmp_dir:
        rcode = _sample_template(sample, tmp_dir)
        if rcode:
            # local_sitelib = utils.R_sitelib()
            rscript = utils.Rscript_cmd()
            do.run([rscript, "--no-environ", rcode], "ChIPQC in %s" % sample_name, log_error=False)
            shutil.move(tmp_dir, out_dir)
    return _get_output(out_dir)
Example 24
def _get_env():
    conda = os.path.join(os.path.dirname(sys.executable), "conda")
    anaconda = os.path.join(os.path.dirname(sys.executable), "..")
    cl = ("{conda} list --json -f seqbuster").format(**locals())
    with closing(subprocess.Popen(cl, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT, shell=True).stdout) as stdout:
        try:
            version = stdout.readlines()[2].strip().split()[1]
            if LooseVersion(version) >= LooseVersion("3"):
                logger.info("miraligner version %s" % version)
                return "JAVA_HOME=%s && " % anaconda
        except:
            logger.warning("Cannot detect miraligner version, asumming latest.")
    return ""
Example 25
def quantitate(data):
    """CWL target for quantitation.

    XXX Needs to be split and parallelized by expression caller, with merging
    of multiple calls.
    """
    data = to_single_data(to_single_data(data))
    data = generate_transcript_counts(data)[0][0]
    data["quant"] = {}
    if "sailfish" in dd.get_expression_caller(data):
        data = to_single_data(sailfish.run_sailfish(data)[0])
        data["quant"]["tsv"] = data["sailfish"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["sailfish"]),
                                             "abundance.h5")
    if ("kallisto" in dd.get_expression_caller(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        data = to_single_data(kallisto.run_kallisto_rnaseq(data)[0])
        data["quant"]["tsv"] = os.path.join(data["kallisto_quant"],
                                            "abundance.tsv")
        data["quant"]["hdf5"] = os.path.join(data["kallisto_quant"],
                                             "abundance.h5")
    if (os.path.exists(os.path.join(data["kallisto_quant"], "fusion.txt"))):
        data["quant"]["fusion"] = os.path.join(data["kallisto_quant"],
                                               "fusion.txt")
    else:
        data["quant"]["fusion"] = None
    if "salmon" in dd.get_expression_caller(data):
        if dd.get_quantify_genome_alignments(data):
            if dd.get_aligner(data).lower() != "star":
                if dd.get_genome_build(data) == "hg38":
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Since this is hg38 we will fall "
                        "back to the decoy method")
                    data = to_single_data(salmon.run_salmon_decoy(data)[0])
                else:
                    logger.warning(
                        "Whole genome alignment-based Salmon quantification is "
                        "only supported for the STAR aligner. Falling back to the "
                        "transcriptome-only method.")
                    data = to_single_data(salmon.run_salmon_reads(data)[0])
            else:
                data = to_single_data(salmon.run_salmon_bam(data)[0])
        else:
            data = to_single_data(salmon.run_salmon_reads(data)[0])
        data["quant"]["tsv"] = data["salmon"]
        data["quant"]["hdf5"] = os.path.join(os.path.dirname(data["salmon"]),
                                             "abundance.h5")
    return [[data]]
Example 26
def _check_stems(files):
    """Check whether any file stems repeat; if so, fall back to full paths."""
    used = set()
    for fn in files:
        if os.path.basename(fn) in used:
            logger.warning("%s stem is multiple times in your file list, "
                         "so we don't know "
                         "how to assign it to the sample data in the CSV. "
                         "We are gonna use full path to make a difference, "
                         "that means paired files should be in the same folder. "
                         "If this is a problem, you should rename the files you want "
                         "to merge. Sorry, no possible magic here." % os.path.basename(fn)
                         )
            return True
        used.add(os.path.basename(fn))
    return False
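A small, self-contained illustration of the stem check above: the same basename appearing under two directories is what triggers the full-path fallback (the file names are invented):

import os

def has_duplicate_stems(files):
    seen = set()
    for fn in files:
        stem = os.path.basename(fn)
        if stem in seen:
            return True
        seen.add(stem)
    return False

print(has_duplicate_stems(["run1/sample_1.fastq.gz", "run2/sample_1.fastq.gz"]))  # True
print(has_duplicate_stems(["run1/a_1.fastq.gz", "run1/a_2.fastq.gz"]))            # False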
Example 27
def detect_fusions(data):
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    if "oncofuse" in dd.get_fusion_caller(data, []):
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in dd.get_fusion_caller(data, []):
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
    return [[data]]
Example 28
def call_consensus(samples):
    """
    Call consensus peaks on the narrowPeak files from a set of
    ChIP/ATAC samples.
    """
    data = samples[0][0]
    new_samples = []
    consensusdir = os.path.join(dd.get_work_dir(data), "consensus")
    utils.safe_makedir(consensusdir)
    peakfiles = []
    for data in dd.sample_data_iterator(samples):
        if dd.get_chip_method(data) == "chip":
            for fn in tz.get_in(("peaks_files", "macs2"), data, []):
                if "narrowPeak" in fn:
                    peakfiles.append(fn)
                elif "broadPeak" in fn:
                    peakfiles.append(fn)
        elif dd.get_chip_method(data) == "atac":
            if bam.is_paired(dd.get_work_bam(data)):
                for fn in tz.get_in(("peaks_files", "NF", "macs2"), data, []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
            else:
                logger.info(
                    f"Using peaks from full fraction since {dd.get_work_bam(data)} is single-ended."
                )
                for fn in tz.get_in(("peaks_files", "full", "macs2"), data,
                                    []):
                    if "narrowPeak" in fn:
                        peakfiles.append(fn)
    consensusfile = os.path.join(consensusdir, "consensus.bed")
    if not peakfiles:
        logger.info(
            "No suitable peak files found, skipping consensus peak calling.")
        return samples
    consensusfile = consensus(peakfiles, consensusfile, data)
    if not utils.file_exists(consensusfile):
        logger.warning("No consensus peaks found.")
        return samples
    saffile = consensus_to_saf(consensusfile,
                               os.path.splitext(consensusfile)[0] + ".saf")
    for data in dd.sample_data_iterator(samples):
        data = tz.assoc_in(data, ("peaks_files", "consensus"),
                           {"main": consensusfile})
        new_samples.append([data])
    return new_samples
Example 29
def _check_stems(files):
    """Check whether any file stems repeat; if so, fall back to full paths."""
    used = set()
    for fn in files:
        if os.path.basename(fn) in used:
            logger.warning(
                "%s appears more than once as a file stem in your file list, "
                "so we don't know how to assign it to the sample data in the CSV. "
                "We will use the full path to distinguish the files, which means "
                "paired files should be in the same folder. If this is a problem, "
                "rename the files you want to merge. Sorry, no magic here." %
                os.path.basename(fn))
            return True
        used.add(os.path.basename(fn))
    return False
Example 30
def _find_lib(data):
    """Find mirge libs"""
    options = " ".join(data.get('resources', {}).get('mirge', {}).get("options", ""))
    if options.find("-lib") > -1 and utils.file_exists(options.split()[1]):
        return options
    if not options:
        logger.warning("miRge libraries not found. Follow these instructions to install them:")
        logger.warning("https://github.com/mhalushka/miRge#download-libraries")
        logger.warning("Then, pass -lib LIB_PATH with resourcces:mirge:options:[...]")
        logger.warning("More information: https://bcbio-nextgen.readthedocs.io/en/latest/contents/pipelines.html#smallrna-seq")
Example 31
def _generate_estimates(bam_file, out_base, failed_file, exts, data):
    background = {"dataset": "1000g.phase3",
                  "nvars": "100k",
                  "build":"b38" if dd.get_genome_build(data) == "hg38" else "b37"}
    with file_transaction(data, out_base) as tx_out_base:
        cmd = ["verifybamid2", background["dataset"], background["nvars"], background["build"],
               "--Reference", dd.get_ref_file(data), "--Output", tx_out_base]
        cmd += _get_input_args(bam_file, data, out_base, background)
        try:
            do.run(cmd, "VerifyBamID contamination checks")
        except subprocess.CalledProcessError as msg:
            def allowed_errors(l):
                return (l.find("Insufficient Available markers") >= 0 or
                        l.find("No reads found in any of the regions") >= 0)
            if any([allowed_errors(l) for l in str(msg).split("\n")]):
                logger.info("Skipping VerifyBamID, not enough overlapping markers found: %s" %
                            dd.get_sample_name(data))
                with open(failed_file, "w") as out_handle:
                    out_handle.write(str(msg))
            else:
                logger.warning(str(msg))
                raise
        else:
            # Fix any sample name problems, for pileups
            shutil.move(tx_out_base + ".selfSM", tx_out_base + ".selfSM.orig")
            with open(tx_out_base + ".selfSM.orig") as in_handle:
                with open(tx_out_base + ".selfSM", "w") as out_handle:
                    sample_name = None
                    for line in in_handle:
                        if line.startswith("DefaultSampleName"):
                            line = line.replace("DefaultSampleName", dd.get_sample_name(data))
                        # work around bug in finding SM from BAM RG at end of line
                        if len(line.strip().split("\t")) == 1:
                            sample_name = line.strip()
                            line = None
                        elif sample_name:
                            parts = line.split("\t")
                            parts[0] = sample_name
                            line = "\t".join(parts)
                            sample_name = None
                        if line:
                            out_handle.write(line)
            for e in exts + [".selfSM"]:
                if os.path.exists(tx_out_base + e):
                    shutil.copy(tx_out_base + e, out_base + e)
Example 32
def _get_env():
    conda = os.path.join(os.path.dirname(sys.executable), "conda")
    anaconda = os.path.join(os.path.dirname(sys.executable), "..")
    cl = ("{conda} list --json -f seqbuster").format(**locals())
    with closing(
            subprocess.Popen(cl,
                             stdout=subprocess.PIPE,
                             stderr=subprocess.STDOUT,
                             shell=True).stdout) as stdout:
        try:
            version = stdout.readlines()[2].strip().split()[1]
            if LooseVersion(version) >= LooseVersion("3"):
                logger.info("miraligner version %s" % version)
                return "JAVA_HOME=%s && " % anaconda
        except:
            logger.warning(
                "Cannot detect miraligner version, asumming latest.")
    return ""
Example 33
def quantitate_expression_parallel(samples, run_parallel):
    """
    Quantitate expression; all programs run here should be multithreaded to
    take advantage of the threaded run_parallel environment
    """
    data = samples[0][0]
    to_index = determine_indexes_to_make(samples)
    samples = run_parallel("generate_transcript_counts", samples)
    if "cufflinks" in dd.get_expression_caller(data):
        samples = run_parallel("run_cufflinks", samples)
    if "stringtie" in dd.get_expression_caller(data):
        samples = run_parallel("run_stringtie_expression", samples)
    if ("kallisto" in dd.get_expression_caller(data)
            or dd.get_fusion_mode(data)
            or "pizzly" in dd.get_fusion_caller(data, [])):
        run_parallel("run_kallisto_index", [to_index])
        samples = run_parallel("run_kallisto_rnaseq", samples)
    if "sailfish" in dd.get_expression_caller(data):
        run_parallel("run_sailfish_index", [to_index])
        samples = run_parallel("run_sailfish", samples)

    # always run salmon
    run_parallel("run_salmon_index", [to_index])
    if dd.get_quantify_genome_alignments(data):
        if dd.get_aligner(data).lower() != "star":
            if dd.get_genome_build(data) == "hg38":
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Since this is hg38 we will fall "
                    "back to the decoy method")
                samples = run_parallel("run_salmon_decoy", samples)
            else:
                logger.warning(
                    "Whole genome alignment-based Salmon quantification is "
                    "only supported for the STAR aligner. Falling back to the "
                    "transcriptome-only method.")
                samples = run_parallel("run_salmon_reads", samples)
        else:
            samples = run_parallel("run_salmon_bam", samples)
    else:
        samples = run_parallel("run_salmon_reads", samples)

    samples = run_parallel("detect_fusions", samples)
    return samples
Example 34
def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, "
                       "skipping. " % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                do.run(
                    "touch %s && echo '# failed' >> %s" %
                    (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example 35
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar("bcbio.coverage", config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning("No coverage calculations: Did not find bcbio.coverage jar from system config")
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file), "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    jvm_opts = resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"])
    java_args = ["-Djava.io.tmpdir=%s" % tmp_dir]
    cmd = ["java"] + jvm_opts + java_args + ["-jar", bc_jar, "multicompare", config_file,
                                             out_file, "-c", str(config["algorithm"]["num_cores"])]
    do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 36
def run(data):
    if not aligner_supports_fusion(data):
        aligner = dd.get_aligner(data)
        logger.warning("Oncofuse is not supported for the %s aligner, "
                       "skipping. " % aligner)
        return None
    config = data["config"]
    genome_build = data.get("genome_build", "")
    input_type, input_dir, input_file = _get_input_para(data)
    if genome_build == "GRCh37":  # assume genome_build is hg19 otherwise
        if config["algorithm"].get("aligner") in ["star"]:
            if file_exists(input_file):
                input_file = _fix_star_junction_output(input_file)
        if config["algorithm"].get("aligner") in ["tophat", "tophat2"]:
            if file_exists(input_file):
                input_file = _fix_tophat_junction_output(input_file)
    elif "hg19" not in genome_build:
        return None
    #handle cases when fusion file doesn't exist
    if not file_exists(input_file):
        return None
    out_file = os.path.join(input_dir, "oncofuse_out.txt")
    if file_exists(out_file):
        return out_file
    oncofuse = config_utils.get_program("oncofuse", config)

    tissue_type = _oncofuse_tissue_arg_from_config(data)
    resources = config_utils.get_resources("oncofuse", config)
    if not file_exists(out_file):
        cl = [oncofuse]
        cl += resources.get("jvm_opts", ["-Xms750m", "-Xmx5g"])
        with file_transaction(data, out_file) as tx_out_file:
            cl += [input_file, input_type, tissue_type, tx_out_file]
            cmd = " ".join(cl)
            try:
                do.run(cmd, "oncofuse fusion detection", data)
            except:
                do.run("touch %s && echo '# failed' >> %s" % (tx_out_file, tx_out_file), "oncofuse failed", data)
                #return out_file
    return out_file
Example 37
def summary(samples, config):
    """Provide summary information on a single sample across regions of interest.
    """
    try:
        bc_jar = config_utils.get_jar(
            "bcbio.coverage",
            config_utils.get_program("bcbio_coverage", config, "dir"))
    except ValueError:
        logger.warning(
            "No coverage calculations: Did not find bcbio.coverage jar from system config"
        )
        return [[x] for x in samples]
    config_file, out_file = _prep_coverage_config(samples, config)
    tmp_dir = utils.safe_makedir(os.path.join(os.path.dirname(out_file),
                                              "tmp"))
    resources = config_utils.get_resources("bcbio_coverage", config)
    config = copy.deepcopy(config)
    config["algorithm"]["memory_adjust"] = {
        "direction": "increase",
        "magnitude": config["algorithm"].get("num_cores", 1)
    }
    jvm_opts = config_utils.adjust_opts(
        resources.get("jvm_opts", ["-Xms750m", "-Xmx2g"]), config)
    if not utils.file_exists(out_file):
        with file_transaction(out_file) as tx_out_file:
            java_args = [
                "-Djava.io.tmpdir=%s" % tmp_dir, "-Djava.awt.headless=true"
            ]
            cmd = ["java"] + jvm_opts + java_args + [
                "-jar", bc_jar, "multicompare", config_file, tx_out_file, "-c",
                str(config["algorithm"].get("num_cores", 1))
            ]
            do.run(cmd, "Summarizing coverage with bcbio.coverage", samples[0])
    out = []
    for x in samples:
        x["coverage"] = {"summary": out_file}
        out.append([x])
    return out
Example 38
def detect_fusions(data):
    data = to_single_data(data)
    # support the old style of fusion mode calling
    if dd.get_fusion_mode(data, False):
        data = dd.set_fusion_caller(data, ["oncofuse", "pizzly"])
        logger.warning("``fusion_mode`` is deprecated in favor of turning on "
                       "callers with ``fusion_caller``. It will run pizzly and "
                       "oncofuse for now, but will eventually have support "
                       "dropped.")
    fusion_caller = dd.get_fusion_caller(data, [])
    if "oncofuse" in fusion_caller:
        oncofuse_file = oncofuse.run(data)
        if oncofuse_file:
            data = dd.set_oncofuse_file(data, oncofuse_file)
    if "pizzly" in fusion_caller:
        pizzly_dir = pizzly.run_pizzly(data)
        if pizzly_dir:
            data = dd.set_pizzly_dir(data, pizzly_dir)
            data["fusion"] = {"fasta": os.path.join(pizzly_dir, "%s.fusions.fasta" % dd.get_sample_name(data)),
                              "json": os.path.join(pizzly_dir, "%s.json" % dd.get_sample_name(data))}
    if "ericscript" in fusion_caller:
        ericscript_dir = ericscript.run(data)
    return [[data]]
Example 39
def _find_lib(data):
    """Find mirge libs"""
    options = " ".join(
        data.get('resources', {}).get('mirge', {}).get("options", ""))
    if options.find("-lib") > -1 and utils.file_exists(options.split()[1]):
        return options
    if not options:
        logger.warning(
            "miRge libraries not found. Follow these instructions to install them:"
        )
        logger.warning("https://github.com/mhalushka/miRge#download-libraries")
        logger.warning(
            "Then, pass -lib LIB_PATH with resourcces:mirge:options:[...]")
        logger.warning(
            "More information: https://bcbio-nextgen.readthedocs.io/en/latest/contents/pipelines.html#smallrna-seq"
        )
Example 40
def run(data):
    config = data[0][0]['config']
    work_dir = dd.get_work_dir(data[0][0])
    genome = dd.get_ref_file(data[0][0])
    mirdeep2 = os.path.join(os.path.dirname(sys.executable), "miRDeep2.pl")
    perl_exports = get_perl_exports()
    hairpin, mature, species = "none", "none", "na"
    rfam_file = dd.get_mirdeep2_file(data[0][0])
    if file_exists(dd.get_mirbase_hairpin(data[0][0])):
        species = dd.get_species(data[0][0])
        hairpin = dd.get_mirbase_hairpin(data[0][0])
        mature = dd.get_mirbase_mature(data[0][0])

    logger.debug("Preparing for mirdeep2 analysis.")
    bam_file = op.join(work_dir, "align", "seqs.bam")
    seqs_dir = op.join(work_dir, "seqcluster", "prepare")
    collapsed = op.join(seqs_dir, "seqs.ma")
    out_dir = op.join(work_dir, "mirdeep2")
    out_file = op.join(out_dir, "result_res.csv")
    safe_makedir(out_dir)
    with chdir(out_dir):
        collapsed, bam_file = _prepare_inputs(collapsed, bam_file, out_dir)
        cmd = (
            "{perl_exports} && perl {mirdeep2} {collapsed} {genome} {bam_file} {mature} none {hairpin} -f {rfam_file} -r simple -c -P -t {species} -z res"
        ).format(**locals())
        if file_exists(mirdeep2) and not file_exists(out_file) and file_exists(
                rfam_file):
            try:
                do.run(cmd.format(**locals()), "Running mirdeep2.")
            except:
                logger.warning(
                    "mirdeep2 failed. Please report the error to https://github.com/lpantano/mirdeep2_core/issues."
                )
        if file_exists(out_file):
            novel_db = _parse_novel(out_file, dd.get_species(data[0][0]))
            return novel_db
Example 41
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    work_bam = dd.get_work_bam(data)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
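remove_blacklist_regions itself is not shown here; as a rough sketch of the idea, reads overlapping the ENCODE blacklist can be dropped with bedtools. This is only an illustration under that assumption, not bcbio's implementation, and filter_blacklist is a hypothetical helper:

import subprocess

def filter_blacklist(bam_file, blacklist_bed, out_bam):
    # keep only reads that do NOT overlap blacklist intervals (-v); bedtools writes BAM to stdout
    cmd = ["bedtools", "intersect", "-v", "-abam", bam_file, "-b", blacklist_bed]
    with open(out_bam, "wb") as out_handle:
        subprocess.check_call(cmd, stdout=out_handle)
    return out_bam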
Example no. 42
0
def clean_chipseq_alignment(data):
    # lcr_bed = utils.get_in(data, ("genome_resources", "variation", "lcr"))
    method = dd.get_chip_method(data)
    if method == "atac":
        data = shift_ATAC(data)
    work_bam = dd.get_work_bam(data)
    work_bam = bam.sort(work_bam, dd.get_config(data))
    bam.index(work_bam, dd.get_config(data))
    # an unfiltered BAM file is useful for calculating some metrics later
    data = tz.assoc_in(data, ['chipseq', 'align', "unfiltered"], work_bam)
    clean_bam = remove_nonassembled_chrom(work_bam, data)
    clean_bam = remove_mitochondrial_reads(clean_bam, data)
    data = atac.calculate_complexity_metrics(clean_bam, data)
    if not dd.get_keep_multimapped(data):
        clean_bam = remove_multimappers(clean_bam, data)
    if not dd.get_keep_duplicates(data):
        clean_bam = bam.remove_duplicates(clean_bam, data)
    data["work_bam"] = clean_bam
    # for ATAC-seq, break alignments into NF, mono/di/tri nucleosome BAM files
    if method == "atac":
        data = atac.split_ATAC(data)
    encode_bed = tz.get_in(
        ["genome_resources", "variation", "encode_blacklist"], data)
    if encode_bed:
        data["work_bam"] = remove_blacklist_regions(dd.get_work_bam(data),
                                                    encode_bed, data['config'])
        bam.index(data["work_bam"], data['config'])
    try:
        data["bigwig"] = _normalized_bam_coverage(dd.get_sample_name(data),
                                                  dd.get_work_bam(data), data)
    except subprocess.CalledProcessError:
        logger.warning(f"{dd.get_work_bam(data)} was too sparse to normalize, "
                       f" falling back to non-normalized coverage.")
        data["bigwig"] = _bam_coverage(dd.get_sample_name(data),
                                       dd.get_work_bam(data), data)
    return [[data]]
Example no. 43
0
def _check_java_version(config, items):
    msg = java(config, items)
    if msg:
        logger.warning("miraligner is only compatible with java 1.7")
        return False
    return True
Example no. 44
0
def combine_pairs(input_files, force_single=False, full_name=False, separators=None):
    """ calls files pairs if they are completely the same except
    for one has _1 and the other has _2 returns a list of tuples
    of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3", "4"])

    pairs = []
    used = set([])
    used_separators = set([])
    separators = separators if separators else ("R", "_", "-", ".")
    for in_file in input_files:
        matches = set([])
        if in_file in used:
            continue
        if not force_single:
            for comp_file in input_files:
                if comp_file in used or comp_file == in_file:
                    continue
                if full_name:
                    in_file_name = in_file
                    comp_file_name = comp_file
                else:
                    in_file_name = os.path.basename(in_file)
                    comp_file_name = os.path.basename(comp_file)

                a = rstrip_extra(utils.splitext_plus(in_file_name)[0])
                b = rstrip_extra(utils.splitext_plus(comp_file_name)[0])
                if len(a) != len(b):
                    continue
                s = dif(a,b)
                # no differences, so it's the same file stem
                if len(s) == 0:
                    logger.error("%s and %s have the same stem, so we don't know "
                                 "how to assign it to the sample data in the CSV. To "
                                 "get around this you can rename one of the files. "
                                 "If they are meant to be the same sample run in two "
                                 "lanes, combine them first with the "
                                 "bcbio_prepare_samples.py script."
                                 "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                                 % (in_file, comp_file))
                    # continue
                    sys.exit(1)
                if len(s) > 1:
                    continue  # there is more than one difference
                if (a[s[0]] in PAIR_FILE_IDENTIFIERS and
                      b[s[0]] in PAIR_FILE_IDENTIFIERS):
                    # if the 1/2 isn't the last digit before a separator, skip
                    # this skips stuff like 2P 2A, often denoting replicates, not
                    # read pairings
                    if len(b) > (s[0] + 1):
                        if (b[s[0]+1] not in ("_", "-", ".")):
                            continue
                    # if the 1/2 is not a separator or prefaced with R, skip
                    if b[s[0] - 1] in separators:
                        used_separators.add(b[s[0] - 1])
                        if len(used_separators) > 1:
                            logger.warning("To split into paired reads multiple separators were used: %s" % used_separators)
                            logger.warning("This can lead to wrong assignation.")
                            logger.warning("Use --separator option in bcbio_prepare_samples.py to specify only one.")
                            logger.warning("For instance, --separator R.")
                        matches.update([in_file, comp_file])
                        used.update([in_file, comp_file])

            if matches:
                pairs.append(sort_filenames(list(matches)))
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
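A quick illustration of the pairing heuristic, assuming rstrip_extra, dif and sort_filenames behave as their names suggest for these simple stems; the file names are made up:

files = ["sampleA_R1.fastq.gz", "sampleA_R2.fastq.gz", "sampleB.fastq.gz"]
print(combine_pairs(files))
# expected: [['sampleA_R1.fastq.gz', 'sampleA_R2.fastq.gz'], ['sampleB.fastq.gz']]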
Example no. 45
0
def combine_pairs(input_files,
                  force_single=False,
                  full_name=False,
                  separators=None):
    """ calls files pairs if they are completely the same except
    for one has _1 and the other has _2 returns a list of tuples
    of pairs or singles.
    From bipy.utils (https://github.com/roryk/bipy/blob/master/bipy/utils.py)
    Adjusted to allow different input paths or extensions for matching files.
    """
    PAIR_FILE_IDENTIFIERS = set(["1", "2", "3", "4"])

    pairs = []
    used = set([])
    used_separators = set([])
    separators = separators if separators else ("R", "_", "-", ".")
    for in_file in input_files:
        matches = set([])
        if in_file in used:
            continue
        if not force_single:
            for comp_file in input_files:
                if comp_file in used or comp_file == in_file:
                    continue
                if full_name:
                    in_file_name = in_file
                    comp_file_name = comp_file
                else:
                    in_file_name = os.path.basename(in_file)
                    comp_file_name = os.path.basename(comp_file)

                a = rstrip_extra(utils.splitext_plus(in_file_name)[0])
                b = rstrip_extra(utils.splitext_plus(comp_file_name)[0])
                if len(a) != len(b):
                    continue
                s = dif(a, b)
                # no differences, so it's the same file stem
                if len(s) == 0:
                    logger.error(
                        "%s and %s have the same stem, so we don't know "
                        "how to assign it to the sample data in the CSV. To "
                        "get around this you can rename one of the files. "
                        "If they are meant to be the same sample run in two "
                        "lanes, combine them first with the "
                        "bcbio_prepare_samples.py script."
                        "(http://bcbio-nextgen.readthedocs.io/en/latest/contents/configuration.html#multiple-files-per-sample)"
                        % (in_file, comp_file))
                    # continue
                    sys.exit(1)
                if len(s) > 1:
                    continue  # there is more than one difference
                if (a[s[0]] in PAIR_FILE_IDENTIFIERS
                        and b[s[0]] in PAIR_FILE_IDENTIFIERS):
                    # if the 1/2 isn't the last digit before a separator, skip
                    # this skips stuff like 2P 2A, often denoting replicates, not
                    # read pairings
                    if len(b) > (s[0] + 1):
                        if (b[s[0] + 1] not in ("_", "-", ".")):
                            continue
                    # if the 1/2 is not a separator or prefaced with R, skip
                    if b[s[0] - 1] in separators:
                        used_separators.add(b[s[0] - 1])
                        if len(used_separators) > 1:
                            logger.warning(
                                "To split into paired reads multiple separators were used: %s"
                                % used_separators)
                            logger.warning(
                                "This can lead to wrong assignation.")
                            logger.warning(
                                "Use --separator option in bcbio_prepare_samples.py to specify only one."
                            )
                            logger.warning("For instance, --separator R.")
                        matches.update([in_file, comp_file])
                        used.update([in_file, comp_file])

            if matches:
                pairs.append(sort_filenames(list(matches)))
        if in_file not in used:
            pairs.append([in_file])
            used.add(in_file)
    return pairs
Example no. 46
0
def _check_java_version(config, items):
    msg = java(config, items)
    if msg:
        logger.warning("miraligner is only compatible with java 1.7")
        return False
    return True
Example no. 47
0
def run_peddy(samples, out_dir=None):
    data = samples[0]
    batch = dd.get_batch(data) or dd.get_sample_name(data)
    if isinstance(batch, (list, tuple)):
        batch = batch[0]
    if out_dir:
        peddy_dir = safe_makedir(out_dir)
    else:
        peddy_dir = safe_makedir(
            os.path.join(dd.get_work_dir(data), "qc", batch, "peddy"))
    peddy_prefix = os.path.join(peddy_dir, batch)
    peddy_report = peddy_prefix + ".html"

    vcf_file = None
    for d in samples:
        vcinfo = None
        if dd.get_phenotype(d) == "germline" or dd.get_phenotype(d) not in ["tumor"]:
            vcinfo = variant.get_active_vcinfo(d, use_ensemble=False)
        if not vcinfo and dd.get_phenotype(d) in ["tumor"]:
            vcinfo = variant.extract_germline_vcinfo(d, peddy_dir)
        if vcinfo:
            for key in ["germline", "vrn_file"]:
                if vcinfo and vcinfo.get(key) and utils.file_exists(
                        vcinfo[key]):
                    if vcinfo[key] and dd.get_sample_name(
                            d) in vcfutils.get_samples(vcinfo[key]):
                        if vcinfo[
                                key] and vcfutils.vcf_has_nonfiltered_variants(
                                    vcinfo[key]):
                            vcf_file = vcinfo[key]
                            break
    peddy = (config_utils.get_program("peddy", data)
             if config_utils.program_installed("peddy", data) else None)
    config_skips = any(["peddy" in dd.get_tools_off(d) for d in samples])
    if not peddy or not vcf_file or not vcfanno.is_human(data) or config_skips:
        if not peddy:
            reason = "peddy executable not found"
        elif config_skips:
            reason = "peddy in tools_off configuration"
        elif not vcfanno.is_human(data):
            reason = "sample is not human"
        else:
            assert not vcf_file
            reason = "no suitable VCF files found with the sample and non-filtered variants"
        msg = "Skipping peddy QC, %s: %s" % (
            reason, [dd.get_sample_name(d) for d in samples])
        with open(peddy_prefix + "-failed.log", "w") as out_handle:
            out_handle.write(msg)
        logger.info(msg)
        return samples
    if file_exists(peddy_prefix + "-failed.log"):
        return samples
    if not file_exists(peddy_report):
        ped_file = create_ped_file(samples, vcf_file, out_dir=out_dir)
        num_cores = dd.get_num_cores(data)
        with tx_tmpdir(data) as tx_dir:
            peddy_prefix_tx = os.path.join(tx_dir,
                                           os.path.basename(peddy_prefix))
            # Redirects stderr because incredibly noisy with no intervals found messages from cyvcf2
            stderr_log = os.path.join(tx_dir, "run-stderr.log")
            sites_str = "--sites hg38" if dd.get_genome_build(
                data) == "hg38" else ""
            locale = utils.locale_export()
            cmd = (
                "{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
                "{vcf_file} {ped_file} 2> {stderr_log}")
            message = "Running peddy on {vcf_file} against {ped_file}."
            try:
                do.run(cmd.format(**locals()), message.format(**locals()))
            except:
                to_show = collections.deque(maxlen=100)
                with open(stderr_log) as in_handle:
                    for line in in_handle:
                        to_show.append(line)

                def allowed_errors(l):
                    return (
                        (l.find("IndexError") >= 0
                         and l.find("is out of bounds for axis") >= 0) or
                        (l.find("n_components=") >= 0
                         and l.find("must be between 1 and n_features=") >= 0)
                        or (l.find("n_components=") >= 0
                            and l.find("must be between 1 and min") >= 0)
                        or (l.find(
                            "Input contains NaN, infinity or a value too large for dtype"
                        ) >= 0))

                def all_line_errors(l):
                    return (l.find("no intervals found for") >= 0)

                if any([allowed_errors(l) for l in to_show]) or all(
                    [all_line_errors(l) for l in to_show]):
                    logger.info(
                        "Skipping peddy because no variants overlap with checks: %s"
                        % batch)
                    with open(peddy_prefix + "-failed.log", "w") as out_handle:
                        out_handle.write(
                            "peddy did not find overlaps with 1kg sites in VCF, skipping"
                        )
                    return samples
                else:
                    logger.warning("".join(to_show))
                    raise
            for ext in PEDDY_OUT_EXTENSIONS:
                if os.path.exists(peddy_prefix_tx + ext):
                    shutil.move(peddy_prefix_tx + ext, peddy_prefix + ext)
    peddyfiles = expected_peddy_files(peddy_report, batch)
    return dd.set_in_samples(samples, dd.set_summary_qc, peddyfiles)
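For reference, an illustrative expansion of the peddy command template used above; every value below is a placeholder (the locale prefix in particular depends on what utils.locale_export() returns in a given environment):

cmd = ("{locale} {peddy} -p {num_cores} {sites_str} --plot --prefix {peddy_prefix_tx} "
       "{vcf_file} {ped_file} 2> {stderr_log}")
print(cmd.format(locale="export LC_ALL=C.UTF-8 &&", peddy="peddy", num_cores=8,
                 sites_str="--sites hg38", peddy_prefix_tx="/work/tx/batch1",
                 vcf_file="batch1.vcf.gz", ped_file="batch1.ped",
                 stderr_log="/work/tx/run-stderr.log"))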