Example #1
def _salmon_quant_reads(fq1, fq2, salmon_dir, index, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    salmon = config_utils.get_program("salmon", dd.get_config(data))
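    # build the salmon quant command; gzipped FASTQ inputs are streamed in via process substitution below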
    cmd = ("{salmon} quant -l A -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #2
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, salmon_dir)
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "-o {tx_out_dir} ")
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
    return out_file
Example #3
def trim_srna_sample(data):
    """
    Remove 3' adapters for smallRNA-seq.
    Uses atropos with different parameters than for other pipelines.
    """
    in_file = data["files"][0]
    names = data["rgnames"]['sample']
    work_dir = os.path.join(dd.get_work_dir(data), "trimmed")
    out_dir = os.path.join(work_dir, names)
    utils.safe_makedir(out_dir)
    out_file = replace_directory(append_stem(in_file, ".clean"), out_dir)
    trim_reads = data["config"]["algorithm"].get("trim_reads", True)
    if utils.file_exists(out_file):
        data["clean_fastq"] = out_file
        data["collapse"] = _collapse(data["clean_fastq"])
        data["size_stats"] = _summary(data['collapse'])
        return [[data]]

    adapter = dd.get_adapters(data)
    if trim_reads and not adapter and error_dnapi:
        raise ValueError(error_dnapi)
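    # fall back to de novo adapter prediction (DNApi) when no adapter is configured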
    adapters = adapter if adapter else _dnapi_prediction(in_file, out_dir)
    times = "" if len(adapters) == 1 else "--times %s" % len(adapters)
    if trim_reads and adapters:
        adapter_cmd = " ".join(map(lambda x: "-a " + x, adapters))
        out_noadapter_file = replace_directory(append_stem(in_file, ".fragments"), out_dir)
        out_short_file = replace_directory(append_stem(in_file, ".short"), out_dir)
        log_out = os.path.join(out_dir, "%s.log" % names)
        atropos = _get_atropos()
        options = " ".join(data.get('resources', {}).get('atropos', {}).get("options", ""))
        cores = ("--threads %s" % dd.get_num_cores(data) if dd.get_num_cores(data) > 1 else "")
        if " ".join(data.get('resources', {}).get('cutadapt', {}).get("options", "")):
            raise ValueError("Atropos is now used, but cutadapt options were found in the YAML file. "
                             "See https://atropos.readthedocs.io/en/latest/")
        cmd = _cmd_atropos()
        if not utils.file_exists(out_file):
            with file_transaction(out_file) as tx_out_file:
                do.run(cmd.format(**locals()), "remove adapter for %s" % names)
                if utils.file_exists(log_out):
                    content = open(log_out).read().replace(out_short_file, names)
                    open(log_out, 'w').write(content)
                if options:
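                    # run atropos a second time to apply user-supplied options from the resources configuration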
                    in_file = append_stem(tx_out_file, ".tmp")
                    utils.move_safe(tx_out_file, in_file)
                    cmd = "{atropos} {cores} {options} -se {in_file} -o {tx_out_file} -m 17"
                    do.run(cmd.format(**locals()), "applying extra atropos options %s for %s" % (options, names))
    else:
        if not trim_reads:
            logger.debug("Skip trimming for: %s" % names)
        elif not adapters:
            logger.info("No adapters found in %s; this often indicates a lack"
                        " of small RNA enrichment in your sample." % names)
        symlink_plus(in_file, out_file)
    data["clean_fastq"] = out_file
    data["collapse"] = _collapse(data["clean_fastq"])
    data["size_stats"] = _summary(data['collapse'])
    return [[data]]
Example #4
def _atropos_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with atropos.
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report_file, tx_out1 = tx_out[:2]
            if len(tx_out) > 2:
                tx_out2 = tx_out[2]
            # polyX trimming, anchored to the 3' ends of reads
            if "polyx" in dd.get_adapters(data):
                adapters += ["A{200}", "C{200}", "G{200}", "T{200}"]
            adapters_args = " ".join(["-a '%s'" % a for a in adapters])
            adapters_args += " --overlap 8"  # Avoid very short internal matches (default is 3)
            adapters_args += " --no-default-adapters --no-cache-adapters"  # Prevent GitHub queries and saving pickles
            aligner_args = "--aligner adapter"
            if len(fastq_files) == 1:
                cores = dd.get_num_cores(data)
                input_args = "-se %s" % objectstore.cl_input(fastq_files[0])
                output_args = "-o >(bgzip --threads {cores} -c > {tx_out1})".format(**locals())
            else:
                assert len(fastq_files) == 2, fastq_files
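                # paired-end: split cores between the two bgzip writers and also trim adapters from read 2 (-A)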
                cores = max(1, dd.get_num_cores(data) // 2)
                adapters_args = adapters_args + " " + " ".join(["-A '%s'" % a for a in adapters])
                input_args = "-pe1 %s -pe2 %s" % tuple([objectstore.cl_input(x) for x in fastq_files])
                output_args = ("-o >(bgzip --threads {cores} -c > {tx_out1}) "
                               "-p >(bgzip --threads {cores} -c > {tx_out2})").format(**locals())
            quality_base = "64" if dd.get_quality_format(data).lower() == "illumina" else "33"
            sample_name = dd.get_sample_name(data)
            report_args = "--report-file %s --report-formats json --sample-id %s" % (tx_report_file,
                                                                                     dd.get_sample_name(data))
            ropts = " ".join(str(x) for x in
                             config_utils.get_resources("atropos", data["config"]).get("options", []))
            extra_opts = []
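            # add default quality, length and NextSeq/polyG trimming options unless the user already set them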
            for k, alt_ks, v, want in [("--quality-cutoff", ["-q "], "5", True),
                                       ("--minimum-length", ["-m "], str(dd.get_min_read_length(data)), True),
                                       ("--nextseq-trim", [], "25", ("polyx" in dd.get_adapters(data) or
                                                                     "polyg" in dd.get_adapters(data)))]:
                if k not in ropts and not any(alt_k in ropts for alt_k in alt_ks):
                    if want:
                        extra_opts.append("%s=%s" % (k, v))
            extra_opts = " ".join(extra_opts)
            thread_args = ("--threads %s" % cores if cores > 1 else "")
            cmd = ("atropos trim {ropts} {thread_args} --quality-base {quality_base} --format fastq "
                   "{adapters_args} {input_args} {output_args} {report_args} {extra_opts}")
            do.run(cmd.format(**locals()), "Trimming with atropos: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #5
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions, "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
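        # run goleft depth in parallel, using the configured minimum coverage cutoff for callable regions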
        cmd = ["goleft", "depth", "--q", "1", "--mincov", str(params["min"]),
               "--processes", str(dd.get_num_cores(data)), "--ordered"]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                bam_ref_file = "%s-bamref.fa" % utils.splitext_plus(bam_file)[0]
                bam.fai_from_bam(dd.get_ref_file(data), bam_file, bam_ref_file + ".fai", data)
                cmd += ["--reference", bam_ref_file]
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions, data)
    return depth_file, final_callable, _extract_highdepth(final_callable, data), variant_regions_avg_cov
Example #6
def run(calls, data):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + [
        "--sample",
        dd.get_sample_name(data),
        "--reference",
        dd.get_ref_file(data),
        "--bam",
        dd.get_align_bam(data),
        "--outdir",
        work_dir,
    ]
    available_callers = 0
    for call in calls:
        if call["variantcaller"] in SUPPORTED:
            available_callers += 1
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if available_callers >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(
                dd.get_align_bam(data), os.path.join(tx_work_dir, "insert-stats.yaml")
            )
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--boost_ins", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        calls.append({"variantcaller": "metasv", "vrn_file": out_file})
    return calls
Example #7
def priority_total_coverage(data, out_dir):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    from bcbio.structural import prioritize
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file) or prioritize.is_gene_list(bed_file):
        return {}
    work_dir = safe_makedir(out_dir)
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        # data['priority_total_coverage'] = os.path.abspath(out_file)
        return out_file
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = clean_file(bed_file, data)
        with file_transaction(out_file) as tx_out_file:
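            # report coverage within the cleaned regions at each -T depth threshold (10x through 100x)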
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    # data['priority_total_coverage'] = os.path.abspath(out_file)
    return out_file
Example #8
def _run_cobalt(paired, work_dir):
    """Run Cobalt for counting read depth across genomic windows.

    PURPLE requires even 1000bp windows, so we use the integrated counting
    solution directly rather than converting from CNVkit calculations. If this
    approach proves useful, it should be moved upstream so other tools can use
    it as an input comparison.

    https://github.com/hartwigmedical/hmftools/tree/master/count-bam-lines
    """
    cobalt_dir = utils.safe_makedir(os.path.join(work_dir, "cobalt"))
    out_file = os.path.join(cobalt_dir, "%s.cobalt" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            cmd = ["COBALT"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-reference", paired.normal_name, "-reference_bam", paired.normal_bam,
                   "-tumor", paired.tumor_name, "-tumor_bam", paired.tumor_bam,
                   "-threads", dd.get_num_cores(paired.tumor_data),
                   "-output_dir", os.path.dirname(tx_out_file),
                   "-gc_profile", dd.get_variation_resources(paired.tumor_data)["gc_profile"]]
            cmd = "%s && %s" % (utils.get_R_exports(), " ".join([str(x) for x in cmd]))
            do.run(cmd, "PURPLE: COBALT read depth normalization")
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(cobalt_dir, f))
    return out_file
Example #9
def priority_total_coverage(data):
    """
    calculate coverage at 10 depth intervals in the priority regions
    """
    bed_file = dd.get_svprioritize(data)
    if not bed_file or not file_exists(bed_file):
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data
    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with tx_tmpdir(data, work_dir) as tmp_dir:
        cleaned_bed = os.path.join(tmp_dir, os.path.basename(bed_file))
        cleaned_bed = bed.decomment(bed_file, cleaned_bed)
        with file_transaction(out_file) as tx_out_file:
            cmd = ("{sambamba} depth region -t {nthreads} -L {cleaned_bed} "
                   "-F \"not unmapped\" "
                   "-T 10 -T 20 -T 30 -T 40 -T 50 -T 60 -T 70 -T 80 -T 90 -T 100 "
                   "{in_bam} -o {tx_out_file}")
            message = "Calculating coverage of {bed_file} regions in {in_bam}"
            do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #10
def combine_variant_files(orig_files, out_file, ref_file, config,
                          quiet_out=True, region=None):
    """Combine VCF files from the same sample into a single output file.

    Handles cases where we split files into SNPs/Indels for processing then
    need to merge back into a final file.
    """
    in_pipeline = False
    if isinstance(orig_files, dict):
        file_key = config["file_key"]
        in_pipeline = True
        orig_files = orig_files[file_key]
    if not utils.file_exists(out_file):
        with file_transaction(config, out_file) as tx_out_file:
            exist_files = [x for x in orig_files if os.path.exists(x)]
            ready_files = run_multicore(p_bgzip_and_index, [[x, config] for x in exist_files], config)
            dict_file = "%s.dict" % utils.splitext_plus(ref_file)[0]
            cores = dd.get_num_cores({"config": config})
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            cmd = ["picard"] + broad.get_picard_opts(config, memscale) + \
                  ["MergeVcfs", "D=%s" % dict_file, "O=%s" % tx_out_file] + \
                  ["I=%s" % f for f in ready_files]
            cmd = "%s && %s" % (utils.get_java_clprep(), " ".join(cmd))
            do.run(cmd, "Combine variant files")
    if out_file.endswith(".gz"):
        bgzip_and_index(out_file, config)
    if in_pipeline:
        return [{file_key: out_file, "region": region, "sam_ref": ref_file, "config": config}]
    else:
        return out_file
Example #11
def kallisto_rnaseq(fq1, fq2, kallisto_dir, gtf_file, fasta_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(kallisto_dir, "quant")
    safe_makedir(kallisto_dir)
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    kallisto = config_utils.get_program("kallisto", dd.get_config(data))
    index = kallisto_index(gtf_file, fasta_file, data, os.path.dirname(kallisto_dir))
    fusion_flag = "--fusion" if dd.get_fusion_mode(data) or dd.get_fusion_caller(data) else ""
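    # single-end kallisto quantification needs an explicit fragment length distribution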
    single_flag = "--single" if not fq2 else ""
    fraglength_flag = "--fragment-length=200" if not fq2 else ""
    sd_flag = "--sd=25" if not fq2 else ""
    bootstrap_flag = "--bootstrap-samples=30"
    fq2 = "" if not fq2 else fq2
    if not fq2:
        logger.warning("kallisto was run on single-end data with an estimated "
          "fragment length of 200 and a standard deviation of 25; if these "
          "don't reflect your data, the results may be inaccurate. Use with "
          "caution. See "
          "https://groups.google.com/forum/#!topic/kallisto-sleuth-users/h5LeAlWS33w "
          "for details.")
    cmd = ("{kallisto} quant {fusion_flag} -t {num_cores} {single_flag} "
           "{fraglength_flag} {sd_flag} {bootstrap_flag} "
           "-o {tx_out_dir} -i {index} {fq1} {fq2}")
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts with kallisto.")
        do.run(cmd.format(**locals()), message, None)
    return quant_dir
Example #12
def coverage(data):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    sambamba = config_utils.get_program("sambamba", data["config"])
    work_dir = safe_makedir(os.path.join(dd.get_work_dir(data), "report", "coverage"))
    if not bed_file:
        return data
    cleaned_bed = os.path.join(work_dir, os.path.splitext(os.path.basename(bed_file))[0] + ".cleaned.bed")
    cleaned_bed = bed.decomment(bed_file, cleaned_bed)

    with chdir(work_dir):
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with tx_tmpdir(data, work_dir) as tmp_dir:
                with file_transaction(parse_file) as out_tx:
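                    # the bare %s placeholder is filled in below with extra sambamba options ("-C 1000")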
                    cmd = ("{sambamba} depth region -F \"not unmapped\" -t {cores} "
                           "%s -T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 "
                           "-T 80 -T 100 -L {cleaned_bed} {in_bam} | sed 's/# "
                           "chrom/chrom/' > {out_tx}")
                    do.run(cmd.format(**locals()) % "-C 1000", "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, cleaned_bed, sample)
        _calculate_percentiles(os.path.abspath(parse_file), sample)
        data['coverage'] = os.path.abspath(parse_file)
    return data
Example #13
def filter_multimappers(align_file, data):
    """
    bowtie2 does not appear to have an equivalent of bowtie's -m 1 flag;
    there are options that come close but none do the same thing. bowtie2
    sets the XS tag for reads mapping to more than one place, so we can simply
    filter on that. This will not work for other aligners.
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    if file_exists(out_file):
        return out_file
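    # bowtie2 sets the XS tag on multi-mapping reads (see docstring), so require it to be absent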
    base_filter = '-F "[XS] == null and not unmapped {paired_filter}"'
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    return out_file
Example #14
def _run_somatic(paired, ref_file, target, out_file):
    """Run somatic calling with octopus, handling both paired and tumor-only cases.

    Tweaks for low frequency, tumor only and UMI calling documented in:
    https://github.com/luntergroup/octopus/blob/develop/configs/UMI.config
    """
    align_bams = paired.tumor_bam
    if paired.normal_bam:
        align_bams += " %s --normal-sample %s" % (paired.normal_bam, paired.normal_name)
    cores = dd.get_num_cores(paired.tumor_data)
    # Do not try to search below 0.4% currently as leads to long runtimes
    # https://github.com/luntergroup/octopus/issues/29#issuecomment-428167979
    min_af = max([float(dd.get_min_allele_fraction(paired.tumor_data)) / 100.0, 0.004])
    min_af_floor = min_af / 4.0
    cmd = ("octopus --threads {cores} --reference {ref_file} --reads {align_bams} "
           "--regions-file {target} "
           "--min-credible-somatic-frequency {min_af_floor} --min-expected-somatic-frequency {min_af} "
           "--downsample-above 4000 --downsample-target 4000 --min-kmer-prune 5 --min-bubble-score 20 "
           "--max-haplotypes 200 --somatic-snv-mutation-rate '5e-4' --somatic-indel-mutation-rate '1e-05' "
           "--target-working-memory 5G --target-read-buffer-footprint 5G --max-somatic-haplotypes 3 "
           "--caller cancer "
           "--working-directory {tmp_dir} "
           "-o {tx_out_file} --legacy")
    if not paired.normal_bam:
        cmd += (" --tumour-germline-concentration 5")
    if dd.get_umi_type(paired.tumor_data) or _is_umi_consensus_bam(paired.tumor_bam):
        cmd += (" --allow-octopus-duplicates --overlap-masking 0 "
                "--somatic-filter-expression 'GQ < 200 | MQ < 30 | SB > 0.2 | SD[.25] > 0.1 "
                "| BQ < 40 | DP < 100 | MF > 0.1 | AD < 5 | CC > 1.1 | GQD > 2'")
    with file_transaction(paired.tumor_data, out_file) as tx_out_file:
        tmp_dir = os.path.dirname(tx_out_file)
        do.run(cmd.format(**locals()), "Octopus somatic calling")
        _produce_compatible_vcf(tx_out_file, paired.tumor_data, is_somatic=True)
    return out_file
Example #15
def filter_multimappers(align_file, data):
    """
    Filtering a BWA alignment file for uniquely mapped reads, from here:
    https://bioinformatics.stackexchange.com/questions/508/obtaining-uniquely-mapped-reads-from-bwa-mem-alignment
    """
    config = dd.get_config(data)
    type_flag = "" if bam.is_bam(align_file) else "S"
    base, ext = os.path.splitext(align_file)
    out_file = base + ".unique" + ext
    bed_file = dd.get_variant_regions(data) or dd.get_sample_callable(data)
    bed_cmd = '-L {0}'.format(bed_file) if bed_file else " "
    if utils.file_exists(out_file):
        return out_file
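    # keep mapped, non-duplicate primary alignments with no alternative (XA) or supplementary (SA) hits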
    base_filter = '-F "not unmapped {paired_filter} and not duplicate and [XA] == null and [SA] == null and not supplementary " '
    if bam.is_paired(align_file):
        paired_filter = "and paired and proper_pair"
    else:
        paired_filter = ""
    filter_string = base_filter.format(paired_filter=paired_filter)
    sambamba = config_utils.get_program("sambamba", config)
    num_cores = dd.get_num_cores(data)
    with file_transaction(out_file) as tx_out_file:
        cmd = ('{sambamba} view -h{type_flag} '
               '--nthreads {num_cores} '
               '-f bam {bed_cmd} '
               '{filter_string} '
               '{align_file} '
               '> {tx_out_file}')
        message = "Removing multimapped reads from %s." % align_file
        do.run(cmd.format(**locals()), message)
    bam.index(out_file, config)
    return out_file
Example #16
def annotate_with_depth(in_file, items):
    """Annotate called VCF file with depth using duphold (https://github.com/brentp/duphold)

    Currently annotates single sample and tumor samples in somatic analysis.
    """
    bam_file = None
    if len(items) == 1:
        bam_file = dd.get_align_bam(items[0])
    else:
        paired = vcfutils.get_paired(items)
        if paired:
            bam_file = paired.tumor_bam
    if bam_file:
        out_file = "%s-duphold.vcf.gz" % utils.splitext_plus(in_file)[0]
        if not utils.file_exists(out_file):
            with file_transaction(items[0], out_file) as tx_out_file:
                if not in_file.endswith(".gz"):
                    in_file = vcfutils.bgzip_and_index(in_file, remove_orig=False,
                                                       out_dir=os.path.dirname(tx_out_file))
                ref_file = dd.get_ref_file(items[0])
                # cores for BAM reader thread, so max out at 4 based on recommendations
                cores = min([dd.get_num_cores(items[0]), 4])
                cmd = ("duphold --threads {cores} --vcf {in_file} --bam {bam_file} --fasta {ref_file} "
                       "-o {tx_out_file}")
                do.run(cmd.format(**locals()), "Annotate SV depth with duphold")
        vcfutils.bgzip_and_index(out_file)
        return out_file
    else:
        return in_file
Example #17
def run(bam_file, data, out_dir):
    """Run viral QC analysis.
    """
    viral_target = "gdc-viral"
    out = {}
    if vcfutils.get_paired_phenotype(data):
        viral_refs = [x for x in dd.get_viral_files(data) if os.path.basename(x) == "%s.fa" % viral_target]
        if viral_refs and utils.file_exists(viral_refs[0]):
            viral_ref = viral_refs[0]
            viral_bam = os.path.join(utils.safe_makedir(out_dir),
                                     "%s-%s.bam" % (dd.get_sample_name(data),
                                                    utils.splitext_plus(os.path.basename(viral_ref))[0]))
            out_file = "%s-counts.txt" % utils.splitext_plus(viral_bam)[0]
            if not utils.file_uptodate(out_file, bam_file):
                if not utils.file_uptodate(viral_bam, bam_file):
                    with file_transaction(data, viral_bam) as tx_out_file:
                        cores = dd.get_num_cores(data)
                        tmpfile = "%s-tmp" % utils.splitext_plus(tx_out_file)[0]
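                        # realign unmapped reads (-f 4) to the viral reference and write a sorted, indexed BAM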
                        cmd = ("samtools view -u -f 4 {bam_file} | "
                               "bamtofastq collate=0 | "
                               "bwa mem -t {cores} {viral_ref} - | "
                               "bamsort tmpfile={tmpfile} inputthreads={cores} outputthreads={cores} "
                               "inputformat=sam index=1 indexfilename={tx_out_file}.bai O={tx_out_file}")
                        do.run(cmd.format(**locals()), "Compare unmapped reads to viral genome")
                with file_transaction(data, out_file) as tx_out_file:
                    with open(tx_out_file, "w") as out_handle:
                        out_handle.write("# sample\t%s\n" % dd.get_sample_name(data))
                        for info in bam.idxstats(viral_bam, data):
                            if info.aligned > 0:
                                out_handle.write("%s\t%s\n" % (info.contig, info.aligned))
            out["base"] = out_file
    return out
Example #18
def sailfish(fq1, fq2, sailfish_dir, gtf_file, ref_file, strandedness, data):
    safe_makedir(sailfish_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(sailfish_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    sailfish_idx = sailfish_index(gtf_file, ref_file, data, sailfish_dir)
    num_cores = dd.get_num_cores(data)
    sailfish = config_utils.get_program("sailfish", data["config"])
    cmd = "{sailfish} quant -i {sailfish_idx} -p {num_cores} "
    cmd += _libtype_string(fq1, fq2, strandedness)
    fq1_cmd = "{fq1}" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2}" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
        cmd += "--useVBOpt --numBootstraps 30 "
    cmd += "-o {tx_out_dir}"
    message = "Quantifying transcripts in {fq1} and {fq2}."
    with file_transaction(data, sailfish_dir) as tx_out_dir:
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_file
Example #19
def rapmap_align(fq1, fq2, rapmap_dir, gtf_file, ref_file, algorithm, data):
    valid_algorithms = ["pseudo", "quasi"]
    assert algorithm in valid_algorithms, \
        "RapMap algorithm needs to be one of %s." % valid_algorithms
    safe_makedir(rapmap_dir)
    samplename = dd.get_sample_name(data)
    out_file = os.path.join(rapmap_dir, samplename + ".bam")
    if file_exists(out_file):
        return out_file
    rapmap_index_loc = rapmap_index(gtf_file, ref_file, algorithm, data,
                                    rapmap_dir)
    num_cores = dd.get_num_cores(data)
    algorithm_subcommand = algorithm + "map"
    rapmap = config_utils.get_program("rapmap", dd.get_config(data))
    cmd = "{rapmap} {algorithm_subcommand} -t {num_cores} -i {rapmap_index_loc} "
    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += "-r {fq1_cmd} "
    else:
        fq2_cmd = "{fq2} " if not is_gzipped(fq2) else "<(gzip -cd {fq2}) "
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += "-1 {fq1_cmd} -2 {fq2_cmd} "
    with file_transaction(out_file) as tx_out_file:
        cmd += "| " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        run_message = ("%smapping %s and %s to %s with RapMap. "
                       % (algorithm, fq1, fq2, rapmap_index_loc))
        do.run(cmd.format(**locals()), run_message, None)
    return out_file
Example #20
def _run_amber(paired, work_dir, lenient=False):
    """AMBER: calculate allele frequencies at likely heterozygous sites.

    lenient flag allows amber runs on small test sets.
    """
    amber_dir = utils.safe_makedir(os.path.join(work_dir, "amber"))
    out_file = os.path.join(amber_dir, "%s.amber.baf" % dd.get_sample_name(paired.tumor_data))
    if not utils.file_exists(out_file) or not utils.file_exists(out_file + ".pcf"):
        with file_transaction(paired.tumor_data, out_file) as tx_out_file:
            key = "germline_het_pon"
            het_bed = tz.get_in(["genome_resources", "variation", key], paired.tumor_data)
            cmd = ["AMBER"] + _get_jvm_opts(tx_out_file, paired.tumor_data) + \
                  ["-threads", dd.get_num_cores(paired.tumor_data),
                   "-tumor", dd.get_sample_name(paired.tumor_data),
                   "-tumor_bam", dd.get_align_bam(paired.tumor_data),
                   "-reference", dd.get_sample_name(paired.normal_data),
                   "-reference_bam", dd.get_align_bam(paired.normal_data),
                   "-ref_genome", dd.get_ref_file(paired.tumor_data),
                   "-bed", het_bed,
                   "-output_dir", os.path.dirname(tx_out_file)]
            if lenient:
                cmd += ["-max_het_af_percent", "1.0"]
            try:
                do.run(cmd, "PURPLE: AMBER baf generation")
            except subprocess.CalledProcessError as msg:
                if not lenient and _amber_allowed_errors(str(msg)):
                    return _run_amber(paired, work_dir, True)
                raise
            for f in os.listdir(os.path.dirname(tx_out_file)):
                if f != os.path.basename(tx_out_file):
                    shutil.move(os.path.join(os.path.dirname(tx_out_file), f),
                                os.path.join(amber_dir, f))
    return out_file
Example #21
def gatk_rnaseq_calling(data):
    """
    use GATK to perform variant calling on RNA-seq data
    """
    broad_runner = broad.runner_from_config(dd.get_config(data))
    ref_file = dd.get_ref_file(data)
    split_bam = dd.get_split_bam(data)
    out_file = os.path.splitext(split_bam)[0] + ".gvcf"
    num_cores = dd.get_num_cores(data)
    if file_exists(out_file):
        data = dd.set_vrn_file(data, out_file)
        return data
    with file_transaction(out_file) as tx_out_file:
        params = ["-T", "HaplotypeCaller",
                  "-R", ref_file,
                  "-I", split_bam,
                  "-o", tx_out_file,
                  "-nct", str(num_cores),
                  "--emitRefConfidence", "GVCF",
                  "--variant_index_type", "LINEAR",
                  "--variant_index_parameter", "128000",
                  "-dontUseSoftClippedBases",
                  "-stand_call_conf", "20.0",
                  "-stand_emit_conf", "20.0"]
        broad_runner.run_gatk(params)
    data = dd.set_vrn_file(data, out_file)
    return data
Example #22
def run(vcf, conf_fns, lua_fns, data, basepath=None, decomposed=False):
    """Annotate a VCF file using vcfanno (https://github.com/brentp/vcfanno)

    decomposed -- if set to true we'll convert allele based output into single values
      to match alleles and make compatible with vcf2db
      (https://github.com/quinlan-lab/vcf2db/issues/14)
    """
    conf_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    lua_fns.sort(key=lambda x: os.path.basename(x) if x else "")
    ext = "-annotated-%s" % utils.splitext_plus(os.path.basename(conf_fns[0]))[0]
    if vcf.find(ext) > 0:
        out_file = vcf
    else:
        out_file = "%s%s.vcf.gz" % (utils.splitext_plus(vcf)[0], ext)
    if not utils.file_exists(out_file):
        vcfanno = config_utils.get_program("vcfanno", data)
        with file_transaction(out_file) as tx_out_file:
            conffn = _combine_files(conf_fns, out_file, data, basepath is None)
            luafn = _combine_files(lua_fns, out_file, data, False)
            luaflag = "-lua {0}".format(luafn) if luafn and utils.file_exists(luafn) else ""
            basepathflag = "-base-path {0}".format(basepath) if basepath else ""
            cores = dd.get_num_cores(data)
            post_ann = "sed -e 's/Number=A/Number=1/g' |" if decomposed else ""
            cmd = ("{vcfanno} -p {cores} {luaflag} {basepathflag} {conffn} {vcf} "
                   "| {post_ann} bgzip -c > {tx_out_file}")
            message = "Annotating {vcf} with vcfanno, using {conffn}".format(**locals())
            do.run(cmd.format(**locals()), message)
    return vcfutils.bgzip_and_index(out_file, data["config"])
Example #23
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", sailfish.get_build_string(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if dd.get_transcriptome_fasta(data):
        gtf_fa = dd.get_transcriptome_fasta(data)
    else:
        gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    assert file_exists(gtf_fa), "%s was not found, exiting." % gtf_fa
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    files = dd.get_input_sequence_files(data)
    readlength = bam.fastq.estimate_read_length(files[0])
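    # choose an odd k-mer size, capped at 31, from the estimated read length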
    if readlength % 2 == 0:
        readlength -= 1
    kmersize = min(readlength, 31)
    with file_transaction(data, out_dir) as tx_out_dir:
        cmd = "{salmon} index -k {kmersize} -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #24
def _vardict_options_from_config(items, config, out_file, target=None, is_rnaseq=False):
    var2vcf_opts = []
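    # BED column positions passed to VarDict: chromosome (-c), start (-S), end (-E), annotation (-g)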
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for vardict, causes issues with regional analysis
    # by detecting SVs outside of target regions, which messes up merging
    # SV calling will be worked on as a separate step
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    if (vardict_cl and version and
        ((vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.5")) or
         (vardict_cl == "vardict" and LooseVersion(version) >= LooseVersion("2018.07.25")))):
        opts += ["--nosv"]
    if (vardict_cl and version and
         (vardict_cl == "vardict-java" and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove secondary, QC fail and duplicate reads (-F 0x700), avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    return " ".join(opts), " ".join(var2vcf_opts)
Example #25
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(out_dir, "%s-report.json" % utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [os.path.join(out_dir, "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
                 for x in fastq_files]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
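            # quality-trim 3' ends and enforce the minimum read length, but skip fastp's full quality filtering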
            cmd += ["--cut_by_quality3", "--cut_mean_quality", "5",
                    "--length_required", str(dd.get_min_read_length(data)),
                    "--disable_quality_filtering"]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += ["--json", tx_report, "--report_title", dd.get_sample_name(data)]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
Example #26
def priority_total_coverage(data):
    """
    calculate coverage at depth 20 in the priority regions
    """
    bed_file = dd.get_priority_regions(data)
    if not bed_file:
        return data
    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    sample = dd.get_sample_name(data)
    out_file = os.path.join(work_dir, sample + "_priority_total_coverage.bed")
    if file_exists(out_file):
        data['priority_total_coverage'] = os.path.abspath(out_file)
        return data

    nthreads = dd.get_num_cores(data)
    in_bam = dd.get_work_bam(data)
    sambamba = config_utils.get_program("sambamba", data, default="sambamba")
    with file_transaction(out_file) as tx_out_file:
        cmd = ("{sambamba} depth region -t {nthreads} -L {bed_file} "
               "-F \"not unmapped\" "
               "-T 20 {in_bam} -o {tx_out_file}")
        message = "Calculating coverage of {bed_file} regions in {in_bam}"
        do.run(cmd.format(**locals()), message.format(**locals()))
    data['priority_total_coverage'] = os.path.abspath(out_file)
    return data
Example #27
def trim_adapters(data):
    fq1, fq2 = dd.get_input_sequence_files(data)
    skewer = config_utils.get_program("skewer", data, default="skewer")
    nthreads = dd.get_num_cores(data)
    samplename = dd.get_sample_name(data)
    out_dir = os.path.join(dd.get_work_dir(data), "trimmed", samplename)
    of1 = os.path.join(out_dir, samplename + "-trimmed-pair1.fastq.gz")
    of2 = os.path.join(out_dir, samplename + "-trimmed-pair2.fastq.gz")
    of2 = of2 if fq2 else None
    if fq1 and fq2:
        if file_exists(of1) and file_exists(of2):
            return of1, of2
    else:
        if file_exists(of1):
            return of1, None
    safe_makedir(out_dir)
    file_string = "{fq1} {fq2} " if fq2 else "{fq1} "
    fw_cmd = _fw_command(data)
    rv_cmd = _rv_command(data)
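    # use skewer's "pe" mode when a second FASTQ is present and "tail" mode for single-end input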
    mode = "tail" if not fq2 else "pe"
    cmd = ("{skewer} --min 25 --threads {nthreads} -q 5 "
           "{fw_cmd} "
           "{rv_cmd} "
           "-m {mode} "
           "--compress --output {out_stem} ") + file_string
    with file_transaction(out_dir) as tx_out_dir:
        safe_makedir(tx_out_dir)
        out_stem = os.path.join(tx_out_dir, samplename)
        message = "Trimming {fq1}, {fq2} with skewer.".format(**locals())
        do.run(cmd.format(**locals()), message)
    return of1, of2
Example #28
def coverage(data):
    """
    Calculate coverage at different completeness cutoff
    for region in coverage option.
    """
    bed_file = dd.get_coverage(data)
    if not bed_file:
        return data

    work_dir = os.path.join(dd.get_work_dir(data), "report", "coverage")
    with chdir(work_dir):
        in_bam = data['work_bam']
        sample = dd.get_sample_name(data)
        logger.debug("doing coverage for %s" % sample)
        parse_file = os.path.join(sample + "_coverage.bed")
        parse_total_file = os.path.join(sample + "_cov_total.tsv")
        cores = dd.get_num_cores(data)
        if not file_exists(parse_file):
            with file_transaction(parse_file) as out_tx:
                cmd = ("sambamba depth region -F \"not unmapped\" -t {cores} -C 1000 "
                       "-T 1 -T 5 -T 10 -T 20 -T 40 -T 50 -T 60 -T 70 -T 80 -T 100 "
                       "-L {bed_file} {in_bam} | sed 's/# chrom/chrom/' > {out_tx}")
                do.run(cmd.format(**locals()), "Run coverage for {}".format(sample))
        parse_file = _add_high_covered_regions(parse_file, bed_file, sample)
        _calculate_percentiles(parse_file, sample)
        data['coverage'] = os.path.abspath(parse_file)
        return data
Example #29
def run(items):
    """Run MetaSV if we have enough supported callers, adding output to the set of calls.
    """
    assert len(items) == 1, "Expect one input to MetaSV ensemble calling"
    data = items[0]
    work_dir = _sv_workdir(data)
    out_file = os.path.join(work_dir, "variants.vcf.gz")
    cmd = _get_cmd() + ["--sample", dd.get_sample_name(data), "--reference", dd.get_ref_file(data),
                        "--bam", dd.get_align_bam(data), "--outdir", work_dir]
    methods = []
    for call in data.get("sv", []):
        if call["variantcaller"] in SUPPORTED and call["variantcaller"] not in methods:
            methods.append(call["variantcaller"])
            cmd += ["--%s_vcf" % call["variantcaller"], call.get("vcf_file", call["vrn_file"])]
    if len(methods) >= MIN_CALLERS:
        if not utils.file_exists(out_file):
            tx_work_dir = utils.safe_makedir(os.path.join(work_dir, "raw"))
            ins_stats = shared.calc_paired_insert_stats_save(dd.get_align_bam(data),
                                                             os.path.join(tx_work_dir, "insert-stats.yaml"))
            cmd += ["--workdir", tx_work_dir, "--num_threads", str(dd.get_num_cores(data))]
            cmd += ["--spades", utils.which("spades.py"), "--age", utils.which("age_align")]
            cmd += ["--assembly_max_tools=1", "--assembly_pad=500"]
            cmd += ["--boost_sc", "--isize_mean", ins_stats["mean"], "--isize_sd", ins_stats["std"]]
            do.run(cmd, "Combine variant calls with MetaSV")
        filters = ("(NUM_SVTOOLS = 1 && ABS(SVLEN)>50000) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_FLANK_PERCENT>80) || "
                   "(NUM_SVTOOLS = 1 && ABS(SVLEN)<4000 && BA_NUM_GOOD_REC=0) || "
                   "(ABS(SVLEN)<4000 && BA_NUM_GOOD_REC>2)")
        filter_file = vfilter.hard_w_expression(out_file, filters,
                                                data, name="ReassemblyStats", limit_regions=None)
        effects_vcf, _ = effects.add_to_vcf(filter_file, data, "snpeff")
        data["sv"].append({"variantcaller": "metasv",
                           "vrn_file": effects_vcf or filter_file})
    return [data]
Example #30
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, dd.get_lane(data)) + ".bam"
    if file_exists(out_file):
        data = dd.set_work_bam(data, out_file)
        return data
    cmd = ("{hisat2} -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
           "{rg_flags} ")
    if paired:
        cmd += "-1 {fastq_file} -2 {pair_file} "
    else:
        cmd += "-U {fastq_file} "
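    # report many alignments per read (-k 1000) for small RNA, where multimapping is expected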
    if dd.get_analysis(data).lower() == "smallrna-seq":
        cmd += "-k 1000 "
    # if assembling transcripts, set flags that cufflinks can use
    if dd.get_assemble_transcripts(data):
        cmd += "--dta-cufflinks "
    if dd.get_analysis(data).lower() == "rna-seq":
        gtf_file = dd.get_gtf_file(data)
        splicesites = os.path.join(os.path.dirname(gtf_file),
                                   "ref-transcripts-splicesites.txt")
        cmd += "--known-splicesite-infile {splicesites} "
    message = "Aligning %s and %s with hisat2." %(fastq_file, pair_file)
    with file_transaction(out_file) as tx_out_file:
        cmd += " | " + postalign.sam_to_sortbam_cl(data, tx_out_file)
        do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #31
def variants(data):
    if "vrn_file" not in data:
        return data
    if not dd.get_coverage(data):
        return data

    in_vcf = data['vrn_file']
    sample = dd.get_sample_name(data)
    cg_file = os.path.join(sample + "_with-gc.vcf.gz")
    parse_file = os.path.join(sample + "_gc-depth-parse.tsv")
    qc_file = os.path.join(sample + "_bcbio_variants.txt")
    work_dir = os.path.join(dd.get_work_dir(data), "report", "variants")
    with chdir(work_dir):
        if file_exists(qc_file):
            return data
        in_bam = dd.get_align_bam(data) or dd.get_work_bam(data)
        ref_file = dd.get_ref_file(data)
        assert ref_file, "Need the reference genome fasta file."
        bed_file = dd.get_variant_regions(data)
        num_cores = dd.get_num_cores(data)
        broad_runner = broad.runner_from_config_safe(data["config"])
        if in_bam and broad_runner and broad_runner.has_gatk():
            if not file_exists(cg_file):
                with file_transaction(cg_file) as tx_out:
                    params = [
                        "-T", "VariantAnnotator", "-R", ref_file, "-L",
                        bed_file, "-I", in_bam, "-A", "GCContent", "-A",
                        "Coverage", "--variant", in_vcf, "--out", tx_out
                    ]
                    broad_runner.run_gatk(params)
            cg_file = vcfutils.bgzip_and_index(cg_file, data["config"])

            if not file_exists(parse_file):
                with file_transaction(parse_file) as out_tx:
                    with open(out_tx, 'w') as out_handle:
                        out_handle.write("CG\tdepth\tsample\n")
                    cmd = (
                        "bcftools query -s {sample} -f '[%GC][\\t%DP][\\t%SAMPLE]\\n' -R "
                        "{bed_file} {cg_file} >> {out_tx}")
                    do.run(cmd.format(**locals()),
                           "Calculating GC content and depth for %s" % in_vcf)
                    logger.debug('parsing coverage: %s' % sample)
            if not file_exists(qc_file):
                # This file will be copied to the final output directory
                _summary_variants(parse_file, qc_file)
            if file_exists(qc_file) and file_exists(parse_file):
                os.remove(cg_file)
        return data
Example #32
def merge_bam_files(bam_files, work_dir, data, out_file=None, batch=None):
    """Merge multiple BAM files from a sample into a single BAM for processing.

    Checks system open file limit and merges in batches if necessary to avoid
    file handle limits.
    """
    out_file = _merge_outfile_fname(out_file, bam_files, work_dir, batch)
    if not utils.file_exists(out_file):
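        # a single, already coordinate-sorted BAM is reused directly rather than re-merged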
        if len(bam_files) == 1 and bam.bam_already_sorted(
                bam_files[0], data["config"], "coordinate"):
            with file_transaction(data, out_file) as tx_out_file:
                _create_merge_filelist(bam_files, tx_out_file, data["config"])
            out_file = bam_files[0]
            samtools = config_utils.get_program("samtools", data["config"])
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
        else:
            with tx_tmpdir(data) as tmpdir:
                with utils.chdir(tmpdir):
                    with file_transaction(data, out_file) as tx_out_file:
                        tx_bam_file_list = _create_merge_filelist(
                            bam_files, tx_out_file, data["config"])
                        samtools = config_utils.get_program(
                            "samtools", data["config"])
                        resources = config_utils.get_resources(
                            "samtools", data["config"])
                        num_cores = dd.get_num_cores(data)
                        # Aim for 3.5Gb/core memory for BAM merging
                        num_cores = config_utils.adjust_cores_to_mb_target(
                            3500, resources.get("memory", "2G"), num_cores)
                        max_mem = config_utils.adjust_memory(
                            resources.get("memory", "1G"), 2,
                            "decrease").upper()
                        if dd.get_mark_duplicates(data):
                            cmd = _biobambam_merge_dedup_maxcov(data)
                        else:
                            cmd = _biobambam_merge_maxcov(data)
                        do.run(
                            cmd.format(**locals()), "Merge bam files to %s" %
                            os.path.basename(out_file), None)
                        do.run(
                            '{} quickcheck -v {}'.format(
                                samtools, tx_out_file),
                            "Check for valid merged BAM")
            do.run('{} quickcheck -v {}'.format(samtools, out_file),
                   "Check for valid merged BAM after transfer")
            _finalize_merge(out_file, bam_files, data["config"])
    bam.index(out_file, data["config"])
    return out_file
Example #33
def filter_barcodes(data):
    fq1 = dd.get_input_sequence_files(data)[0]
    umi_dir = os.path.join(dd.get_work_dir(data), "umis")
    correction = dd.get_cellular_barcode_correction(data)
    bc = get_cellular_barcodes(data)
    if not bc:
        return [[data]]
    bc1 = None
    bc2 = None
    bc3 = None
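    # accept either a single barcode specification or a list of up to three cellular barcodes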
    if isinstance(bc, basestring):
        bc1 = bc
    if len(bc) == 1:
        bc1 = bc[0]
    if len(bc) > 1:
        bc1 = bc[0]
        bc2 = bc[1]
    if len(bc) == 3:
        bc3 = bc[2]
    out_base = dd.get_sample_name(data) + ".filtered.fq.gz"
    out_file = os.path.join(umi_dir, out_base)
    if file_exists(out_file):
        data["files"] = [out_file]
        return [[data]]

    ncores = dd.get_num_cores(data)
    cmd = "{umis} cb_filter --cores {ncores} "
    if bc1:
        cmd += "--bc1 {bc1} "
        if correction:
            cmd += "--nedit {correction} "
    if bc2:
        cmd += "--bc2 {bc2} "
    if bc3:
        cmd += "--bc3 {bc3} "

    fq1_cmd = "{fq1} " if not is_gzipped(fq1) else "<(gzip -cd {fq1}) "
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    cmd += "{fq1_cmd} | gzip > {tx_out_file}"

    sample_dir = os.path.join(umi_dir, dd.get_sample_name(data))
    safe_makedir(sample_dir)
    umis = config_utils.get_program("umis", data, default="umis")
    with file_transaction(out_file) as tx_out_file:
        message = "Filtering by cellular barcode."
        do.run(cmd.format(**locals()), message)
    data["files"] = [out_file]
    return [[data]]
Example #34
def _gatk_apply_bqsr(data):
    """Parallel BQSR support for GATK4.

    Normalized qualities to 3 bin outputs at 10, 20 and 30 based on pipeline standard
    recommendations, which will help with output file sizes:
    https://github.com/CCDG/Pipeline-Standardization/blob/master/PipelineStandard.md#base-quality-score-binning-scheme
    https://github.com/gatk-workflows/broad-prod-wgs-germline-snps-indels/blob/5585cdf7877104f2c61b2720ddfe7235f2fad577/PairedEndSingleSampleWf.gatk4.0.wdl#L1081

    Spark host and timeout settings help deal with runs on restricted systems
    where we encounter network and timeout errors.
    """
    in_file = dd.get_align_bam(data) or dd.get_work_bam(data)
    out_file = os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data),
                            "%s-recal.bam" % utils.splitext_plus(os.path.basename(in_file))[0])
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            broad_runner = broad.runner_from_config(data["config"])
            gatk_type = broad_runner.gatk_type()
            cores = dd.get_num_cores(data)
            if gatk_type == "gatk4":
                resources = config_utils.get_resources("gatk-spark", data["config"])
                spark_opts = [str(x) for x in resources.get("options", [])]
                params = ["-T", "ApplyBQSRSpark",
                          "--input", in_file, "--output", tx_out_file, "--bqsr-recal-file", data["prep_recal"],
                          "--static-quantized-quals", "10", "--static-quantized-quals", "20",
                          "--static-quantized-quals", "30"]
                if spark_opts:
                    params += spark_opts
                else:
                    params += ["--spark-master", "local[%s]" % cores,
                               "--conf", "spark.local.dir=%s" % os.path.dirname(tx_out_file),
                               "--conf", "spark.driver.host=localhost", "--conf", "spark.network.timeout=800"]
                # Avoid problems with StreamClosedErrors on GATK 4.1+
                # https://github.com/bcbio/bcbio-nextgen/issues/2806#issuecomment-492504497
                params += ["--create-output-bam-index", "false"]
            else:
                params = ["-T", "PrintReads", "-R", dd.get_ref_file(data), "-I", in_file,
                          "-BQSR", data["prep_recal"], "-o", tx_out_file]
            # Avoid problems with intel deflater for GATK 3.8 and GATK4
            # https://github.com/bcbio/bcbio-nextgen/issues/2145#issuecomment-343095357
            if gatk_type == "gatk4":
                params += ["--jdk-deflater", "--jdk-inflater"]
            elif LooseVersion(broad_runner.gatk_major_version()) > LooseVersion("3.7"):
                params += ["-jdk_deflater", "-jdk_inflater"]
            memscale = {"magnitude": 0.9 * cores, "direction": "increase"} if cores > 1 else None
            broad_runner.run_gatk(params, os.path.dirname(tx_out_file), memscale=memscale,
                                  parallel_gc=True)
    bam.index(out_file, data["config"])
    return out_file
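A minimal sketch of how the GATK4 branch above assembles the ApplyBQSRSpark arguments, with the three-bin quality quantization and the default local Spark settings; all paths and the core count here are hypothetical stand-ins, not values from a real run:

# Hypothetical inputs for illustration only
cores = 8
in_file = "sample.bam"
tx_out_file = "sample-recal.bam"
recal_table = "sample-recal-table.txt"

params = ["-T", "ApplyBQSRSpark",
          "--input", in_file, "--output", tx_out_file,
          "--bqsr-recal-file", recal_table]
# Collapse base qualities into three static bins (10/20/30) to shrink the output BAM
for qual in ("10", "20", "30"):
    params += ["--static-quantized-quals", qual]
# Default Spark settings when no resource options are supplied: local master,
# driver pinned to localhost and a longer network timeout
params += ["--spark-master", "local[%s]" % cores,
           "--conf", "spark.driver.host=localhost",
           "--conf", "spark.network.timeout=800"]
print(" ".join(params))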
Example #35
0
def _rnaseq_qualimap_cmd(data, bam_file, out_dir, gtf_file=None, single_end=None, library="non-strand-specific"):
    """
    Create the command line for running Qualimap rnaseq.
    """
    config = data["config"]
    qualimap = config_utils.get_program("qualimap", config)
    resources = config_utils.get_resources("qualimap", config)
    num_cores = resources.get("cores", dd.get_num_cores(data))
    max_mem = config_utils.adjust_memory(resources.get("memory", "2G"),
                                         num_cores)
    export = "%s%s" % (utils.java_freetype_fix(), utils.local_path_export())
    cmd = ("unset DISPLAY && {export} {qualimap} rnaseq -outdir {out_dir} "
           "-a proportional -bam {bam_file} -p {library} "
           "-gtf {gtf_file} --java-mem-size={max_mem}").format(**locals())
    return cmd
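The --java-mem-size value above is the per-core memory allowance scaled up by the core count; a rough sketch of the arithmetic, assuming adjust_memory multiplies the configured per-core figure (the values here are hypothetical):

# Hypothetical resources: 2G per core on a 4-core run -> 8G Java heap for Qualimap
per_core_gb = 2
num_cores = 4
max_mem = "%sG" % (per_core_gb * num_cores)
print(max_mem)  # 8G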
Example #36
0
def cufflinks_merge(*samples):
    to_merge = filter_missing([dd.get_assembled_gtf(data) for data in
                            dd.sample_data_iterator(samples)])
    data = samples[0][0]
    bam_file = dd.get_work_bam(data)
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    out_dir = os.path.join(dd.get_work_dir(data), "assembly")
    num_cores = dd.get_num_cores(data)
    merged_gtf = cufflinks.merge(to_merge, ref_file, gtf_file, num_cores, samples[0][0])
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_assembled_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #37
0
def salmon_quant_reads(fq1, fq2, salmon_dir, gtf_file, ref_file, data):
    samplename = dd.get_sample_name(data)
    quant_dir = os.path.join(salmon_dir, "quant")
    safe_makedir(salmon_dir)
    out_file = os.path.join(quant_dir, "quant.sf")
    if file_exists(out_file):
        return out_file
    num_cores = dd.get_num_cores(data)
    strandedness = dd.get_strandedness(data).lower()
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    libtype = sailfish._libtype_string(fq1, fq2, strandedness)
    index = salmon_index(gtf_file, ref_file, data, os.path.dirname(salmon_dir))
    resources = config_utils.get_resources("salmon", dd.get_config(data))
    params = ""
    if resources.get("options") is not None:
        params = " ".join([str(x) for x in resources.get("options", [])])
    cmd = ("{salmon} quant {libtype} -i {index} -p {num_cores} "
           "--gcBias "
           "-o {tx_out_dir} {params} ")
    fq1_cmd = "<(cat {fq1})" if not is_gzipped(fq1) else "<(gzip -cd {fq1})"
    fq1_cmd = fq1_cmd.format(fq1=fq1)
    if not fq2:
        cmd += " -r {fq1_cmd} "
    else:
        fq2_cmd = "<(cat {fq2})" if not is_gzipped(fq2) else "<(gzip -cd {fq2})"
        fq2_cmd = fq2_cmd.format(fq2=fq2)
        cmd += " -1 {fq1_cmd} -2 {fq2_cmd} "
    # skip --useVBOpt for now, it can cause segfaults
    cmd += "--numBootstraps 30 "
    with file_transaction(data, quant_dir) as tx_out_dir:
        message = ("Quantifying transcripts in %s and %s with Salmon."
                   %(fq1, fq2))
        do.run(cmd.format(**locals()), message, None)
        sailfish.sleuthify_sailfish(tx_out_dir)
    return out_file
Example #38
0
def _call_variants(example_dir, data, out_file):
    """Call variants from prepared pileup examples, creating tensorflow record file.
    """
    tf_out_file = "%s-tfrecord.gz" % utils.splitext_plus(out_file)[0]
    if not utils.file_exists(tf_out_file):
        with file_transaction(data, tf_out_file) as tx_out_file:
            cmd = [
                "dv_call_variants.py", "--cores",
                str(dd.get_num_cores(data)), "--outfile", tx_out_file, "--examples",
                example_dir, "--sample",
                dd.get_sample_name(data)
            ]
            do.run(cmd,
                   "DeepVariant call_variants %s" % dd.get_sample_name(data))
    return tf_out_file
Example #39
0
def _run_with_memory_scaling(params, tx_out_file, data, ld_preload=False):
    num_cores = dd.get_num_cores(data)
    memscale = {
        "magnitude": 0.9 * num_cores,
        "direction": "increase"
    } if num_cores > 1 else None
    # Ignore tools_off: [gatk4], since it doesn't apply to GATK CNV calling
    config = utils.deepish_copy(data["config"])
    if "gatk4" in dd.get_tools_off({"config": config}):
        config["algorithm"]["tools_off"].remove("gatk4")
    broad_runner = broad.runner_from_config(config)
    broad_runner.run_gatk(params,
                          os.path.dirname(tx_out_file),
                          memscale=memscale,
                          ld_preload=ld_preload)
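For context, the scaling dictionary above simply grows the JVM memory allowance with the core count; a one-off illustration for a hypothetical 8-core run:

num_cores = 8  # hypothetical core count
memscale = {"magnitude": 0.9 * num_cores, "direction": "increase"} if num_cores > 1 else None
print(memscale)  # {'magnitude': 7.2, 'direction': 'increase'}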
Example #40
0
def sailfish_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambiguate(data))
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = _create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    if file_exists(out_dir + "versionInfo.json"):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} -k 25"
        message = "Creating sailfish index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #41
0
def _do_run(paired):
    """Perform Battenberg caling with the paired dataset.

    This purposely does not use a temporary directory for the output
    since Battenberg does smart restarts.
    """
    work_dir = _sv_workdir(paired.tumor_data)
    out = _get_battenberg_out(paired, work_dir)
    ignore_file = os.path.join(work_dir, "ignore_chromosomes.txt")
    if len(_missing_files(out)) > 0:
        ref_file = dd.get_ref_file(paired.tumor_data)
        bat_datadir = os.path.normpath(
            os.path.join(os.path.dirname(ref_file), os.pardir, "battenberg"))
        ignore_file, gl_file = _make_ignore_file(
            work_dir, ref_file, ignore_file,
            os.path.join(bat_datadir, "impute", "impute_info.txt"))
        local_sitelib = os.path.join(
            install.get_defaults().get("tooldir", "/usr/local"), "lib", "R",
            "site-library")
        tumor_bam = paired.tumor_bam
        normal_bam = paired.normal_bam
        platform = dd.get_platform(paired.tumor_data)
        genome_build = paired.tumor_data["genome_build"]
        # scale cores to avoid over-using memory during imputation
        cores = max(1, int(dd.get_num_cores(paired.tumor_data) * 0.5))
        gender = {
            "male": "XY",
            "female": "XX",
            "unknown": "L"
        }.get(population.get_gender(paired.tumor_data))
        if gender == "L":
            gender_str = "-ge %s -gl %s" % (gender, gl_file)
        else:
            gender_str = "-ge %s" % (gender)
        r_export_cmd = utils.get_R_exports()
        cmd = (
            "export R_LIBS_USER={local_sitelib} && {r_export_cmd} && "
            "battenberg.pl -t {cores} -o {work_dir} -r {ref_file}.fai "
            "-tb {tumor_bam} -nb {normal_bam} -e {bat_datadir}/impute/impute_info.txt "
            "-u {bat_datadir}/1000genomesloci -c {bat_datadir}/probloci.txt "
            "-ig {ignore_file} {gender_str} "
            "-assembly {genome_build} -species Human -platform {platform}")
        do.run(cmd.format(**locals()), "Battenberg CNV calling")
    assert len(_missing_files(
        out)) == 0, "Missing Battenberg output: %s" % _missing_files(out)
    out["plot"] = _get_battenberg_out_plots(paired, work_dir)
    out["ignore"] = ignore_file
    return out
Example #42
0
def sailfish_index(gtf_file, ref_file, data, build):
    work_dir = dd.get_work_dir(data)
    out_dir = os.path.join(work_dir, "sailfish", "index", build)
    sailfish = config_utils.get_program("sailfish", data["config"])
    num_cores = dd.get_num_cores(data)
    gtf_fa = create_combined_fasta(data)
    if file_exists(os.path.join(out_dir, "versionInfo.json")):
        return out_dir
    with file_transaction(data, out_dir) as tx_out_dir:
        fq1, _ = dd.get_input_sequence_files(data)
        kmersize = pick_kmersize(fq1)
        cmd = ("{sailfish} index -p {num_cores} -t {gtf_fa} -o {tx_out_dir} "
               "-k {kmersize}")
        message = "Creating sailfish index for {gtf_fa} with {kmersize} bp kmers."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #43
0
def salmon_index(gtf_file, ref_file, data, out_dir):
    out_dir = os.path.join(out_dir, "index", dd.get_genome_build(data))
    if dd.get_disambiguate(data):
        out_dir = "-".join([out_dir] + dd.get_disambguate(data))
    salmon = config_utils.get_program("salmon", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    gtf_fa = sailfish.create_combined_fasta(data, out_dir)
    tmpdir = dd.get_tmp_dir(data)
    out_file = os.path.join(out_dir, "versionInfo.json")
    if file_exists(out_file):
        return out_dir
    with file_transaction(out_dir) as tx_out_dir:
        cmd = "{salmon} index -k 31 -p {num_cores} -i {tx_out_dir} -t {gtf_fa}"
        message = "Creating Salmon index for {gtf_fa}."
        do.run(cmd.format(**locals()), message.format(**locals()), None)
    return out_dir
Example #44
0
def filter_primary(bam_file, data):
    """Filter reads to primary only BAM.

    Removes:
      - not primary alignment (0x100) 256
      - supplementary alignment (0x800) 2048
    """
    stem, ext = os.path.splitext(bam_file)
    out_file = stem + ".primary" + ext
    if not utils.file_exists(out_file):
        with file_transaction(data, out_file) as tx_out_file:
            cores = dd.get_num_cores(data)
            cmd = ("samtools view -@ {cores} -F 2304 -b {bam_file} > {tx_out_file}")
            do.run(cmd.format(**locals()), ("Filtering primary alignments in %s." %
                                            os.path.basename(bam_file)))
    return out_file
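A quick check that the -F 2304 mask used above really combines the two flags named in the docstring:

# SAM flags excluded by the samtools view -F mask above
NOT_PRIMARY = 0x100      # 256, not primary alignment
SUPPLEMENTARY = 0x800    # 2048, supplementary alignment
assert NOT_PRIMARY | SUPPLEMENTARY == 2304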
Example #45
0
def stringtie_merge(*samples):
    to_merge = filter_missing(
        flatten([
            dd.get_assembled_gtf(data)
            for data in dd.sample_data_iterator(samples)
        ]))
    data = samples[0][0]
    ref_file = dd.get_sam_ref(data)
    gtf_file = dd.get_gtf_file(data)
    num_cores = dd.get_num_cores(data)
    merged_gtf = stringtie.merge(to_merge, ref_file, gtf_file, num_cores, data)
    updated_samples = []
    for data in dd.sample_data_iterator(samples):
        data = dd.set_merged_gtf(data, merged_gtf)
        updated_samples.append([data])
    return updated_samples
Example #46
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {
        "window_size": 5000,
        "parallel_window_size": 1e5,
        "min": dd.get_coverage_depth_min(data),
        "high_multiplier": 20
    }
    prefix = os.path.join(
        utils.safe_makedir(
            os.path.join(dd.get_work_dir(data), "align",
                         dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    depth_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file,
                                                   variant_regions,
                                                   "variant_regions")
    if not utils.file_uptodate(callable_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = [
            "goleft", "depth", "--q", "1", "--mincov",
            str(params["min"]), "--reference", ref_file, "--processes",
            str(dd.get_num_cores(data)), "--ordered"
        ]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, depth_file) as tx_depth_file:
            with utils.chdir(os.path.dirname(tx_depth_file)):
                tx_callable_file = tx_depth_file.replace(
                    ".depth.bed", ".callable.bed")
                prefix = tx_depth_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    final_callable = _subset_to_variant_regions(callable_file, variant_regions,
                                                data)
    return depth_file, final_callable, _extract_highdepth(
        final_callable, data), variant_regions_avg_cov
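The docstring quotes the samtools depth filter; as a side note, the combined flag mask that snippet describes works out as follows (flag values from the SAM specification):

# SAM flag bits excluded from samtools depth counts, per the quoted snippet
BAM_FUNMAP = 0x4
BAM_FSECONDARY = 0x100
BAM_FQCFAIL = 0x200
BAM_FDUP = 0x400
print(hex(BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP))  # 0x704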
Example #47
0
def _run_titancna(cn_file, het_file, ploidy, num_clusters, work_dir, data):
    """Run titanCNA wrapper script on given ploidy and clusters.
    """
    sample = dd.get_sample_name(data)
    cores = dd.get_num_cores(data)
    export_cmd = utils.get_R_exports()
    ploidy_dir = utils.safe_makedir(os.path.join(work_dir, "run_ploidy%s" % ploidy))

    cluster_dir = "%s_cluster%02d" % (sample, num_clusters)
    out_dir = os.path.join(ploidy_dir, cluster_dir)
    if not utils.file_uptodate(out_dir + ".titan.txt", cn_file):
        with tx_tmpdir(data) as tmp_dir:
            with utils.chdir(tmp_dir):
                cmd = ("{export_cmd} && titanCNA.R --id {sample} --hetFile {het_file} --cnFile {cn_file} "
                       "--numClusters {num_clusters} --ploidy {ploidy} --numCores {cores} --outDir {tmp_dir} "
                       "--libdir None")
                chroms = ["'%s'" % c.name.replace("chr", "") for c in ref.file_contigs(dd.get_ref_file(data))
                          if chromhacks.is_autosomal_or_x(c.name)]
                if "'X'" not in chroms:
                    chroms += ["'X'"]
                # Use UCSC style naming for human builds to support BSgenome
                genome_build = ("hg19" if dd.get_genome_build(data) in ["GRCh37", "hg19"]
                                else dd.get_genome_build(data))
                cmd += """ --chrs "c(%s)" """ % ",".join(chroms)
                cmd += " --genomeBuild {genome_build}"
                if data["genome_build"] in ("hg19", "hg38"):
                    cmd += " --genomeStyle UCSC"
                if data["genome_build"] in ["hg38"]:
                    data_dir = os.path.normpath(os.path.join(
                        os.path.dirname(os.path.realpath(os.path.join(
                            os.path.dirname(utils.Rscript_cmd()), "titanCNA.R"))),
                        os.pardir, os.pardir, "data"))
                    cytoband_file = os.path.join(data_dir, "cytoBand_hg38.txt")
                    assert os.path.exists(cytoband_file), cytoband_file
                    cmd += " --cytobandFile %s" % cytoband_file
                # TitanCNA's model is influenced by the variance in read coverage data
                # and data type: set reasonable defaults for non-WGS runs
                # (see https://github.com/gavinha/TitanCNA/tree/master/scripts/R_scripts)
                if dd.get_coverage_interval(data) != "genome":
                    cmd += " --alphaK=2500 --alphaKHigh=2500"
                do.run(cmd.format(**locals()), "TitanCNA CNV detection: ploidy %s, cluster %s" % (ploidy, num_clusters))
            for fname in glob.glob(os.path.join(tmp_dir, cluster_dir + "*")):
                shutil.move(fname, ploidy_dir)
            if os.path.exists(os.path.join(tmp_dir, "Rplots.pdf")):
                shutil.move(os.path.join(tmp_dir, "Rplots.pdf"),
                            os.path.join(ploidy_dir, "%s.Rplots.pdf" % cluster_dir))
    return ploidy_dir
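Getting the quoting of the --chrs argument right is the fiddly part above; a small standalone sketch of the R-style vector string it produces, using a made-up contig list instead of reading the reference .fai:

# Toy contig names; the real code pulls these from ref.file_contigs(dd.get_ref_file(data))
contigs = ["chr1", "chr2", "chrX"]
chroms = ["'%s'" % c.replace("chr", "") for c in contigs]
if "'X'" not in chroms:
    chroms += ["'X'"]
arg = """ --chrs "c(%s)" """ % ",".join(chroms)
print(arg)  # --chrs "c('1','2','X')"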
Example #48
0
def _run_rtg_eval(vrn_file, rm_file, rm_interval_file, base_dir, data):
    """Run evaluation of a caller against the truth set using rtg vcfeval.
    """
    out_dir = os.path.join(base_dir, "rtg")
    if not utils.file_exists(os.path.join(out_dir, "done")):
        if os.path.exists(out_dir):
            shutil.rmtree(out_dir)
        vrn_file, rm_file, interval_bed = _prepare_inputs(vrn_file, rm_file, rm_interval_file, base_dir, data)

        rtg_ref = tz.get_in(["reference", "rtg"], data)
        assert rtg_ref and os.path.exists(rtg_ref), ("Did not find rtg indexed reference file for validation:\n%s\n"
                                                     "Run bcbio_nextgen.py upgrade --data --aligners rtg" % rtg_ref)
        # handle CWL where we have a reference to a single file in the RTG directory
        if os.path.isfile(rtg_ref):
            rtg_ref = os.path.dirname(rtg_ref)

        # get core and memory usage from standard configuration
        threads = min(dd.get_num_cores(data), 6)
        resources = config_utils.get_resources("rtg", data["config"])
        memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms500m", "-Xmx1500m"]),
                                          {"algorithm": {"memory_adjust": {"magnitude": threads,
                                                                           "direction": "increase"}}})
        jvm_stack = [x for x in memory if x.startswith("-Xms")]
        jvm_mem = [x for x in memory if x.startswith("-Xmx")]
        jvm_stack = jvm_stack[0] if len(jvm_stack) > 0 else "-Xms500m"
        jvm_mem = jvm_mem[0].replace("-Xmx", "") if len(jvm_mem) > 0 else "3g"
        cmd = ["rtg", "vcfeval", "--threads", str(threads),
               "-b", rm_file, "--bed-regions", interval_bed,
               "-c", vrn_file, "-t", rtg_ref, "-o", out_dir]
        rm_samples = vcfutils.get_samples(rm_file)
        if len(rm_samples) > 1 and dd.get_sample_name(data) in rm_samples:
            cmd += ["--sample=%s" % dd.get_sample_name(data)]
        cmd += ["--vcf-score-field='%s'" % (_pick_best_quality_score(vrn_file))]
        mem_export = "%s export RTG_JAVA_OPTS='%s' && export RTG_MEM=%s" % (utils.local_path_export(),
                                                                            jvm_stack, jvm_mem)
        cmd = mem_export + " && " + " ".join(cmd)
        do.run(cmd, "Validate calls using rtg vcfeval", data)
    out = {"fp": os.path.join(out_dir, "fp.vcf.gz"),
           "fn": os.path.join(out_dir, "fn.vcf.gz")}
    tp_calls = os.path.join(out_dir, "tp.vcf.gz")
    tp_baseline = os.path.join(out_dir, "tp-baseline.vcf.gz")
    if os.path.exists(tp_baseline):
        out["tp"] = tp_baseline
        out["tp-calls"] = tp_calls
    else:
        out["tp"] = tp_calls
    return out
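A small sketch of how the adjusted JVM options are split into the stack and heap pieces exported as RTG_JAVA_OPTS and RTG_MEM above; the options list here is a hypothetical result of adjust_opts:

# Hypothetical adjusted JVM options
memory = ["-Xms500m", "-Xmx6g"]
jvm_stack = [x for x in memory if x.startswith("-Xms")]
jvm_mem = [x for x in memory if x.startswith("-Xmx")]
jvm_stack = jvm_stack[0] if jvm_stack else "-Xms500m"
jvm_mem = jvm_mem[0].replace("-Xmx", "") if jvm_mem else "3g"
print(jvm_stack, jvm_mem)  # -Xms500m 6g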
Example #49
0
def _vardict_options_from_config(items,
                                 config,
                                 out_file,
                                 target=None,
                                 is_rnaseq=False):
    var2vcf_opts = []
    opts = ["-c 1", "-S 2", "-E 3", "-g 4"]
    # ["-z", "-F", "-c", "1", "-S", "2", "-E", "3", "-g", "4", "-x", "0",
    #  "-k", "3", "-r", "4", "-m", "8"]
    cores = dd.get_num_cores(items[0])
    if cores and cores > 1:
        opts += ["-th", str(cores)]
    # Disable SV calling for VarDict by default: it causes issues with regional
    # analysis by detecting SVs outside of target regions, which messes up merging.
    # SV calling will be worked on as a separate step.
    # Use tools_on: vardict_sv to turn SV calling in VarDict on (experimental).
    tools_on = dd.get_in_samples(items, dd.get_tools_on)
    vardict_sv_on = tools_on and "vardict_sv" in tools_on
    vardict_cl = get_vardict_command(items[0])
    version = programs.get_version_manifest(vardict_cl)
    # turn off structural variants
    if ((vardict_cl and version and
         ((vardict_cl == "vardict-java"
           and LooseVersion(version) >= LooseVersion("1.5.5")) or
          (vardict_cl == "vardict"))) and not vardict_sv_on):
        opts += ["--nosv"]
    if (vardict_cl and version
            and (vardict_cl == "vardict-java"
                 and LooseVersion(version) >= LooseVersion("1.5.6"))):
        opts += ["--deldupvar"]
    # remove low mapping quality reads
    if not is_rnaseq:
        opts += ["-Q", "10"]
    # Remove QCfail reads, avoiding high depth repetitive regions
    opts += ["-F", "0x700"]
    resources = config_utils.get_resources("vardict", config)
    if resources.get("options"):
        opts += [str(x) for x in resources["options"]]
    resources = config_utils.get_resources("var2vcf", config)
    if resources.get("options"):
        var2vcf_opts += [str(x) for x in resources["options"]]
    if target and _is_bed_file(target):
        target = _enforce_max_region_size(target, items[0])
        opts += [target]  # this must be the last option
    _add_freq_options(config, opts, var2vcf_opts)
    return " ".join(opts), " ".join(var2vcf_opts)
Example #50
0
def identify(data):
    """Identify high depth regions in the alignment file for potential filtering.
    """
    high_multiplier = 20
    sample_size = int(1e6)
    high_percentage = 25.0
    min_coverage = 10
    window_size = 250
    work_bam, out_file, stats_file = _get_files(data)
    if not os.path.exists(out_file):
        cores = dd.get_num_cores(data)
        with file_transaction(data, out_file) as tx_out_file:
            tx_raw_file = "%s-raw%s" % utils.splitext_plus(tx_out_file)
            py_cl = os.path.join(os.path.dirname(sys.executable), "py")
            cmd = (
                "sambamba depth window -t {cores} -c {min_coverage} "
                "--window-size {window_size} {work_bam} "
                "| head -n {sample_size} "
                """| cut -f 5 | {py_cl} -l 'numpy.median([float(x) for x in l if not x.startswith("mean")])'"""
            )
            median_cov = float(
                subprocess.check_output(cmd.format(**locals()), shell=True))
            if not numpy.isnan(median_cov):
                high_thresh = int(high_multiplier * median_cov)
                cmd = (
                    "sambamba depth window -t {cores} -c {median_cov} "
                    "--window-size {window_size} -T {high_thresh} {work_bam} "
                    "| {py_cl} -fx 'float(x.split()[5]) >= {high_percentage} "
                    """if not x.startswith("#") else None' """
                    "| cut -f 1-3,7 > {tx_raw_file} ")
                do.run(cmd.format(**locals()),
                       "Identify high coverage regions")
                with open(stats_file, "w") as out_handle:
                    yaml.safe_dump({"median_cov": median_cov},
                                   out_handle,
                                   allow_unicode=False,
                                   default_flow_style=False)
            else:
                with open(tx_raw_file, "w") as out_handle:
                    out_handle.write("")
            if utils.file_exists(tx_raw_file):
                cmd = "bedtools merge -i {tx_raw_file} -c 4 -o distinct > {tx_out_file}"
                do.run(cmd.format(**locals()), "Clean up raw coverage file")
            else:
                shutil.move(tx_raw_file, tx_out_file)
    return out_file if os.path.exists(out_file) else None
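To make the thresholding concrete, a throwaway sketch of how the high-coverage cutoff is derived from the sampled median (the median itself is made up):

high_multiplier = 20
median_cov = 32.0  # hypothetical median from the sambamba depth sample
high_thresh = int(high_multiplier * median_cov)
print(high_thresh)  # 640: windows at or above this mean depth get flagged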
Example #51
0
def remove_duplicates(in_bam, data):
    """
    Remove duplicates from a duplicate-marked BAM file.
    """
    base, ext = os.path.splitext(in_bam)
    out_bam = base + "-noduplicates" + ext
    if utils.file_exists(out_bam):
        return out_bam
    num_cores = dd.get_num_cores(data)
    sambamba = config_utils.get_program("sambamba", data)
    with file_transaction(out_bam) as tx_out_bam:
        cmd = (f'{sambamba} view -h --nthreads {num_cores} -f bam -F "not duplicate" '
               f'{in_bam} > {tx_out_bam}')
        message = f"Removing duplicates from {in_bam}, saving as {out_bam}."
        do.run(cmd, message)
    index(out_bam, dd.get_config(data))
    return out_bam
Example #52
0
def has_nalignments(in_bam, n, data, filter=None):
    """
    Does a BAM file have at least n alignments?
    """
    sambamba = config_utils.get_program("sambamba", dd.get_config(data))
    num_cores = dd.get_num_cores(data)
    if not filter:
        filter_string = ""
        message = f"Counting alignments in {in_bam}."
    else:
        filter_string = "--filter {filter}"
        message = f"Counting alignments in {in_bam} matching {filter}."
    cmd = f"{sambamba} view -f sam {filter_string} {in_bam} | head -{n} | wc -l"
    logger.info(message)
    result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE)
    return int(result.stdout.decode().strip()) >= n
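For illustration, the counting approach boils down to streaming alignments, stopping after n, and counting lines; a sketch with hypothetical stand-ins for the values resolved above:

# Hypothetical values; the function resolves these from the sample data
sambamba = "sambamba"
in_bam = "sample.bam"
n = 1000
filter_string = "--filter 'not duplicate'"
cmd = f"{sambamba} view -f sam {filter_string} {in_bam} | head -{n} | wc -l"
print(cmd)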
Example #53
0
def bqsr_table(data):
    """Generate recalibration tables as inputs to BQSR.
    """
    in_file = dd.get_align_bam(data)
    out_file = "%s-recal-table.txt" % utils.splitext_plus(in_file)[0]
    if not utils.file_uptodate(out_file, in_file):
        with file_transaction(data, out_file) as tx_out_file:
            assoc_files = dd.get_variation_resources(data)
            known = "-k %s" % (
                assoc_files.get("dbsnp")) if "dbsnp" in assoc_files else ""
            license = license_export(data)
            cores = dd.get_num_cores(data)
            ref_file = dd.get_ref_file(data)
            cmd = ("{license}sentieon driver -t {cores} -r {ref_file} "
                   "-i {in_file} --algo QualCal {known} {tx_out_file}")
            do.run(cmd.format(**locals()), "Sentieon QualCal generate table")
    return out_file
Example #54
0
def _run_break_point_inspector(data, variant_file, paired):
    output_vcf = "%s-%s.vcf.gz" % (utils.splitext_plus(variant_file)[0], "bpi")
    if not utils.file_exists(output_vcf):
        with file_transaction(data, output_vcf) as tx_output_vcf:
            cores = dd.get_num_cores(data)
            resources = config_utils.get_resources("break-point-inspector", data["config"])
            memory = config_utils.adjust_opts(resources.get("jvm_opts", ["-Xms1000m", "-Xmx2000m"]),
                                              {"algorithm": {"memory_adjust": {"magnitude": cores,
                                                                               "direction": "increase"}}})
            cmd = ["break-point-inspector"]
            cmd += memory
            cmd += ["-vcf", variant_file]
            if paired:
                cmd += ["-ref", paired.normal_bam, "-tumor", paired.tumor_bam]
            cmd += ["-output_vcf", tx_output_vcf]
            do.run(cmd, "Running Break Point Inspector for Manta SV calls")
    return output_vcf
Example #55
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    paired = True if pair_file else False
    hisat2 = config_utils.get_program("hisat2", data)
    num_cores = dd.get_num_cores(data)
    quality_flag = _get_quality_flag(data)
    stranded_flag = _get_stranded_flag(data, paired)
    rg_flags = _get_rg_flags(names)
    out_file = os.path.join(align_dir, "{0}-sort.bam".format(dd.get_sample_name(data)))
    if data.get("align_split"):
        final_file = out_file
        out_file, data = alignprep.setup_combine(final_file, data)
        fastq_file, pair_file = alignprep.split_namedpipe_cls(fastq_file, pair_file, data)
    else:
        final_file = None
    if not file_exists(out_file) and (final_file is None or not file_exists(final_file)):
        cmd = ("{hisat2} --new-summary -x {ref_file} -p {num_cores} {quality_flag} {stranded_flag} "
               "{rg_flags} ")
        if paired:
            cmd += "-1 {fastq_file} -2 {pair_file} "
        else:
            cmd += "-U {fastq_file} "
        if dd.get_analysis(data).lower() == "smallrna-seq":
            cmd += "-k 1000 "
        # if assembling transcripts, set flags that cufflinks/stringtie can use
        if dd.get_transcript_assembler(data):
            cmd += "--dta-cufflinks "
        if dd.get_analysis(data).lower() == "rna-seq":
            gtf_file = dd.get_gtf_file(data)
            splicesites = os.path.join(os.path.dirname(gtf_file),
                                       "ref-transcripts-splicesites.txt")
            if not file_exists(splicesites):
                splicesites = create_splicesites_file(gtf_file, align_dir, data)
            # an empty splicesites file means there is no splicing for this
            # organism, so skip this option
            if file_exists(splicesites):
                cmd += "--known-splicesite-infile {splicesites} "
        # apply additional hisat2 options
        cmd += " ".join(_get_options_from_config(data))

        message = "Aligning %s and %s with hisat2." % (fastq_file, pair_file)
        with postalign.tobam_cl(data, out_file, pair_file is not None) as (tobam_cl, tx_out_file):
            cmd += " | " + tobam_cl
            do.run(cmd.format(**locals()), message)
    data = dd.set_work_bam(data, out_file)
    return data
Example #56
0
def _fastp_trim(fastq_files, adapters, out_dir, data):
    """Perform multicore trimming with fastp (https://github.com/OpenGene/fastp)
    """
    report_file = os.path.join(
        out_dir, "%s-report.json" %
        utils.splitext_plus(os.path.basename(fastq_files[0]))[0])
    out_files = [
        os.path.join(
            out_dir,
            "%s-trimmed.fq.gz" % utils.splitext_plus(os.path.basename(x))[0])
        for x in fastq_files
    ]
    if not utils.file_exists(out_files[0]):
        with file_transaction(data, *[report_file] + out_files) as tx_out:
            tx_report = tx_out[0]
            tx_out_files = tx_out[1:]
            cmd = ["fastp", "--thread", dd.get_num_cores(data)]
            if dd.get_quality_format(data).lower() == "illumina":
                cmd += ["--phred64"]
            for i, (inf, outf) in enumerate(zip(fastq_files, tx_out_files)):
                if i == 0:
                    cmd += ["-i", inf, "-o", outf]
                else:
                    cmd += ["-I", inf, "-O", outf]
            cmd += [
                "--cut_by_quality3", "--cut_mean_quality", "5",
                "--length_required",
                str(dd.get_min_read_length(data)),
                "--disable_quality_filtering"
            ]
            if "polyx" in dd.get_adapters(data):
                cmd += ["--trim_poly_x", "--poly_x_min_len", "8"]
            if "polyx" in dd.get_adapters(data) or "polyg" in dd.get_adapters(
                    data):
                cmd += ["--trim_poly_g", "--poly_g_min_len", "8"]
            for a in adapters:
                cmd += ["--adapter_sequence", a]
            if not adapters:
                cmd += ["--disable_adapter_trimming"]
            cmd += [
                "--json", report_file, "--report_title",
                dd.get_sample_name(data)
            ]
            do.run(cmd, "Trimming with fastp: %s" % dd.get_sample_name(data))
    return out_files, report_file
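For reference, a hedged sketch of the kind of argument list the code above ends up building for a single-end input; every value here is a hypothetical placeholder rather than output from a real run:

# Hypothetical single-end fastp invocation assembled from the flags used above
cmd = ["fastp", "--thread", "8",
       "-i", "sample_R1.fastq.gz", "-o", "sample_R1-trimmed.fq.gz",
       "--cut_by_quality3", "--cut_mean_quality", "5",
       "--length_required", "25", "--disable_quality_filtering",
       "--adapter_sequence", "AGATCGGAAGAGC",  # hypothetical adapter sequence
       "--json", "sample_R1-report.json", "--report_title", "sample"]
print(" ".join(cmd))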
Example #57
0
def calculate(bam_file, data):
    """Calculate coverage in parallel using samtools depth through goleft.

    samtools depth removes duplicates and secondary reads from the counts:
    if ( b->core.flag & (BAM_FUNMAP | BAM_FSECONDARY | BAM_FQCFAIL | BAM_FDUP) ) continue;
    """
    params = {"window_size": 5000, "parallel_window_size": 1e5, "min": dd.get_coverage_depth_min(data),
              "high_multiplier": 20}
    prefix = os.path.join(
        utils.safe_makedir(os.path.join(dd.get_work_dir(data), "align", dd.get_sample_name(data))),
        "%s-coverage" % (dd.get_sample_name(data)))
    out_file = prefix + ".depth.bed"
    callable_file = prefix + ".callable.bed"
    variant_regions = dd.get_variant_regions_merged(data)
    variant_regions_avg_cov = get_average_coverage(data, bam_file, variant_regions,
                                                   "variant_regions", file_prefix=prefix)
    if not utils.file_uptodate(out_file, bam_file):
        ref_file = dd.get_ref_file(data)
        cmd = ["goleft", "depth", "--windowsize", str(params["window_size"]), "--q", "1",
               "--mincov", str(params["min"]), "--reference", ref_file,
               "--processes", str(dd.get_num_cores(data)), "--stats", "--ordered"]
        window_file = "%s-tocalculate-windows.bed" % utils.splitext_plus(out_file)[0]
        if not utils.file_uptodate(window_file, bam_file):
            with file_transaction(data, window_file) as tx_out_file:
                if not variant_regions:
                    variant_regions = "%s-genome.bed" % utils.splitext_plus(tx_out_file)[0]
                    with open(variant_regions, "w") as out_handle:
                        for c in shared.get_noalt_contigs(data):
                            out_handle.write("%s\t%s\t%s\n" % (c.name, 0, c.size))
                pybedtools.BedTool().window_maker(w=params["parallel_window_size"],
                                                  b=pybedtools.BedTool(variant_regions)).saveas(tx_out_file)
        cmd += ["--bed", window_file]
        max_depth = _get_max_depth(variant_regions_avg_cov, params, data)
        if max_depth:
            cmd += ["--maxmeandepth", str(int(max_depth))]
        with file_transaction(data, out_file) as tx_out_file:
            with utils.chdir(os.path.dirname(tx_out_file)):
                tx_callable_file = tx_out_file.replace(".depth.bed", ".callable.bed")
                prefix = tx_out_file.replace(".depth.bed", "")
                cmd += ["--prefix", prefix, bam_file]
                bcbio_env = utils.get_bcbio_env()
                msg = "Calculate coverage: %s" % dd.get_sample_name(data)
                do.run(cmd, msg, env=bcbio_env)
                shutil.move(tx_callable_file, callable_file)
    return out_file, callable_file, _extract_highdepth(callable_file, data), variant_regions_avg_cov
Example #58
0
def align(fastq_file, pair_file, ref_file, names, align_dir, data):
    assert data["analysis"].lower().startswith(
        "wgbs-seq"), "No comparible alignment."
    config = data["config"]
    sample = dd.get_sample_name(data)
    out_prefix = os.path.join(align_dir, dd.get_lane(data))
    out_dir = os.path.join(align_dir, "%s_bismark" % dd.get_lane(data))

    if not ref_file:
        logger.error(
            "bismark index not found. We don't provide the STAR indexes "
            "by default because they are very large. You can install "
            "the index for your genome with: bcbio_nextgen.py upgrade "
            "--aligners bismark --genomes genome-build-name --data")
        sys.exit(1)

    final_out = os.path.join(align_dir, "{0}.bam".format(sample))
    if file_exists(final_out):
        data = dd.set_work_bam(data, final_out)
        data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
        return data

    bismark = config_utils.get_program("bismark", config)
    fastq_files = " ".join([fastq_file, pair_file
                            ]) if pair_file else fastq_file
    num_cores = dd.get_num_cores(data)
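    # Bismark's --multicore launches parallel alignment instances, and each instance
    # itself uses several cores, so only ask for a second instance when cores allow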
    n = 1 if num_cores < 5 else 2
    safe_makedir(align_dir)
    cmd = "{bismark} --bowtie2 --temp_dir {tx_out_dir} --gzip --multicore {n} -o {tx_out_dir} --unmapped {ref_file} {fastq_file}"
    if pair_file:
        fastq_file = "-1 %s -2 %s" % (fastq_file, pair_file)
    raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    if not raw_bam:
        with tx_tmpdir() as tx_out_dir:
            run_message = "Running Bismark aligner on %s and %s" % (fastq_file,
                                                                    ref_file)
            do.run(cmd.format(**locals()), run_message, None)
            shutil.move(tx_out_dir, out_dir)
        raw_bam = glob.glob(out_dir + "/*bismark*bt2*bam")
    process_bam = _process_bam(raw_bam[0], fastq_files, sample,
                               dd.get_sam_ref(data), config)
    utils.symlink_plus(process_bam, final_out)
    data = dd.set_work_bam(data, final_out)
    data["bam_report"] = glob.glob(os.path.join(out_dir, "*report.txt"))[0]
    return data
Example #59
0
def _bam_coverage(name, bam_input, data):
    """Run bamCoverage from deeptools"""
    cmd = ("{bam_coverage} -b {bam_input} -o {bw_output} "
           "--binSize 20 --effectiveGenomeSize {size} "
           "--smoothLength 60 --extendReads 150 --centerReads -p {cores}")
    size = int(get_genome(dd.get_genome_build(data)))
    cores = dd.get_num_cores(data)
    try:
        bam_coverage = config_utils.get_program("bamCoverage", data)
    except config_utils.CmdNotFound:
        logger.info("No bamCoverage found, skipping bamCoverage.")
        return None
    bw_output = os.path.join(os.path.dirname(bam_input), "%s.bw" % name)
    if utils.file_exists(bw_output):
        return bw_output
    with file_transaction(bw_output) as out_tx:
        do.run(cmd.format(**locals()), "Run bamCoverage in %s" % name)
    return bw_output
Example #60
0
def downsample(in_bam, data, target_counts, work_dir=None):
    """Downsample a BAM file to the specified number of target counts.
    """
    index(in_bam, data["config"], check_timestamp=False)
    ds_pct = get_downsample_pct(in_bam, target_counts, data)
    if ds_pct:
        out_file = "%s-downsample%s" % os.path.splitext(in_bam)
        if work_dir:
            out_file = os.path.join(work_dir, os.path.basename(out_file))
        if not utils.file_exists(out_file):
            with file_transaction(data, out_file) as tx_out_file:
                samtools = config_utils.get_program("samtools", data["config"])
                num_cores = dd.get_num_cores(data)
                ds_pct = "42." + "{ds_pct:.3}".format(ds_pct=ds_pct).replace("0.", "")
                cmd = ("{samtools} view -O BAM -@ {num_cores} -o {tx_out_file} "
                       "-s {ds_pct} {in_bam}")
                do.run(cmd.format(**locals()), "Downsample BAM file: %s" % os.path.basename(in_bam))
        return out_file
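samtools view -s takes a single seed.fraction argument; the string handling above pins the seed to 42 and appends the subsampling fraction. A minimal sketch of that formatting with a hypothetical fraction:

# samtools view -s INT.FRAC: integer part is the random seed, decimal part the fraction kept
ds_pct = 0.237  # hypothetical downsampling fraction
arg = "42." + "{ds_pct:.3}".format(ds_pct=ds_pct).replace("0.", "")
print(arg)  # 42.237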