Example #1
0
def ref_genome_info(info, config, dirs):
    """Look up the genome build of a sample and its reference file.

    The build name is read from ``info["genome_build"]`` (``None`` when the
    key is absent) and resolved through ``get_genome_ref`` together with the
    configured aligner and the galaxy directory.

    Returns a ``(genome_build, sam_ref)`` tuple, where ``sam_ref`` is the
    second item of the pair produced by ``get_genome_ref``.
    """
    build = info.get("genome_build", None)
    aligner = config["algorithm"]["aligner"]
    galaxy_dir = dirs["galaxy"]
    # Only the second element of the returned pair is needed here.
    _, sam_ref = get_genome_ref(build, aligner, galaxy_dir)
    return build, sam_ref
Example #2
0
def convert_bam_to_fastq(in_file, work_dir, item, dirs, config):
    """Convert BAM input file into FASTQ files.

    Output is written below ``<work_dir>/fastq_convert``. When the
    ``quality_bin`` algorithm setting equals (or is a list containing)
    ``"prealignment"``, the input is first passed through
    ``cram.illumina_qual_bin`` and the binned file is converted instead.

    Returns ``[out1, out2]``. ``out2`` is ``None`` when ``_is_paired``
    reports the input as unpaired, or when the second FASTQ file is empty.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment" or
         (isinstance(qual_bin_method, list) and "prealignment" in qual_bin_method)):
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None, dirs["galaxy"])
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, sam_ref, out_bindir, config)

    out_files = [os.path.join(out_dir, "{0}_{1}.fastq".format(
                 os.path.splitext(os.path.basename(in_file))[0], x))
                 for x in ["1", "2"]]
    if _is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    # Skip the conversion when the first output is already present.
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    # Unpaired input leaves out2 as None; os.path.getsize(None) would raise
    # TypeError, so only inspect the size when there is a real path.
    if out2 is not None and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #3
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = alignment.get_genome_ref(rm_genome, None, data["dirs"]["galaxy"])[-1]
        if rm_genome != data["sam_ref"]:
            eval_genome = data["sam_ref"]
        else:
            eval_genome = None
    else:
        eval_genome = None
        rm_genome = data["sam_ref"]
    ref_call = {"file": rm_file, "name": "ref", "type": "grading-ref",
                "preclean": True, "prep": True, "remove-refcalls": True}
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    exp = {"sample": data["name"][-1],
           "ref": rm_genome,
           "approach": "grade",
           "calls": [ref_call, eval_call]}
    if data.get("callable_bam"):
        exp["align"] = data["callable_bam"]
        intervals = ensemble.get_analysis_intervals(data)
        if intervals:
            exp["intervals"] = os.path.abspath(intervals)
    return {"dir": {"base": base_dir, "out": "work", "prep": "work/prep"},
            "experiments": [exp]}
Example #4
0
def ref_genome_info(info, config, dirs):
    """Return the genome build named in ``info`` with its reference file.

    ``get_genome_ref`` is called with the build (``None`` if ``info`` has no
    ``genome_build`` entry), the aligner from ``config["algorithm"]`` and the
    galaxy directory from ``dirs``; the second item of its result is returned
    alongside the build name.
    """
    build_name = info.get("genome_build", None)
    lookup = get_genome_ref(build_name,
                            config["algorithm"]["aligner"],
                            dirs["galaxy"])
    # The first element of the pair is not used by this function.
    _, reference = lookup
    return build_name, reference
Example #5
0
def convert_bam_to_fastq(in_file, work_dir, item, dirs, config):
    """Convert BAM input file into FASTQ files.

    Output is written below ``<work_dir>/fastq_convert``. When the
    ``quality_bin`` algorithm setting equals (or is a list containing)
    ``"prealignment"``, the input is first passed through
    ``cram.illumina_qual_bin`` and the binned file is converted instead.

    Returns ``[out1, out2]``. ``out2`` is ``None`` when ``_is_paired``
    reports the input as unpaired, or when the second FASTQ file is empty.
    """
    out_dir = safe_makedir(os.path.join(work_dir, "fastq_convert"))

    qual_bin_method = config["algorithm"].get("quality_bin")
    if (qual_bin_method == "prealignment"
            or (isinstance(qual_bin_method, list)
                and "prealignment" in qual_bin_method)):
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None,
                                              dirs["galaxy"])
        out_bindir = safe_makedir(os.path.join(out_dir, "qualbin"))
        in_file = cram.illumina_qual_bin(in_file, sam_ref, out_bindir, config)

    out_files = [
        os.path.join(
            out_dir, "{0}_{1}.fastq".format(
                os.path.splitext(os.path.basename(in_file))[0], x))
        for x in ["1", "2"]
    ]
    if _is_paired(in_file):
        out1, out2 = out_files
    else:
        out1 = out_files[0]
        out2 = None
    # Skip the conversion when the first output is already present.
    if not file_exists(out1):
        broad_runner = broad.runner_from_config(config)
        broad_runner.run_fn("picard_bam_to_fastq", in_file, out1, out2)
    # Unpaired input leaves out2 as None; os.path.getsize(None) would raise
    # TypeError, so only inspect the size when there is a real path.
    if out2 is not None and os.path.getsize(out2) == 0:
        out2 = None
    return [out1, out2]
Example #6
0
def process_sample(sample_name, fastq_files, info, bam_files, dirs,
                   config, config_file):
    """Run the final per-sample steps on merged lane data.

    The FASTQ and BAM inputs are combined, then (depending on the algorithm
    settings) the merged BAM is recalibrated, genotyped and annotated with
    variation effects, transcripts are assembled, an alignment summary is
    generated and a wig file is produced.

    Returns ``[sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
    effects_file]``; the last three are empty strings for steps not run.
    """
    config = _update_config_w_custom(config, info)

    genome_build = info.get("genome_build", None)
    algorithm = config["algorithm"]
    _, sam_ref = get_genome_ref(genome_build, algorithm["aligner"],
                                dirs["galaxy"])
    fastq1, fastq2 = combine_fastq_files(fastq_files, dirs["work"])
    label = str(sample_name)
    log.info("Combining and preparing wig file %s" % label)
    sort_bam = merge_bam_files(bam_files, dirs["work"], config)

    gatk_bam = vrn_file = effects_file = ""
    if algorithm["recalibrate"]:
        log.info("Recalibrating %s with GATK" % label)
        gatk_bam = recalibrate_quality(sort_bam, fastq1, fastq2, sam_ref,
                                       dirs, config)
        # Variant calling is only attempted on recalibrated output.
        if algorithm["snpcall"]:
            log.info("SNP genotyping %s with GATK" % label)
            vrn_file = run_genotyper(gatk_bam, sam_ref, config)
            log.info("Calculating variation effects for %s" % label)
            effects_file = variation_effects(vrn_file, genome_build, config)

    if algorithm.get("transcript_assemble", False):
        # The result is not part of the return value.
        assemble_transcripts(sort_bam, sam_ref, config)

    if sam_ref is not None:
        log.info("Generating summary files: %s" % label)
        is_paired = fastq2 is not None
        generate_align_summary(sort_bam, is_paired, sam_ref,
                               sample_name, config, dirs)
    bam_to_wig(sort_bam, config, config_file)

    return [sample_name, fastq_files, info, sort_bam, gatk_bam, vrn_file,
            effects_file]
Example #7
0
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc, dirs, config,
                    config_file):
    """Align a lane and post-process the full BAM file.

    When no first FASTQ is given and ``info`` carries a ``vrn_file``, no
    alignment is run: variant calling is switched off in ``config`` and a
    data dictionary pointing at the existing variant file is returned.
    Otherwise the reads are aligned, callable regions are computed with
    ``callable.block_regions`` and the result is passed through
    ``_recal_no_markduplicates``.

    Returns a single-element list holding the data dictionary.
    """
    variants_only = fastq1 is None and "vrn_file" in info
    if variants_only:
        build = info["genome_build"]
        _, ref_file = get_genome_ref(build, None, dirs["galaxy"])
        # Disable calling: the variants already exist.
        config["algorithm"]["variantcaller"] = ""
        data = {"info": info,
                "sam_ref": ref_file,
                "work_bam": None,
                "genome_build": info["genome_build"],
                "name": ("", lane_desc),
                "vrn_file": info["vrn_file"],
                "dirs": copy.deepcopy(dirs),
                "config": config}
        return [data]

    aligned = process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                                dirs, config)
    data = _organize_merge_samples(aligned[0], dirs, config_file)
    callable_bed, regions = callable.block_regions(
        data["work_bam"], data["sam_ref"], config)
    data["regions"] = regions
    if os.path.exists(callable_bed):
        algorithm = data["config"]["algorithm"]
        # Only fill in variant regions when none were configured.
        if not algorithm.get("variant_regions"):
            algorithm["variant_regions"] = callable_bed
    data["callable_bam"] = data["work_bam"]
    data = _recal_no_markduplicates(data)
    return [data]
Example #8
0
def align_prep_full(fastq1, fastq2, info, lane_name, lane_desc,
                    dirs, config, config_file):
    """Produce the fully processed BAM data for a lane.

    Inputs that have no first FASTQ but do carry a ``vrn_file`` in ``info``
    skip alignment: the variant caller setting is blanked and a data
    dictionary referencing the existing variant file is built. All other
    inputs are aligned, annotated with the regions reported by
    ``callable.block_regions`` and handed to ``_recal_no_markduplicates``.

    Returns a single-element list holding the data dictionary.
    """
    if not (fastq1 is None and "vrn_file" in info):
        lane_result = process_alignment(fastq1, fastq2, info, lane_name,
                                        lane_desc, dirs, config)[0]
        data = _organize_merge_samples(lane_result, dirs, config_file)
        work_bam = data["work_bam"]
        region_bed, analysis_regions = callable.block_regions(
            work_bam, data["sam_ref"], config)
        data["regions"] = analysis_regions
        if os.path.exists(region_bed):
            algo_settings = data["config"]["algorithm"]
            # Keep any variant regions that were configured explicitly.
            if not algo_settings.get("variant_regions"):
                algo_settings["variant_regions"] = region_bed
        data["callable_bam"] = data["work_bam"]
        data = _recal_no_markduplicates(data)
    else:
        _, ref_file = get_genome_ref(info["genome_build"], None,
                                     dirs["galaxy"])
        # Variants are supplied up front, so calling is turned off.
        config["algorithm"]["variantcaller"] = ""
        data = {
            "info": info,
            "sam_ref": ref_file,
            "work_bam": None,
            "genome_build": info["genome_build"],
            "name": ("", lane_desc),
            "vrn_file": info["vrn_file"],
            "dirs": copy.deepcopy(dirs),
            "config": config,
        }
    return [data]
Example #9
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                      dirs, config):
    """Turn the reads of one lane into a working BAM file.

    If ``fastq1`` exists and an aligner is configured, the reads are aligned
    with ``align_to_sort_bam``. If ``fastq1`` exists and is a ``.bam`` file it
    is sorted (``bam_sort``), cleaned with Picard (``bam_clean``) or linked
    into the work directory, in that order of preference.

    Returns a single-element list with a dictionary holding the input
    files, the working BAM, ``info``, the reference and the configuration.

    Raises:
        ValueError: if no BAM was produced and ``fastq1`` does not exist.
    """
    algorithm = config["algorithm"]
    aligner = algorithm.get("aligner", None)
    work_bam = ""
    rg_info = rg_names(lane_name, lane_desc, config)
    _, ref_file = get_genome_ref(info["genome_build"], None, dirs["galaxy"])

    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        # The aligner reports the reference it actually used.
        work_bam, ref_file = align_to_sort_bam(
            fastq1, fastq2, rg_info, info["genome_build"], aligner,
            dirs, config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_order = config["algorithm"].get("bam_sort")
        clean_mode = config["algorithm"].get("bam_clean")
        if sort_order:
            picard = broad.runner_from_config(config)
            stem = os.path.splitext(os.path.basename(fastq1))[0]
            sorted_path = os.path.join(dirs["work"],
                                       "{}-sort.bam".format(stem))
            work_bam = picard.run_fn("picard_sort", fastq1, sort_order,
                                     sorted_path)
        elif clean_mode is True or clean_mode == "picard":
            work_bam = cleanbam.picard_prep(fastq1, rg_info, ref_file, dirs,
                                            config)
        else:
            link_dir = os.path.join(dirs["work"], "prealign",
                                    rg_info["sample"])
            work_bam = link_bam_file(fastq1, link_dir)

    if not work_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)

    result = {"fastq": [fastq1, fastq2], "work_bam": work_bam, "info": info,
              "sam_ref": ref_file, "config": config}
    return [result]
Example #10
0
def process_alignment(data):
    """Align or prepare the input files in ``data`` and record the BAM.

    ``data["files"]`` supplies the two inputs. An existing first file is
    aligned when an aligner is configured; an existing ``.bam`` first file is
    otherwise sorted, cleaned with Picard or linked into the work directory
    and then checked with ``_check_prealigned_bam``. On return ``data`` has
    ``sam_ref`` (looked up via ``get_genome_ref``) and ``work_bam`` set.

    Returns ``[[data]]``.

    Raises:
        ValueError: if no BAM was produced and the first input is missing.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    work_bam = ""

    if os.path.exists(fastq1) and aligner:
        lane = data["rgnames"]["lane"]
        logger.info("Aligning lane %s with %s aligner" % (lane, aligner))
        # The reference returned by the aligner is not used here.
        work_bam, _ = align_to_sort_bam(
            fastq1, fastq2, data["rgnames"], data["genome_build"], aligner,
            data["dirs"], data["config"])
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_order = config["algorithm"].get("bam_sort")
        clean_mode = config["algorithm"].get("bam_clean")
        if sort_order:
            picard = broad.runner_from_config(config)
            stem = os.path.splitext(os.path.basename(fastq1))[0]
            sorted_path = os.path.join(data["dirs"]["work"],
                                       "{}-sort.bam".format(stem))
            work_bam = picard.run_fn("picard_sort", fastq1, sort_order,
                                     sorted_path)
        elif clean_mode is True or clean_mode == "picard":
            work_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                            data["sam_ref"], data["dirs"],
                                            config)
        else:
            link_dir = os.path.join(data["dirs"]["work"], "prealign",
                                    data["rgnames"]["sample"])
            work_bam = link_bam_file(fastq1, link_dir)
        _check_prealigned_bam(fastq1, data["sam_ref"], config)

    if not work_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)

    galaxy_dir = data["dirs"]["galaxy"]
    data["sam_ref"] = get_genome_ref(data["genome_build"], None,
                                     galaxy_dir)[-1]
    data["work_bam"] = work_bam
    return [[data]]
Example #11
0
def split_read_files(fastq1, fastq2, item, split_size, out_dir, dirs, config):
    """Split reads into chunks, choosing the splitter by input type.

    A lone ``.bam`` input (no second file) goes to ``split_bam_file``, after
    optional quality binning when the ``quality_bin`` setting equals or
    contains ``"prealignment"``. Everything else goes to
    ``split_fastq_files``.
    """
    single_bam = fastq1.endswith(".bam") and fastq2 is None
    if not single_bam:
        return split_fastq_files(fastq1, fastq2, split_size, out_dir, config)

    binning = config["algorithm"].get("quality_bin")
    wants_prealign_binning = (
        binning == "prealignment"
        or (isinstance(binning, list) and "prealignment" in binning))
    if wants_prealign_binning:
        _, sam_ref = alignment.get_genome_ref(item["genome_build"], None,
                                              dirs["galaxy"])
        bin_dir = utils.safe_makedir(os.path.join(out_dir, "qualbin"))
        # Continue with the quality-binned file instead of the original.
        fastq1 = cram.illumina_qual_bin(fastq1, sam_ref, bin_dir, config)
    return split_bam_file(fastq1, split_size, out_dir, config)
Example #12
0
def _create_validate_config(vrn_file, rm_file, rm_interval_file, rm_genome,
                            base_dir, data):
    """Create a bcbio.variation configuration input for validation.
    """
    if rm_genome:
        rm_genome = alignment.get_genome_ref(rm_genome, None,
                                             data["dirs"]["galaxy"])[-1]
        if rm_genome != data["sam_ref"]:
            eval_genome = data["sam_ref"]
        else:
            eval_genome = None
    else:
        eval_genome = None
        rm_genome = data["sam_ref"]
    ref_call = {
        "file": rm_file,
        "name": "ref",
        "type": "grading-ref",
        "preclean": True,
        "prep": True,
        "remove-refcalls": True
    }
    if rm_interval_file:
        ref_call["intervals"] = rm_interval_file
    eval_call = {"file": vrn_file, "name": "eval", "remove-refcalls": True}
    if eval_genome:
        eval_call["ref"] = eval_genome
        eval_call["preclean"] = True
        eval_call["prep"] = True
    exp = {
        "sample": data["name"][-1],
        "ref": rm_genome,
        "approach": "grade",
        "calls": [ref_call, eval_call]
    }
    if data.get("callable_bam"):
        exp["align"] = data["callable_bam"]
        intervals = ensemble.get_analysis_intervals(data)
        if intervals:
            exp["intervals"] = os.path.abspath(intervals)
    return {
        "dir": {
            "base": base_dir,
            "out": "work",
            "prep": "work/prep"
        },
        "experiments": [exp]
    }
Example #13
0
def align_prep_full(data, config_file):
    """Align the inputs in ``data`` and post-process the full BAM file.

    The reference for ``data["genome_build"]`` is stored as ``sam_ref``
    first. Inputs with no first file but with a ``vrn_file`` entry skip
    alignment: the variant caller setting is blanked and ``work_bam`` is set
    to ``None``. Otherwise the data is aligned, the ``nblock`` regions from
    ``callable.block_regions`` are attached and the result is passed through
    ``_recal_no_markduplicates``.

    Returns a single-element list holding the data dictionary.
    """
    _, genome_ref = get_genome_ref(data["genome_build"], None,
                                   data["dirs"]["galaxy"])
    data["sam_ref"] = genome_ref

    variants_only = data["files"][0] is None and "vrn_file" in data
    if variants_only:
        data["config"]["algorithm"]["variantcaller"] = ""
        data["work_bam"] = None
        return [data]

    # process_alignment wraps its result in a nested single-item list.
    data = process_alignment(data)[0][0]
    region_bed, nblock_bed = callable.block_regions(
        data["work_bam"], data["sam_ref"], data["config"])
    data["regions"] = {"nblock": nblock_bed}
    if os.path.exists(region_bed):
        algo_settings = data["config"]["algorithm"]
        # Keep any variant regions that were configured explicitly.
        if not algo_settings.get("variant_regions"):
            algo_settings["variant_regions"] = region_bed
    data["callable_bam"] = data["work_bam"]
    data = _recal_no_markduplicates(data)
    return [data]
Example #14
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                      config):
    """Produce a working BAM file for one lane of input reads.

    An existing ``fastq1`` is aligned with ``align_to_sort_bam`` when an
    aligner is configured. An existing ``.bam`` input is otherwise handled by
    the first applicable step: Picard sort (``bam_sort``), Picard cleaning
    (``bam_clean``), or linking into the ``prealign`` work directory.

    Returns a single-element list with a dictionary holding the input
    files, the working BAM, ``info``, the reference and the configuration.

    Raises:
        ValueError: if no BAM was produced and ``fastq1`` does not exist.
    """
    aligner = config["algorithm"].get("aligner", None)
    bam_path = ""
    read_groups = rg_names(lane_name, lane_desc, config)
    _, ref_file = get_genome_ref(info["genome_build"], None, dirs["galaxy"])

    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        aligned = align_to_sort_bam(fastq1, fastq2, read_groups,
                                    info["genome_build"], aligner,
                                    dirs, config)
        # Alignment also yields the reference that was used.
        bam_path, ref_file = aligned
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            base_name = os.path.splitext(os.path.basename(fastq1))[0]
            sorted_name = "{}-sort.bam".format(base_name)
            bam_path = runner.run_fn("picard_sort", fastq1, sort_method,
                                     os.path.join(dirs["work"], sorted_name))
        elif bamclean is True or bamclean == "picard":
            bam_path = cleanbam.picard_prep(fastq1, read_groups, ref_file,
                                            dirs, config)
        else:
            target_dir = os.path.join(dirs["work"], "prealign",
                                      read_groups["sample"])
            bam_path = link_bam_file(fastq1, target_dir)

    if not bam_path and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)

    lane_data = {"fastq": [fastq1, fastq2],
                 "work_bam": bam_path,
                 "info": info,
                 "sam_ref": ref_file,
                 "config": config}
    return [lane_data]
Example #15
0
def main(input_path, genome, filter_file, read1, read2, filtered_reads, aligner, slurm_parameters):
    """Submit one filtering job per input read file (or read pair).

    When ``filter_file`` is ``None`` it is looked up with ``get_genome_ref``
    for ``genome`` and ``aligner`` under ``REFERENCE_DIR``.

    When ``read1`` is ``None`` the inputs are discovered from ``input_path``:
    a directory is searched for ``*barcode/*_1_fastq.txt`` files, while a
    single file is treated as read one or read two according to its suffix
    and its mate is derived by swapping ``_1_fastq.txt``/``_2_fastq.txt``.
    A missing second read is passed on as ``None``. When ``read1`` is given,
    ``read1`` and ``read2`` are used as they are.

    Raises:
        AssertionError: if ``input_path`` is a second-read file and the
            matching first-read file does not exist.
    """
    if filter_file is None:
        filter_file, _ = get_genome_ref(genome, aligner, os.path.normpath(REFERENCE_DIR))

    infiles = []
    if read1 is None:
        if os.path.isdir(input_path):
            pat = os.path.join(input_path, "*barcode", "*_1_fastq.txt")
            for first in glob.glob(pat):
                second = first.replace("_1_fastq.txt", "_2_fastq.txt")
                if not os.path.exists(second):
                    second = None
                infiles.append([first, second])
        elif os.path.isfile(input_path):
            if input_path.endswith("_1_fastq.txt"):
                read1 = input_path
                read2 = read1.replace("_1_fastq.txt", "_2_fastq.txt")
                if not os.path.exists(read2):
                    read2 = None
            elif input_path.endswith("_2_fastq.txt"):
                read2 = input_path
                # read1 is None in this branch, so the first-read path has to
                # be derived from read2 (calling .replace on read1 would fail).
                read1 = read2.replace("_2_fastq.txt", "_1_fastq.txt")
                assert os.path.exists(read1), "ERROR: Could not find the first read file (expected %s)" % read1
            else:
                read1 = input_path
                read2 = None
            infiles.append([read1, read2])
    else:
        infiles.append([read1, read2])

    for first, second in infiles:
        jobid = filter_files_job(first, second, filtered_reads,
                                 filter_file, aligner,
                                 slurm_parameters)
        # A single parenthesised argument prints the same on Python 2 and 3.
        print("Your job was submitted with jobid %s" % jobid)