def run_sample(tumor, normal, work_dir, cromwell_dir, hla_fa):
    hla_fasta, hlas = prep_hla_ref(hla_fa, work_dir)
    hla_cromwell_dir = _get_cromwell_execution_dir(cromwell_dir, HLA_GLOB)
    sv_cromwell_dir = _get_cromwell_execution_dir(cromwell_dir, SV_GLOB)
    tumor_fastq, tumor_calls_orig = get_hla(tumor, hla_cromwell_dir, HLA_GLOB)
    tumor_bam = alignment.align_to_sort_bam(tumor_fastq, None, ALIGNER, get_data(tumor, hla_fasta, work_dir))["work_bam"]
    normal_fastq, normal_calls_orig = get_hla(normal, hla_cromwell_dir, HLA_GLOB)
    normal_bam = alignment.align_to_sort_bam(normal_fastq, None, ALIGNER, get_data(normal, hla_fasta, work_dir))["work_bam"]
    tumor_ploidy = prep_ploidy(work_dir, tumor, tumor_bam, sv_cromwell_dir, os.path.join(SV_GLOB, SVCALL_GLOB))
    tumor_calls = prep_hla(work_dir, tumor, normal_calls_orig, hlas, normal_bam, tumor_bam)
    normal_calls = prep_hla(work_dir, normal, normal_calls_orig, hlas, normal_bam, tumor_bam)

    bam_dir, normal_bam_ready = create_tumor_bamdir(tumor, tumor_bam, normal_bam, work_dir)
    out_dir = utils.safe_makedir(os.path.join(work_dir, tumor, "lohhla_out"))
    prep_bam_inputs(out_dir, tumor, tumor_calls, tumor_bam)
    prep_bam_inputs(out_dir, normal, normal_calls, normal_bam)
    lohhla_output = os.path.join(out_dir, "%s.%s.DNA.HLAlossPrediction_CI.xls" % (tumor, DEPTH_FILTER))
    cmd = ["Rscript", os.path.join(LOHHLA, "LOHHLAscript.R"),
           "--patientId", tumor, "--outputDir", out_dir,
           "--normalBAMfile", normal_bam_ready, "--BAMDir", bam_dir,
           "--hlaPath", normal_calls, "--HLAfastaLoc", hla_fasta,
           "--HLAexonLoc", os.path.join(LOHHLA, "data", "hla.dat"),
           "--CopyNumLoc", tumor_ploidy,
           "--mappingStep", "FALSE",
           "--minCoverageFilter", str(DEPTH_FILTER)]
    if not os.path.exists(lohhla_output):
        subprocess.check_call(cmd)
    compare_calls(tumor, lohhla_output, sv_cromwell_dir, os.path.join(SV_GLOB, SVCALL_GLOB))
Example #2
0
def process_alignment(fastq1, fastq2, genome_build, lane_name, sample, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    if os.path.exists(fastq1) and aligner:
        log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        align_to_sort_bam(fastq1, fastq2, genome_build, aligner,
                          lane_name, sample, dirs, config)
Example #3
0
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    fastq1, fastq2 = data["files"]
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, aligner, data)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], config)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        _check_prealigned_bam(fastq1, data["sam_ref"], config)
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    data["work_bam"] = out_bam
    return [[data]]
Example #4
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                      dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    names = rg_names(lane_name, lane_desc, config)
    _, ref_file = get_genome_ref(info["genome_build"], None, dirs["galaxy"])
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam, ref_file = align_to_sort_bam(fastq1, fastq2, names,
                                              info["genome_build"], aligner,
                                              dirs, config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(dirs["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, names, ref_file, dirs, config)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(dirs["work"], "prealign",
                                                         names["sample"]))
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    return [{"fastq": [fastq1, fastq2], "work_bam": out_bam, "info": info,
             "sam_ref": ref_file, "config": config}]
Example #5
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        log.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, info["genome_build"], aligner, lane_name, lane_desc, dirs, config)
    return [{"fastq": [fastq1, fastq2], "out_bam": out_bam, "info": info, "config": config}]
Example #6
0
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"], dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data), data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" % dd.get_sample_name(data))
    else:
        raise ValueError("Could not process input file from sample configuration. \n" +
                         fastq1 +
                         "\nIs the path to the file correct or is empty?\n" +
                         "If it is a fastq file (not pre-aligned BAM or CRAM), "
                         "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example #7
0
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    if "files" not in data:
        fastq1, fastq2 = None, None
    elif len(data["files"]) == 2:
        fastq1, fastq2 = data["files"]
    else:
        assert len(data["files"]) == 1, data["files"]
        fastq1, fastq2 = data["files"][0], None
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s" % sort_method
                )
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(os.path.splitext(os.path.basename(fastq1))[0])
            )
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign", data["rgnames"]["sample"]))
        bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        data["work_bam"] = dedup_bam
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    else:
        raise ValueError("Could not process input file: %s" % fastq1)
    return [[data]]
Example #8
0
def process_alignment(data):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    if "files" not in data:
        fastq1, fastq2 = None, None
    elif len(data["files"]) == 2:
        fastq1, fastq2 = data["files"]
    else:
        assert len(data["files"]) == 1, data["files"]
        fastq1, fastq2 = data["files"][0], None
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError("Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                                 % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], data["sam_ref"], data["dirs"],
                                           config)
        elif sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(data["dirs"]["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = link_bam_file(fastq1, os.path.join(data["dirs"]["work"], "prealign",
                                                         data["rgnames"]["sample"]))
        bam.check_header(out_bam, data["rgnames"], data["sam_ref"], data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        data["work_bam"] = dedup_bam
    elif fastq1 and os.path.exists(fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    else:
        raise ValueError("Could not process input file: %s" % fastq1)
    return [[data]]
Example #9
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                      config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam = align_to_sort_bam(fastq1, fastq2, info["genome_build"],
                                    aligner, lane_name, lane_desc, dirs,
                                    config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        out_bam = fastq1
    return [{
        "fastq": [fastq1, fastq2],
        "out_bam": out_bam,
        "info": info,
        "config": config
    }]
Example #10
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc,
                      dirs, config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam, ref_file = align_to_sort_bam(fastq1, fastq2, info["genome_build"], aligner,
                                              lane_name, lane_desc, dirs, config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(dirs["work"], "{}-sort.bam".format(
                os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method, out_file)
        else:
            out_bam = fastq1
    return [{"fastq": [fastq1, fastq2], "work_bam": out_bam, "info": info,
             "sam_ref": ref_file, "config": config}]
Example #11
0
def tiered_alignment(in_bam, tier_num, multi_mappers, extra_args,
                     genome_build, pair_stats,
                     work_dir, dirs, config):
    """Perform the alignment of non-mapped reads from previous tier.
    """
    nomap_fq1, nomap_fq2 = select_unaligned_read_pairs(in_bam, "tier{}".format(tier_num),
                                                       work_dir, config)
    if nomap_fq1 is not None:
        base_name = "{}-tier{}out".format(os.path.splitext(os.path.basename(in_bam))[0],
                                          tier_num)
        config = copy.deepcopy(config)
        dirs = copy.deepcopy(dirs)
        config["algorithm"]["bam_sort"] = "queryname"
        config["algorithm"]["multiple_mappers"] = multi_mappers
        config["algorithm"]["extra_align_args"] = ["-i", int(pair_stats["mean"]),
                                               int(pair_stats["std"])] + extra_args
        return align_to_sort_bam(nomap_fq1, nomap_fq2, genome_build, "novoalign",
                                 base_name, base_name,
                                 dirs, config, dir_ext=os.path.join("hydra", os.path.split(nomap_fq1)[0]))
    else:
        return None
Example #12
0
def process_alignment(fastq1, fastq2, info, lane_name, lane_desc, dirs,
                      config):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    aligner = config["algorithm"].get("aligner", None)
    out_bam = ""
    names = rg_names(lane_name, lane_desc, config)
    _, ref_file = get_genome_ref(info["genome_build"], None, dirs["galaxy"])
    if os.path.exists(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" % (lane_name, aligner))
        out_bam, ref_file = align_to_sort_bam(fastq1, fastq2, names,
                                              info["genome_build"], aligner,
                                              dirs, config)
    elif os.path.exists(fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if sort_method:
            runner = broad.runner_from_config(config)
            out_file = os.path.join(
                dirs["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        elif bamclean is True or bamclean == "picard":
            out_bam = cleanbam.picard_prep(fastq1, names, ref_file, dirs,
                                           config)
        else:
            out_bam = link_bam_file(
                fastq1, os.path.join(dirs["work"], "prealign",
                                     names["sample"]))
    if not out_bam and not os.path.exists(fastq1):
        raise ValueError("Could not find input file: %s" % fastq1)
    return [{
        "fastq": [fastq1, fastq2],
        "work_bam": out_bam,
        "info": info,
        "sam_ref": ref_file,
        "config": config
    }]
Example #13
0
def tiered_alignment(in_bam, tier_num, multi_mappers, extra_args,
                     genome_build, pair_stats,
                     work_dir, dirs, config):
    """Perform the alignment of non-mapped reads from previous tier.
    """
    nomap_fq1, nomap_fq2 = select_unaligned_read_pairs(in_bam, "tier{}".format(tier_num),
                                                       work_dir, config)
    if nomap_fq1 is not None:
        base_name = "{}-tier{}out".format(os.path.splitext(os.path.basename(in_bam))[0],
                                          tier_num)
        config = copy.deepcopy(config)
        dirs = copy.deepcopy(dirs)
        config["algorithm"]["bam_sort"] = "queryname"
        config["algorithm"]["multiple_mappers"] = multi_mappers
        config["algorithm"]["extra_align_args"] = ["-i", int(pair_stats["mean"]),
                                               int(pair_stats["std"])] + extra_args
        dirs["align"] = os.path.split(nomap_fq1)[0]
        return align_to_sort_bam(nomap_fq1, nomap_fq2, genome_build, "novoalign",
                                 base_name, base_name,
                                 dirs, config)
    else:
        return None
Example #14
0
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = cwlutils.normalize_missing(utils.to_single_data(data))
    data = cwlutils.unpack_tarballs(data, data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        if dd.get_correct_umis(data):
            data["work_bam"] = postalign.correct_umis(data)
        if dd.get_umi_consensus(data):
            data["umi_bam"] = dd.get_work_bam(data)
            if fastq2:
                f1, f2, avg_cov = postalign.umi_consensus(data)
                data["config"]["algorithm"]["rawumi_avg_cov"] = avg_cov
                del data["config"]["algorithm"]["umi_type"]
                data["config"]["algorithm"]["mark_duplicates"] = False
                data = align_to_sort_bam(f1, f2, aligner, data)
            else:
                raise ValueError(
                    "Single fastq input for UMI processing; fgbio needs paired reads: %s"
                    % dd.get_sample_name(data))
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            ref_file = dd.get_ref_file(data)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"], ref_file,
                                           data["dirs"], data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif bamclean == "remove_extracontigs":
            out_bam = cleanbam.remove_extracontigs(fastq1, data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            if not utils.file_exists(out_file):
                work_dir = utils.safe_makedir(
                    os.path.join(dd.get_work_dir(data), "bamclean",
                                 dd.get_sample_name(data)))
                out_file = os.path.join(
                    work_dir, "{}-sort.bam".format(dd.get_sample_name(data)))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = _link_bam_file(
                fastq1,
                os.path.join(dd.get_work_dir(data), "prealign",
                             dd.get_sample_name(data)), data)
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and not dd.get_aligner(data):
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    elif "kraken" in config["algorithm"]:  # kraken doesn's need bam
        pass
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct or is empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]
Example #15
0
def process_alignment(data, alt_input=None):
    """Do an alignment of fastq files, preparing a sorted BAM output file.
    """
    data = utils.to_single_data(data)
    fastq1, fastq2 = dd.get_input_sequence_files(data)
    if alt_input:
        fastq1, fastq2 = alt_input
    config = data["config"]
    aligner = config["algorithm"].get("aligner", None)
    if fastq1 and objectstore.file_exists_or_remote(fastq1) and aligner:
        logger.info("Aligning lane %s with %s aligner" %
                    (data["rgnames"]["lane"], aligner))
        data = align_to_sort_bam(fastq1, fastq2, aligner, data)
        data = _add_supplemental_bams(data)
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".bam"):
        sort_method = config["algorithm"].get("bam_sort")
        bamclean = config["algorithm"].get("bam_clean")
        if bamclean is True or bamclean == "picard":
            if sort_method and sort_method != "coordinate":
                raise ValueError(
                    "Cannot specify `bam_clean: picard` with `bam_sort` other than coordinate: %s"
                    % sort_method)
            out_bam = cleanbam.picard_prep(fastq1, data["rgnames"],
                                           dd.get_ref_file(data), data["dirs"],
                                           data)
        elif bamclean == "fixrg":
            out_bam = cleanbam.fixrg(fastq1, data["rgnames"],
                                     dd.get_ref_file(data), data["dirs"], data)
        elif sort_method:
            runner = broad.runner_from_path("picard", config)
            out_file = os.path.join(
                data["dirs"]["work"], "{}-sort.bam".format(
                    os.path.splitext(os.path.basename(fastq1))[0]))
            out_bam = runner.run_fn("picard_sort", fastq1, sort_method,
                                    out_file)
        else:
            out_bam = link_bam_file(
                fastq1,
                os.path.join(data["dirs"]["work"], "prealign",
                             data["rgnames"]["sample"]))
        bam.index(out_bam, data["config"])
        bam.check_header(out_bam, data["rgnames"], dd.get_ref_file(data),
                         data["config"])
        dedup_bam = postalign.dedup_bam(out_bam, data)
        bam.index(dedup_bam, data["config"])
        data["work_bam"] = dedup_bam
    elif fastq1 and objectstore.file_exists_or_remote(
            fastq1) and fastq1.endswith(".cram"):
        data["work_bam"] = fastq1
    elif fastq1 is None and "vrn_file" in data:
        data["config"]["algorithm"]["variantcaller"] = False
        data["work_bam"] = None
    elif not fastq1:
        raise ValueError("No 'files' specified for input sample: %s" %
                         dd.get_sample_name(data))
    else:
        raise ValueError(
            "Could not process input file from sample configuration. \n" +
            fastq1 + "\nIs the path to the file correct or is empty?\n" +
            "If it is a fastq file (not pre-aligned BAM or CRAM), "
            "is an aligner specified in the input configuration?")
    if data.get("work_bam"):
        # Add stable 'align_bam' target to use for retrieving raw alignment
        data["align_bam"] = data["work_bam"]
        data = _add_hla_files(data)
    return [[data]]