Example #1
0
def vc_output_record(samples):
    """Build per-sample output records from variant calling for downstream steps.

    Reformats the inputs into generated record dictionaries. Keys computed
    once per batch (the batch variant call file and validation counts) are
    propagated onto every sample in the batch.
    """
    shared_keys = [["vrn_file"], ["validate", "summary"], ["validate", "tp"],
                   ["validate", "fp"], ["validate", "fn"]]
    raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
    shared = {}
    for key_path in shared_keys:
        # Collect the distinct non-empty values for this key across the batch.
        cur = list(set(v for v in (tz.get_in(key_path, rec) for rec in raw) if v))
        if not cur:
            shared[tuple(key_path)] = None
        else:
            # A batch-level key must resolve to exactly one value.
            assert len(cur) == 1, (key_path, cur)
            shared[tuple(key_path)] = cur[0]
    out = []
    for rec in raw:
        for skey, sval in shared.items():
            # update_in invokes the setter immediately, so binding sval here is safe.
            rec = tz.update_in(rec, skey, lambda _old, v=sval: v)
        out.append([rec])
    return out
Example #2
0
def batch_for_variantcall(samples):
    """Group samples into batch/variant-caller units for parallel calling.

    CWL input target: pairs each sample with its batch and variant caller
    so calling can proceed in parallel.

    With joint calling enabled (`tools_on: [gvcf]`), samples are emitted
    individually rather than combined into a batch.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(x) for x in to_process]
    grouped = collections.defaultdict(list)
    for rec in cwlutils.samples_to_records(singles):
        caller = get_variantcaller(rec, require_bam=False)
        batch_ids = dd.get_batches(rec) or dd.get_sample_name(rec)
        if not isinstance(batch_ids, (list, tuple)):
            batch_ids = [batch_ids]
        for batch_id in batch_ids:
            grouped[(batch_id, caller)].append(utils.deepish_copy(rec))
    out = []
    for group in grouped.values():
        if any(is_joint(rec) for rec in group):
            # Joint calling: each sample becomes its own work unit.
            out.extend([rec] for rec in group)
        else:
            out.append(group)
    return out + extras
Example #3
0
def vc_output_record(samples):
    """Assemble variant calling output records to feed downstream analysis.

    Performs the reformatting prep work and returns generated dictionaries.
    Batch-level values (variant call file, validation summary/tp/fp/fn) are
    copied onto every sample record in the batch.
    """
    shared_keys = [["vrn_file"], ["validate", "summary"],
                   ["validate", "tp"], ["validate", "fp"], ["validate", "fn"]]
    raw = cwlutils.samples_to_records([utils.to_single_data(x) for x in samples])
    shared = {}
    for key_path in shared_keys:
        found = [tz.get_in(key_path, rec) for rec in raw]
        unique = list(set(v for v in found if v))
        if len(unique) == 0:
            shared[tuple(key_path)] = None
        else:
            # Batch-shared keys must agree across all records.
            assert len(unique) == 1, (key_path, unique)
            shared[tuple(key_path)] = unique[0]
    out = []
    for rec in raw:
        for skey, sval in shared.items():
            # The setter runs immediately inside update_in, so sval is bound correctly.
            rec = tz.update_in(rec, skey, lambda _old, v=sval: v)
        out.append([rec])
    return out
def batch_for_variantcall(samples):
    """Organize samples into parallel variant calling batches.

    CWL input target grouping samples by (batch, variant caller) for
    parallel processing.

    When joint calling is requested (`tools_on: [gvcf]`), each sample is
    split out individually instead of being batched together.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    to_process = [utils.to_single_data(x) for x in to_process]
    by_batch = collections.defaultdict(list)
    for rec in cwlutils.samples_to_records(to_process):
        caller = get_variantcaller(rec, require_bam=False)
        ids = dd.get_batches(rec) or dd.get_sample_name(rec)
        ids = ids if isinstance(ids, (list, tuple)) else [ids]
        for cur_id in ids:
            by_batch[(cur_id, caller)].append(utils.deepish_copy(rec))
    result = []
    for group in by_batch.values():
        joint = any(is_joint(rec) for rec in group)
        if not joint:
            result.append(group)
        else:
            # Joint calling: emit one work unit per sample.
            for rec in group:
                result.append([rec])
    return result + extras
Example #5
0
def qc_to_rec(samples):
    """CWL: Turn input samples into records ready for parallel QC."""
    unpacked = [utils.to_single_data(x) for x in samples]
    unpacked = cwlutils.assign_complex_to_samples(unpacked)
    to_analyze, extras = _split_samples_by_qc(unpacked)
    # Recombine analyzed and pass-through samples into one record list.
    combined = [utils.to_single_data(x) for x in to_analyze + extras]
    return [[rec] for rec in cwlutils.samples_to_records(combined)]
def qc_to_rec(samples):
    """CWL: Convert input samples into per-sample records for parallelization."""
    singles = [utils.to_single_data(x) for x in samples]
    singles = cwlutils.assign_complex_to_samples(singles)
    to_analyze, extras = _split_samples_by_qc(singles)
    everything = to_analyze + extras
    recs = cwlutils.samples_to_records([utils.to_single_data(x) for x in everything])
    out = []
    for rec in recs:
        out.append([rec])
    return out
Example #7
0
def batch_for_sv(samples):
    """Prepare a set of samples for parallel structural variant calling.

    CWL input target -- groups samples into batches and structural variant
    callers for parallel processing.
    """
    # Background samples from the split are not needed for standard batching;
    # the underscore prefix marks the intentionally unused value.
    to_process, extras, _background = _batch_split_by_sv(samples, "standard")
    out = [cwlutils.samples_to_records(xs) for xs in to_process.values()] + extras
    return out
Example #8
0
def batch_for_sv(samples):
    """Group samples with their structural variant callers for parallel runs.

    CWL input target -- produces batched sample/caller units for parallel
    structural variant processing.
    """
    to_process, extras, background = _batch_split_by_sv(samples, "standard")
    batched = []
    for group in to_process.values():
        batched.append(cwlutils.samples_to_records(group))
    return batched + extras
Example #9
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares an rtg SDF format file with built-in indexes for retrieving
    sections of files.

    Retains back compatibility with the bgzip/grabix approach.

    Returns a list of [data] work units: one per read split when splitting is
    configured, otherwise a single unit; converted to CWL records when
    "output_cwl_keys" is present.
    """
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"])
             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"]
                or data["files"][0] is None or not aligner):
            return [[data]]
        # NOTE: local, non-CRAM files with an aligner fall through to the
        # indexing/preparation steps below.
    # Prefer grabix when indexes already exist; otherwise honor the
    # configured alignment preparation method ("rtg" or grabix-based).
    approach = "grabix" if _has_grabix_indices(
        data) else dd.get_align_prep_method(data)
    # Keep the original file paths before replacing with prepared inputs.
    data["files_orig"] = data["files"]
    if approach == "rtg":
        data["files"] = [rtg.to_sdf(data["files"], data)]
    else:
        data["files"] = _prep_grabix_indexes(data["files"], data["dirs"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        # Compute read splits with the tool matching the preparation approach.
        if approach == "rtg":
            splits = rtg.calculate_splits(
                data["files"][0],
                data["config"]["algorithm"]["align_split_size"])
        else:
            splits = _find_read_splits(
                data["files"][0],
                data["config"]["algorithm"]["align_split_size"])
        # One independent work unit per split; deep copy so units don't share state.
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        # Running under CWL: reshape work units into records with the split keys.
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #10
0
def batch_for_variantcall(samples):
    """Arrange samples into batch/caller groups for parallel variant calling.

    CWL input target that collects samples by batch and variant caller so
    they can be processed in parallel.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    singles = [utils.to_single_data(x) for x in to_process]
    grouped = collections.defaultdict(list)
    for rec in cwlutils.samples_to_records(singles):
        caller = get_variantcaller(rec, require_bam=False)
        ids = dd.get_batches(rec) or dd.get_sample_name(rec)
        ids = ids if isinstance(ids, (list, tuple)) else [ids]
        for cur_id in ids:
            grouped[(cur_id, caller)].append(utils.deepish_copy(rec))
    return list(grouped.values()) + extras
Example #11
0
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a bgzip and grabix indexed file for retrieving sections
    of files.

    Returns a list of [data] work units: one per read split when splitting is
    configured, otherwise a single unit; converted to CWL records when
    "output_cwl_keys" is present.
    """
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and
            (_is_cram_input(data["files"])
             or objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"]
                or data["files"][0] is None or not aligner):
            return [[data]]
        # if this is a DRAGEN BAM, we need to do further alignments with this BAM, so don't convert it
        if dd.get_umi_type(data) == "dragen":
            return [[data]]
        # NOTE: local, non-CRAM inputs with an aligner fall through to the
        # preparation steps below.
    # Keep the original file paths before replacing with prepared fastqs.
    data["files_orig"] = data["files"]
    data["files"] = prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    # Build grabix indexes on the (possibly trimmed) prepared inputs.
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(
            data["files"][0],
            int(data["config"]["algorithm"]["align_split_size"]))
        # One independent work unit per split; deep copy so units don't share state.
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        # Running under CWL: reshape work units into records with the split keys.
        out = cwlutils.samples_to_records(
            [utils.to_single_data(x) for x in out],
            ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #12
0
def batch_for_variantcall(samples):
    """Collect samples into batches paired with variant callers.

    CWL input target producing (batch, caller) groups for parallel
    variant calling.
    """
    to_process, extras = _dup_samples_by_variantcaller(samples, require_bam=False)
    to_process = [utils.to_single_data(x) for x in to_process]
    by_batch = collections.defaultdict(list)
    for rec in cwlutils.samples_to_records(to_process):
        caller = get_variantcaller(rec, require_bam=False)
        ids = dd.get_batches(rec) or dd.get_sample_name(rec)
        if not isinstance(ids, (list, tuple)):
            ids = [ids]
        for cur_id in ids:
            by_batch[(cur_id, caller)].append(utils.deepish_copy(rec))
    groups = list(by_batch.values())
    return groups + extras
def create_inputs(data):
    """Index input reads and prepare groups of reads to process concurrently.

    Allows parallelization of alignment beyond processors available on a single
    machine. Prepares a bgzip and grabix indexed file for retrieving sections
    of files.

    Returns a list of [data] work units: one per read split when splitting is
    configured, otherwise a single unit; converted to CWL records when
    "output_cwl_keys" is present.
    """
    from bcbio.pipeline import sample
    data = cwlutils.normalize_missing(data)
    aligner = tz.get_in(("config", "algorithm", "aligner"), data)
    # CRAM files must be converted to bgzipped fastq, unless not aligning.
    # Also need to prep and download remote files.
    if not ("files" in data and data["files"] and aligner and (_is_cram_input(data["files"]) or
                                                               objectstore.is_remote(data["files"][0]))):
        # skip indexing on samples without input files or not doing alignment
        if ("files" not in data or not data["files"] or data["files"][0] is None or not aligner):
            return [[data]]
        # NOTE: local, non-CRAM inputs with an aligner fall through to the
        # preparation steps below.
    # Keep the original file paths before replacing with prepared fastqs.
    data["files_orig"] = data["files"]
    data["files"] = _prep_fastq_inputs(data["files"], data)
    # preparation converts illumina into sanger format
    data["config"]["algorithm"]["quality_format"] = "standard"
    # Handle any necessary trimming
    data = utils.to_single_data(sample.trim_sample(data)[0])
    # Build grabix indexes on the (possibly trimmed) prepared inputs.
    _prep_grabix_indexes(data["files"], data)
    data = _set_align_split_size(data)
    out = []
    if tz.get_in(["config", "algorithm", "align_split_size"], data):
        splits = _find_read_splits(data["files"][0], data["config"]["algorithm"]["align_split_size"])
        # One independent work unit per split; deep copy so units don't share state.
        for split in splits:
            cur_data = copy.deepcopy(data)
            cur_data["align_split"] = split
            out.append([cur_data])
    else:
        out.append([data])
    if "output_cwl_keys" in data:
        # Running under CWL: reshape work units into records with the split keys.
        out = cwlutils.samples_to_records([utils.to_single_data(x) for x in out],
                                          ["files", "align_split", "config__algorithm__quality_format"])
    return out
Example #14
0
def qc_to_rec(samples):
    """CWL: Turn a set of input samples into records for parallel processing."""
    unpacked = [utils.to_single_data(x) for x in samples]
    recs = cwlutils.samples_to_records(unpacked)
    return [[rec] for rec in recs]