Example #1
0
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    Parameters
    ----------
    sample_paths : :obj:`list` of :obj:`str`
        Paths to individual GVCFs.
    out_file : :obj:`str`
        Path to final combined matrix table.
    tmp_path : :obj:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Partitioning with which to import GVCFs in first phase of combiner.
    header : :obj:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :obj:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :obj:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:obj:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    # Isolate this run's intermediates under a unique subdirectory so
    # concurrent or repeated runs sharing `tmp_path` cannot collide.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        # An external header supplies no per-file sample names, so an aligned
        # `sample_names` list (one name per input GVCF) is required.
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    # FIXME: this should be hl.default_reference().even_intervals_contig_boundary
    intervals = intervals or default_exome_intervals(reference_genome)

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    # Every input is (re)read once per phase, so total I/O work is
    # inputs x phases; used below for percentage progress messages.
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            # After the first phase, repartition based on the observed row
            # density of one intermediate table to hit `target_records`.
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    # First phase reads raw GVCFs; later phases read the
                    # intermediate matrix tables written by the prior phase.
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[
                                sample_names[i] for i in merge.inputs
                            ] if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            # Zero-pad intermediate file names so lexicographic order matches
            # merge order (write_matrix_tables names outputs this way).
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
Example #2
0
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism per file,
    is determined the four parameters below. One of these parameters must be passed to
    this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size for whole
      genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size for exome
      GVCFs.

    It is recommended that new users include either `use_genome_default_intervals` or
    `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does not depend
    the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:class:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    # Isolate this run's intermediates under a unique subdirectory so
    # concurrent or repeated runs sharing `tmp_path` cannot collide.
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        # An external header supplies no per-file sample names, so an aligned
        # `sample_names` list (one name per input GVCF) is required.
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    # Exactly one of the four partitioning arguments should be supplied;
    # count how many the caller passed so we can error or warn accordingly.
    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'run_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )
    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored."
        )

    # Resolve the partitioning in priority order: explicit intervals, then
    # user-specified size, then genome default, then exome default.
    if intervals is not None:
        info(
            f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import"
        )
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")

    assert intervals is not None

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    # Every input is (re)read once per phase, so total I/O work is
    # inputs x phases; used below for percentage progress messages.
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            # After the first phase, repartition based on the observed row
            # density of one intermediate table to hit `target_records`.
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    # First phase reads raw GVCFs; later phases read the
                    # intermediate matrix tables written by the prior phase.
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[[sample_names[i]]
                                                  for i in merge.inputs]
                            if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            # Zero-pad intermediate file names so lexicographic order matches
            # merge order (write_matrix_tables names outputs this way).
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")