def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a
    combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism
    per file, is determined by the four parameters below. One of these
    parameters must be passed to this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the
      genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size
      for whole-genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size
      for exome GVCFs.

    It is recommended that new users pass either `use_genome_default_intervals`
    or `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does
    not depend on the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as the GVCF header for all inputs. If
        defined, `sample_names` must be defined as well.
    sample_names : list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding : :obj:`dict` of (:class:`str`, :class:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome. All contigs must be present in the `reference_genome`, so this
        is useful for mapping differently-formatted data onto known
        references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None
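
    Examples
    --------
    A minimal sketch of a typical invocation (the bucket paths below are
    hypothetical placeholders)::

        sample_paths = ['gs://my-bucket/sample1.g.vcf.gz',
                        'gs://my-bucket/sample2.g.vcf.gz']
        hl.experimental.run_combiner(sample_paths,
                                     out_file='gs://my-bucket/dataset.mt',
                                     tmp_path='gs://my-bucket/tmp',
                                     reference_genome='GRCh38',
                                     use_genome_default_intervals=True)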
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    n_partition_args = (int(intervals is not None)
                        + int(import_interval_size is not None)
                        + int(use_genome_default_intervals)
                        + int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError("'run_combiner': require one argument from 'intervals', 'import_interval_size', "
                         "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning")
    if n_partition_args > 1:
        warning("'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
                "'use_genome_default_intervals', or 'use_exome_default_intervals'."
                "\n  The argument found first in the list in this warning will be used, and others ignored.")

    if intervals is not None:
        info(f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import")
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome, import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")
    assert intervals is not None
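
    # A rough sketch of the plan arithmetic (illustrative, not authoritative):
    # each phase merges files in groups of `branch_factor`, so `n` inputs need
    # about ceil(log(n) / log(branch_factor)) phases. With a branch factor of
    # 100, for example, 10,000 GVCFs combine in two phases: phase 1 produces
    # 100 intermediate matrix tables, and phase 2 merges those 100 into the
    # final output.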

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(f"Starting phase {phase_i}/{n_phases}, "
             f"merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}.")

        if phase_i > 1:
            intervals = calculate_new_intervals(hl.read_matrix_table(files_to_merge[0]).rows(),
                                                config.target_records,
                                                reference_genome=reference_genome)

        new_files_to_merge = []
        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} "
                 f"to create {n_merges} merged {merge_str}, "
                 f"corresponding to ~{pct_total:.1f}% of total I/O.")
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [transform_gvcf(vcf)
                           for vcf in hl.import_gvcfs(inputs,
                                                      intervals,
                                                      array_elements_required=False,
                                                      _external_header=header,
                                                      _external_sample_ids=[[sample_names[i]] for i in merge.inputs]
                                                      if header is not None else None,
                                                      reference_genome=reference_genome,
                                                      contig_recoding=contig_recoding)]
                else:
                    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in inputs]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts
                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(MatrixKeyRowsBy(final_mt._mir,
                                                           ['locus', 'alleles'],
                                                           is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, "
                     f"100% of total I/O finished.")
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt' for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, "
                 f"{100 * total_work_done / total_ops:.1f}% of total I/O finished.")

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
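
# A hedged sketch of consuming the combiner output (the path below is a
# hypothetical placeholder): the result is a sparse matrix table, which can be
# read back and densified into a dense, project-VCF-like representation:
#
#     mt = hl.read_matrix_table('gs://my-bucket/dataset.mt')
#     dense_mt = hl.experimental.densify(mt)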