def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
    """Read a PLINK .fam file and return a pedigree object.

    Examples
    --------
    >>> ped = hl.Pedigree.read('data/test.fam')

    Notes
    -----
    See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
    the required format.

    Parameters
    ----------
    fam_path : :class:`str`
        Path to .fam file.
    delimiter : :class:`str`
        Field delimiter.

    Returns
    -------
    :class:`.Pedigree`
    """
    trios = []
    missing_sex_count = 0
    missing_sex_values = set()
    with Env.fs().open(fam_path) as file:
        for line in file:
            split_line = re.split(delimiter, line.strip())
            num_fields = len(split_line)
            if num_fields != 6:
                raise FatalError(
                    "Require 6 fields per line in .fam, but this line has {}: {}"
                    .format(num_fields, line))
            (fam, kid, dad, mom, sex, _) = tuple(split_line)
            # 1 is male, 2 is female, 0 is unknown.
            is_female = sex == "2" if sex == "1" or sex == "2" else None
            if is_female is None:
                missing_sex_count += 1
                missing_sex_values.add(kid)
            trio = Trio(kid,
                        fam if fam != "0" else None,
                        dad if dad != "0" else None,
                        mom if mom != "0" else None,
                        is_female)
            trios.append(trio)

    only_ids = [trio.s for trio in trios]
    duplicate_ids = [id for id, count in Counter(only_ids).items() if count > 1]
    if duplicate_ids:
        raise FatalError(
            "Invalid pedigree: found duplicate proband IDs\n{}".format(duplicate_ids))

    if missing_sex_count > 0:
        warning("Found {} samples with missing sex information (not 1 or 2).\n"
                "  Missing samples: [{}]"
                .format(missing_sex_count, ', '.join(sorted(missing_sex_values))))

    return Pedigree(trios)
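
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the Hail source): reading a .fam
# file and walking the resulting trios. The path below is hypothetical; any
# whitespace-delimited, six-field PLINK .fam file works.
# ---------------------------------------------------------------------------
def _example_read_pedigree():
    import hail as hl

    # 'data/trios.fam' is a hypothetical path.
    ped = hl.Pedigree.read('data/trios.fam')

    # Each trio records the proband, its parents (None where coded "0"),
    # and sex (None where the sex field is not 1 or 2).
    for trio in ped.trios:
        print(trio.s, trio.pat_id, trio.mat_id, trio.is_female)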
def load(path) -> 'VariantDatasetCombiner':
    fs = hl.current_backend().fs
    with fs.open(path) as stream:
        combiner = json.load(stream, cls=Decoder)
        if combiner.save_path != path:
            warning('path/save_path mismatch in loaded VariantDatasetCombiner, using '
                    f'{path} as the new save_path for this combiner')
            combiner.save_path = path
        return combiner
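
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): resuming a combine from a previously saved
# plan via the public wrapper hl.vds.load_combiner. The path is hypothetical.
# If the plan file was moved, load() rewrites save_path so future checkpoints
# land at the new location rather than the original one.
# ---------------------------------------------------------------------------
def _example_load_combiner_plan():
    import hail as hl

    combiner = hl.vds.load_combiner('gs://my-bucket/combiner-plans/plan.json')
    combiner.run()  # resumes from the state recorded in the plan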
def gvcf_batch_size(self, value: int):
    if value * len(self.gvcf_import_intervals) > VariantDatasetCombiner.gvcf_merge_task_limit:
        old_value = value
        value = VariantDatasetCombiner.gvcf_merge_task_limit // len(self.gvcf_import_intervals)
        warning(f'gvcf_batch_size of {old_value} would produce too many tasks; '
                f'using {value} instead')
    self._gvcf_batch_size = value
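
# ---------------------------------------------------------------------------
# Worked example of the clamping rule above (standalone sketch; the task
# limit and interval count are illustrative stand-ins, not Hail's actual
# class constants).
# ---------------------------------------------------------------------------
def _example_gvcf_batch_size_clamp():
    task_limit = 150_000   # stand-in for VariantDatasetCombiner.gvcf_merge_task_limit
    n_intervals = 2_000    # stand-in for len(self.gvcf_import_intervals)
    requested = 100        # requested gvcf_batch_size

    # 100 * 2_000 = 200_000 tasks > 150_000, so the setter clamps the value:
    if requested * n_intervals > task_limit:
        requested = task_limit // n_intervals
    assert requested == 75  # 150_000 // 2_000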
def make_agg(self, mapping, precomputed):
    grouping_variables = {
        aes_key: mapping[aes_key]
        for aes_key in mapping.keys()
        if should_use_for_grouping(aes_key, mapping[aes_key].dtype)
    }
    start = self.min_val if self.min_val is not None else precomputed.min_val
    end = self.max_val if self.max_val is not None else precomputed.max_val
    if self.bins is None:
        warning(f"No number of bins was specified for geom_histogram, "
                f"defaulting to {self.DEFAULT_BINS} bins")
        bins = self.DEFAULT_BINS
    else:
        bins = self.bins
    return hl.agg.group_by(hl.struct(**grouping_variables),
                           hl.agg.hist(mapping["x"], start, end, bins))
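
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): passing `bins` explicitly to geom_histogram
# avoids the default-bin warning emitted above. Assumes Hail's ggplot module
# (hail.ggplot) and an initialized Hail session.
# ---------------------------------------------------------------------------
def _example_geom_histogram():
    import hail as hl
    from hail.ggplot import ggplot, aes, geom_histogram

    ht = hl.utils.range_table(1000)
    ht = ht.annotate(x=hl.rand_norm(0, 1))

    fig = ggplot(ht, aes(x=ht.x)) + geom_histogram(bins=30)
    fig.show()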
def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombiner]:
    if force:
        return None
    fs = hl.current_backend().fs
    if fs.exists(save_path):
        try:
            combiner = load_combiner(save_path)
            warning(f'found existing combiner plan at {save_path}, using it')
            # we overwrite these values as they are serialized, but not part of the
            # hash for an autogenerated name and we want users to be able to overwrite
            # these when resuming a combine (a common reason to need to resume a combine
            # is a failure due to branch factor being too large)
            combiner.branch_factor = branch_factor
            combiner.target_records = target_records
            combiner.gvcf_batch_size = batch_size
            return combiner
        except (ValueError, TypeError, OSError, KeyError):
            warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
    return None
def hail_metadata(t_path):
    """Create a metadata plot for a Hail Table or MatrixTable.

    Parameters
    ----------
    t_path : str
        Path to the Hail Table or MatrixTable files.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` or :class:`bokeh.models.widgets.panels.Tabs` or :class:`bokeh.models.layouts.Column`
    """
    def get_rows_data(rows_files):
        file_sizes = []
        partition_bounds = []
        parts_file = [x['path'] for x in rows_files if x['path'].endswith('parts')]
        if parts_file:
            parts = hadoop_ls(parts_file[0])
            for i, x in enumerate(parts):
                index = x['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                if i < len(parts) - 1:
                    test_index = parts[i + 1]['path'].split(f'{parts_file[0]}/part-')[1].split('-')[0]
                    if test_index == index:
                        continue
                file_sizes.append(x['size_bytes'])
        metadata_file = [x['path'] for x in rows_files if x['path'].endswith('metadata.json.gz')]
        if metadata_file:
            with hadoop_open(metadata_file[0], 'rb') as f:
                rows_meta = json.loads(f.read())
                try:
                    partition_bounds = [(x['start']['locus']['contig'],
                                         x['start']['locus']['position'],
                                         x['end']['locus']['contig'],
                                         x['end']['locus']['position'])
                                        for x in rows_meta['jRangeBounds']]
                except KeyError:
                    pass
        return partition_bounds, file_sizes

    def scale_file_sizes(file_sizes):
        min_file_size = min(file_sizes) * 1.1
        total_file_size = sum(file_sizes)
        all_scales = [('T', 1e12), ('G', 1e9), ('M', 1e6), ('K', 1e3), ('', 1e0)]
        for overall_scale, overall_factor in all_scales:
            if total_file_size > overall_factor:
                total_file_size /= overall_factor
                break
        for scale, factor in all_scales:
            if min_file_size > factor:
                file_sizes = [x / factor for x in file_sizes]
                break
        total_file_size = f'{total_file_size:.1f} {overall_scale}B'
        return total_file_size, file_sizes, scale

    files = hadoop_ls(t_path)

    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    success_file = [x['modification_time'] for x in files if x['path'].endswith('SUCCESS')]

    metadata_file = [x['path'] for x in files if x['path'].endswith('metadata.json.gz')]
    if not metadata_file:
        raise FileNotFoundError('No metadata.json.gz file found.')

    with hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts']['counts']

    if not rows_file:
        raise FileNotFoundError('No rows directory found.')
    rows_files = hadoop_ls(rows_file[0])

    data_type = 'Table'
    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [x['path'] for x in rows_files if x['path'].endswith('rows')]
        rows_files = hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(row_file_sizes)

    panel_size = 480
    subpanel_size = 120

    if not row_partition_bounds:
        warning('Table is not partitioned. Only plotting file sizes.')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths': [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds': [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': ['Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
                             for x in row_partition_bounds],
        'row_file_sizes': row_file_sizes,
        'row_file_sizes_human': [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition': rows_per_partition,
        'index': list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hadoop_ls(entries_file[0])
        entries_rows_file = [x['path'] for x in entries_rows_files if x['path'].endswith('rows')]
        if entries_rows_file:
            entries_files = hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            # Label the entry file sizes, not the row file sizes.
            all_data['entry_file_sizes_human'] = [f'{x:.1f} {entry_scale}B' for x in entry_file_sizes]

    title = f'{data_type}: {t_path}'

    msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
           f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
           f"Size: {total_file_size}<br/>")
    if success_file and success_file[0]:
        msg += success_file[0]

    tools = "hover,save,pan,box_zoom,reset,wheel_zoom"

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition', 'row_file_sizes', color=color_map,
              legend='spans_chromosome', source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in ('rows_per_partition', 'row_file_sizes_human',
                               'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                              left=rows_per_partition_edges[:-1],
                              right=rows_per_partition_edges[1:],
                              fill_color="#036564", line_color="#033649")

    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist, left=0,
                     bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                     fill_color="#036564", line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'

        msg = (f"Rows: {sum(all_data['rows_per_partition']):,}<br/>"
               f"Partitions: {len(all_data['rows_per_partition']):,}<br/>"
               f"Size: {total_entry_file_size}<br/>")
        if success_file and success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome', palette=Spectral8,
                                factors=list(set(all_data['spans_chromosome'])))
        p.scatter('rows_per_partition', 'entry_file_sizes', color=color_map,
                  legend='spans_chromosome', source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}') for x in ('rows_per_partition', 'entry_file_sizes_human',
                                   'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range, plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist, bottom=0,
                                  left=rows_per_partition_edges[:-1],
                                  right=rows_per_partition_edges[1:],
                                  fill_color="#036564", line_color="#033649")
        p_file_size = figure(y_range=p.y_range, plot_width=subpanel_size,
                             plot_height=panel_size)

        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist, left=0,
                         bottom=row_file_sizes_edges[:-1], top=row_file_sizes_edges[1:],
                         fill_color="#036564", line_color="#033649")

        entries_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

        return Tabs(tabs=[Panel(child=entries_grid, title='Entries'),
                          Panel(child=rows_grid, title='Rows')])
    else:
        return rows_grid
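
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): rendering the metadata plot for a written
# table. The path is hypothetical; for a MatrixTable the function returns
# bokeh Tabs with Rows and Entries panels, for a Table a single layout.
# ---------------------------------------------------------------------------
def _example_hail_metadata():
    import hail as hl
    from bokeh.io import show

    p = hl.plot.hail_metadata('data/my_table.ht')  # hypothetical path
    show(p)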
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=0, branching_factor=50, tmp_dir=None,
         default_reference='GRCh37', idempotent=False,
         global_seed=6348563392232659379,
         spark_conf=None, skip_logging_configuration=False,
         local_tmpdir=None, _optimizer_iterations=None):
    """Initialize Hail and Spark.

    Examples
    --------
    Import and initialize Hail using GRCh38 as the default reference genome:

    >>> import hail as hl
    >>> hl.init(default_reference='GRCh38')  # doctest: +SKIP

    Notes
    -----
    Hail is not only a Python library; most of Hail is written in Java/Scala
    and runs together with Apache Spark in the Java Virtual Machine (JVM). In
    order to use Hail, a JVM needs to run as well. The :func:`.init` function
    is used to initialize Hail and Spark.

    This function also sets global configuration parameters used for the Hail
    session, like the default reference genome and log file location.

    This function will be called automatically (with default parameters) if
    any Hail functionality requiring the backend (most of the library!) is
    used. To initialize Hail explicitly with non-default arguments, be sure to
    do so directly after importing the module, as in the above example.

    To facilitate the migration from Spark to the ServiceBackend, this method
    calls init_service when the environment variable HAIL_QUERY_BACKEND is set
    to "service".

    Note
    ----
    If a :class:`pyspark.SparkContext` is already running, then Hail must be
    initialized with it as an argument:

    >>> hl.init(sc=sc)  # doctest: +SKIP

    See Also
    --------
    :func:`.stop`

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :class:`str`
        Spark application name.
    master : :class:`str`, optional
        URL identifying the Spark leader (master) node or `local[N]` for local
        clusters.
    local : :class:`str`
        Local-mode core limit indicator. Must either be `local[N]` where N is a
        positive integer or `local[*]`. The latter indicates Spark should use
        all cores available. `local[*]` does not respect most containerization
        CPU limits. This option is only used if `master` is unset and
        `spark.master` is not set in the Spark configuration.
    log : :class:`str`
        Local path for Hail log file. Does not currently support distributed
        file systems like Google Storage, S3, or HDFS.
    quiet : :obj:`bool`
        Print fewer log messages.
    append : :obj:`bool`
        Append to the end of the log file.
    min_block_size : :obj:`int`
        Minimum file block size in MB.
    branching_factor : :obj:`int`
        Branching factor for tree aggregation.
    tmp_dir : :class:`str`, optional
        Networked temporary directory. Must be a network-visible file path.
        Defaults to /tmp in the default scheme.
    default_reference : :class:`str`
        Default reference genome. Either ``'GRCh37'``, ``'GRCh38'``,
        ``'GRCm38'``, or ``'CanFam3'``.
    idempotent : :obj:`bool`
        If ``True``, calling this function is a no-op if Hail has already been
        initialized.
    global_seed : :obj:`int`, optional
        Global random seed.
    spark_conf : :obj:`dict` of :class:`str` to :class:`str`, optional
        Spark configuration parameters.
    skip_logging_configuration : :obj:`bool`
        Skip logging configuration in java and python.
    local_tmpdir : :class:`str`, optional
        Local temporary directory. Used on driver and executor nodes. Must use
        the file scheme. Defaults to TMPDIR, or /tmp.
    """
    if Env._hc:
        if idempotent:
            return
        else:
            warning('Hail has already been initialized. If this call was intended to '
                    'change configuration, close the session with hl.stop() first.')

    if os.environ.get('HAIL_QUERY_BACKEND') == 'service':
        import asyncio
        # NB: do not use warning because that will initialize Env._hc, which we are
        # trying to do right now.
        print('When using the query service backend, use `await init_service`',
              file=sys.stderr)
        return asyncio.get_event_loop().run_until_complete(init_service(
            log=log,
            quiet=quiet,
            append=append,
            tmpdir=tmp_dir,
            local_tmpdir=local_tmpdir,
            default_reference=default_reference,
            global_seed=global_seed,
            skip_logging_configuration=skip_logging_configuration))

    from hail.backend.spark_backend import SparkBackend

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmp_dir)
    local_tmpdir = _get_local_tmpdir(local_tmpdir)
    optimizer_iterations = get_env_or_default(_optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    backend = SparkBackend(
        idempotent, sc, spark_conf, app_name, master, local, log,
        quiet, append, min_block_size, branching_factor, tmpdir, local_tmpdir,
        skip_logging_configuration, optimizer_iterations)

    if not backend.fs.exists(tmpdir):
        backend.fs.mkdir(tmpdir)

    HailContext.create(log, quiet, append, tmpdir, local_tmpdir,
                       default_reference, global_seed, backend)
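
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): explicit initialization with non-default
# settings. The tmp_dir path is hypothetical; the spark_conf key is a
# standard Spark property, shown only as an illustration.
# ---------------------------------------------------------------------------
def _example_explicit_init():
    import hail as hl

    hl.init(
        default_reference='GRCh38',
        tmp_dir='gs://my-bucket/hail-tmp',            # hypothetical network-visible path
        spark_conf={'spark.executor.memory': '4g'},
    )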
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=0, branching_factor=50, tmp_dir='/tmp',
         default_reference='GRCh37', idempotent=False,
         global_seed=6348563392232659379,
         spark_conf=None, skip_logging_configuration=False,
         local_tmpdir=None, _optimizer_iterations=None):
    """Initialize Hail and Spark.

    Examples
    --------
    Import and initialize Hail using GRCh38 as the default reference genome:

    >>> import hail as hl
    >>> hl.init(default_reference='GRCh38')  # doctest: +SKIP

    Notes
    -----
    Hail is not only a Python library; most of Hail is written in Java/Scala
    and runs together with Apache Spark in the Java Virtual Machine (JVM). In
    order to use Hail, a JVM needs to run as well. The :func:`.init` function
    is used to initialize Hail and Spark.

    This function also sets global configuration parameters used for the Hail
    session, like the default reference genome and log file location.

    This function will be called automatically (with default parameters) if
    any Hail functionality requiring the backend (most of the library!) is
    used. To initialize Hail explicitly with non-default arguments, be sure to
    do so directly after importing the module, as in the above example.

    Note
    ----
    If a :class:`pyspark.SparkContext` is already running, then Hail must be
    initialized with it as an argument:

    >>> hl.init(sc=sc)  # doctest: +SKIP

    See Also
    --------
    :func:`.stop`

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :obj:`str`
        Spark application name.
    master : :obj:`str`, optional
        Spark master.
    local : :obj:`str`
        Local-mode master, used if `master` is not defined here or in the
        Spark configuration.
    log : :obj:`str`
        Local path for Hail log file. Does not currently support distributed
        file systems like Google Storage, S3, or HDFS.
    quiet : :obj:`bool`
        Print fewer log messages.
    append : :obj:`bool`
        Append to the end of the log file.
    min_block_size : :obj:`int`
        Minimum file block size in MB.
    branching_factor : :obj:`int`
        Branching factor for tree aggregation.
    tmp_dir : :obj:`str`, optional
        Networked temporary directory. Must be a network-visible file path.
        Defaults to /tmp in the default scheme.
    default_reference : :obj:`str`
        Default reference genome. Either ``'GRCh37'``, ``'GRCh38'``,
        ``'GRCm38'``, or ``'CanFam3'``.
    idempotent : :obj:`bool`
        If ``True``, calling this function is a no-op if Hail has already been
        initialized.
    global_seed : :obj:`int`, optional
        Global random seed.
    spark_conf : :obj:`dict[str, str]`, optional
        Spark configuration parameters.
    skip_logging_configuration : :obj:`bool`
        Skip logging configuration in java and python.
    local_tmpdir : :obj:`str`, optional
        Local temporary directory. Used on driver and executor nodes. Must use
        the file scheme. Defaults to TMPDIR, or /tmp.
    """
    from hail.backend.spark_backend import SparkBackend

    if Env._hc:
        if idempotent:
            return
        else:
            warning('Hail has already been initialized. If this call was intended to '
                    'change configuration, close the session with hl.stop() first.')

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmp_dir)
    local_tmpdir = _get_local_tmpdir(local_tmpdir)
    optimizer_iterations = get_env_or_default(_optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    backend = SparkBackend(
        idempotent, sc, spark_conf, app_name, master, local, log,
        quiet, append, min_block_size, branching_factor, tmpdir, local_tmpdir,
        skip_logging_configuration, optimizer_iterations)

    HailContext(log, quiet, append, tmp_dir, local_tmpdir,
                default_reference, global_seed, backend)
def new_combiner(*,
                 output_path: str,
                 temp_path: str,
                 save_path: Optional[str] = None,
                 gvcf_paths: Optional[List[str]] = None,
                 vds_paths: Optional[List[str]] = None,
                 vds_sample_counts: Optional[List[int]] = None,
                 intervals: Optional[List[Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 gvcf_external_header: Optional[str] = None,
                 gvcf_sample_names: Optional[List[str]] = None,
                 gvcf_info_to_keep: Optional[Collection[str]] = None,
                 gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
                 branch_factor: int = VariantDatasetCombiner.default_branch_factor,
                 target_records: int = VariantDatasetCombiner.default_target_records,
                 batch_size: int = VariantDatasetCombiner.default_gvcf_batch_size,
                 reference_genome: Union[str, hl.ReferenceGenome] = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 force: bool = False,
                 ) -> VariantDatasetCombiner:
    if not (gvcf_paths or vds_paths):
        raise ValueError("at least one of 'gvcf_paths' or 'vds_paths' must be nonempty")
    if gvcf_paths is None:
        gvcf_paths = []
    if vds_paths is None:
        vds_paths = []
    if vds_sample_counts is not None and len(vds_paths) != len(vds_sample_counts):
        raise ValueError(
            "'vds_paths' and 'vds_sample_counts' (if present) must have the same length "
            f'{len(vds_paths)} != {len(vds_sample_counts)}')
    if (gvcf_sample_names is None) != (gvcf_external_header is None):
        raise ValueError("both 'gvcf_sample_names' and 'gvcf_external_header' must be set or unset")
    if gvcf_sample_names is not None and len(gvcf_sample_names) != len(gvcf_paths):
        raise ValueError(
            "'gvcf_sample_names' and 'gvcf_paths' must have the same length "
            f'{len(gvcf_sample_names)} != {len(gvcf_paths)}')

    n_partition_args = (int(intervals is not None)
                        + int(import_interval_size is not None)
                        + int(use_genome_default_intervals)
                        + int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'new_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning")

    def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombiner]:
        if force:
            return None
        fs = hl.current_backend().fs
        if fs.exists(save_path):
            try:
                combiner = load_combiner(save_path)
                warning(f'found existing combiner plan at {save_path}, using it')
                # we overwrite these values as they are serialized, but not part of the
                # hash for an autogenerated name and we want users to be able to overwrite
                # these when resuming a combine (a common reason to need to resume a combine
                # is a failure due to branch factor being too large)
                combiner.branch_factor = branch_factor
                combiner.target_records = target_records
                combiner.gvcf_batch_size = batch_size
                return combiner
            except (ValueError, TypeError, OSError, KeyError):
                warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
        return None

    # We do the first save_path check now, after validating the arguments.
    if save_path is not None:
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner

    if n_partition_args > 1:
        warning(
            "'new_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored.")

    if intervals is not None:
        pass
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome, import_interval_size)
    elif use_genome_default_intervals:
        size = VariantDatasetCombiner.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    elif use_exome_default_intervals:
        size = VariantDatasetCombiner.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    assert intervals is not None

    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    if gvcf_reference_entry_fields_to_keep is None and vds_paths:
        vds = hl.vds.read_vds(vds_paths[0])
        gvcf_reference_entry_fields_to_keep = set(vds.reference_data.entry) - {'END'}
    elif gvcf_reference_entry_fields_to_keep is None and gvcf_paths:
        mt = hl.import_vcf(gvcf_paths[0], force_bgz=True, reference_genome=reference_genome)
        mt = mt.filter_rows(hl.is_defined(mt.info.END))
        gvcf_reference_entry_fields_to_keep = defined_entry_fields(mt, 100_000) - {'GT', 'PGT', 'PL'}

    if save_path is None:
        sha = hashlib.sha256()
        sha.update(output_path.encode())
        sha.update(temp_path.encode())
        sha.update(str(reference_genome).encode())
        for path in vds_paths:
            sha.update(path.encode())
        for path in gvcf_paths:
            sha.update(path.encode())
        if gvcf_external_header is not None:
            sha.update(gvcf_external_header.encode())
        if gvcf_sample_names is not None:
            for name in gvcf_sample_names:
                sha.update(name.encode())
        if gvcf_info_to_keep is not None:
            for kept_info in sorted(gvcf_info_to_keep):
                sha.update(kept_info.encode())
        if gvcf_reference_entry_fields_to_keep is not None:
            for field in sorted(gvcf_reference_entry_fields_to_keep):
                sha.update(field.encode())
        if contig_recoding is not None:
            for key, value in sorted(contig_recoding.items()):
                sha.update(key.encode())
                sha.update(value.encode())
        for interval in intervals:
            sha.update(str(interval).encode())
        digest = sha.hexdigest()
        name = f'vds-combiner-plan_{digest}_{hl.__pip_version__}.json'
        save_path = os.path.join(temp_path, 'combiner-plans', name)
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner
        else:
            warning(f'generated combiner save path of {save_path}')

    if vds_sample_counts:
        vdses = [VDSMetadata(path, n_samples)
                 for path, n_samples in zip(vds_paths, vds_sample_counts)]
    else:
        vdses = []
        for path in vds_paths:
            vds = hl.vds.read_vds(path)
            n_samples = vds.n_samples()
            vdses.append(VDSMetadata(path, n_samples))

    vdses.sort(key=lambda x: x.n_samples, reverse=True)

    return VariantDatasetCombiner(save_path=save_path,
                                  output_path=output_path,
                                  temp_path=temp_path,
                                  reference_genome=reference_genome,
                                  branch_factor=branch_factor,
                                  target_records=target_records,
                                  gvcf_batch_size=batch_size,
                                  contig_recoding=contig_recoding,
                                  vdses=vdses,
                                  gvcfs=gvcf_paths,
                                  gvcf_import_intervals=intervals,
                                  gvcf_external_header=gvcf_external_header,
                                  gvcf_sample_names=gvcf_sample_names,
                                  gvcf_info_to_keep=gvcf_info_to_keep,
                                  gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep)
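
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): building and running a combiner plan via the
# public wrapper hl.vds.new_combiner. All paths are hypothetical; the plan is
# checkpointed under temp_path, so re-running the same call resumes the
# combine rather than starting over.
# ---------------------------------------------------------------------------
def _example_new_combiner():
    import hail as hl

    combiner = hl.vds.new_combiner(
        output_path='gs://my-bucket/dataset.vds',
        temp_path='gs://my-bucket/hail-tmp',
        gvcf_paths=['gs://my-bucket/gvcfs/sample1.g.vcf.gz',
                    'gs://my-bucket/gvcfs/sample2.g.vcf.gz'],
        use_genome_default_intervals=True,
        reference_genome='GRCh38',
    )
    combiner.run()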
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a
    combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism
    per file, is determined by the four parameters below. One of these
    parameters must be passed to this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size for whole genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size for exome GVCFs.

    It is recommended that new users include either `use_genome_default_intervals`
    or `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does
    not depend on the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as GVCF header for all inputs. If defined,
        `sample_names` must be defined as well.
    sample_names : list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding : :obj:`dict` of (:class:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome. All contigs must be present in the `reference_genome`, so this
        is useful for mapping differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None
    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    n_partition_args = (int(intervals is not None)
                        + int(import_interval_size is not None)
                        + int(use_genome_default_intervals)
                        + int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'run_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning")
    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored.")

    if intervals is not None:
        info(f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import")
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome, import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")
    assert intervals is not None

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}.")

        if phase_i > 1:
            intervals = calculate_new_intervals(hl.read_matrix_table(files_to_merge[0]).rows(),
                                                config.target_records,
                                                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} "
                 f"to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O.")
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [transform_gvcf(vcf)
                           for vcf in hl.import_gvcfs(inputs, intervals,
                                                      array_elements_required=False,
                                                      _external_header=header,
                                                      _external_sample_ids=[[sample_names[i]] for i in merge.inputs]
                                                      if header is not None else None,
                                                      reference_genome=reference_genome,
                                                      contig_recoding=contig_recoding)]
                else:
                    mts = [hl.read_matrix_table(path, _intervals=intervals) for path in inputs]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'], is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished.")
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt' for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, "
                 f"{100 * total_work_done / total_ops:.1f}% of total I/O finished.")

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")
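
# ---------------------------------------------------------------------------
# Usage sketch (illustrative): invoking the sparse-MT combiner through its
# public entry point, hl.experimental.run_combiner. All paths are
# hypothetical; exactly one partitioning argument is required, and
# use_genome_default_intervals is the recommended choice for WGS data.
# ---------------------------------------------------------------------------
def _example_run_combiner():
    import hail as hl

    hl.experimental.run_combiner(
        ['gs://my-bucket/gvcfs/sample1.g.vcf.gz',
         'gs://my-bucket/gvcfs/sample2.g.vcf.gz'],
        out_file='gs://my-bucket/combined.mt',
        tmp_path='gs://my-bucket/hail-tmp',
        use_genome_default_intervals=True,
        reference_genome='GRCh38',
        overwrite=True,
    )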