Example #1
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        Notes
        -----

        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter.

        :rtype: :class:`.Pedigree`
        """

        trios = []
        missing_sex_count = 0
        missing_sex_values = set()
        with Env.fs().open(fam_path) as file:
            for line in file:
                split_line = re.split(delimiter, line.strip())
                num_fields = len(split_line)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}"
                        .format(num_fields, line))
                (fam, kid, dad, mom, sex, _) = tuple(split_line)
                # 1 is male, 2 is female, 0 is unknown.
                is_female = sex == "2" if sex == "1" or sex == "2" else None

                if is_female is None:
                    missing_sex_count += 1
                    missing_sex_values.add(kid)

                trio = Trio(kid, fam if fam != "0" else None,
                            dad if dad != "0" else None,
                            mom if mom != "0" else None, is_female)
                trios.append(trio)

        only_ids = [trio.s for trio in trios]
        duplicate_ids = [
            id for id, count in Counter(only_ids).items() if count > 1
        ]
        if duplicate_ids:
            raise FatalError(
                "Invalid pedigree: found duplicate proband IDs\n{}".format(
                    duplicate_ids))

        if missing_sex_count > 0:
            warning(
                "Found {} samples with missing sex information (not 1 or 2).\n Missing samples: [{}]"
                .format(missing_sex_count, missing_sex_values))

        return Pedigree(trios)
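A minimal usage sketch for the method above (the .fam path is hypothetical), assuming the standard Pedigree API: read the file, then inspect the trios it contains.

import hail as hl

ped = hl.Pedigree.read('data/myStudy.fam')   # hypothetical path
print(f'{len(ped.trios)} trios read from the .fam file')
complete = ped.complete_trios()              # trios with both parents defined
print(f'{len(complete)} trios have both parents present')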
    def load(path) -> 'VariantDatasetCombiner':
        fs = hl.current_backend().fs
        with fs.open(path) as stream:
            combiner = json.load(stream, cls=Decoder)
            if combiner.save_path != path:
                warning('path/save_path mismatch in loaded VariantDatasetCombiner, using '
                        f'{path} as the new save_path for this combiner')
                combiner.save_path = path
            return combiner
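This loader is typically reached through hl.vds.load_combiner; a brief sketch (path hypothetical) of resuming a previously saved combiner plan:

import hail as hl

combiner = hl.vds.load_combiner('gs://my-bucket/combiner-plans/plan.json')  # hypothetical path
combiner.run()  # continue the combine described by the saved plan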
    def gvcf_batch_size(self, value: int):
        if value * len(self.gvcf_import_intervals) > VariantDatasetCombiner.gvcf_merge_task_limit:
            old_value = value
            value = VariantDatasetCombiner.gvcf_merge_task_limit // len(self.gvcf_import_intervals)
            warning(f'gvcf_batch_size of {old_value} would produce too many tasks '
                    f'using {value} instead')
        self._gvcf_batch_size = value
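To make the clamping rule above concrete, here is the same arithmetic with hypothetical numbers: the setter caps the batch size so that batch size times the number of import intervals never exceeds the merge-task limit.

# All numbers are hypothetical, for illustration only.
gvcf_merge_task_limit = 150_000   # stand-in for VariantDatasetCombiner.gvcf_merge_task_limit
n_intervals = 5_000               # stand-in for len(self.gvcf_import_intervals)
requested_batch_size = 40

if requested_batch_size * n_intervals > gvcf_merge_task_limit:    # 200,000 > 150,000
    requested_batch_size = gvcf_merge_task_limit // n_intervals   # clamped to 30
print(requested_batch_size)  # 30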
Example #4
    def make_agg(self, mapping, precomputed):
        grouping_variables = {
            aes_key: mapping[aes_key]
            for aes_key in mapping.keys()
            if should_use_for_grouping(aes_key, mapping[aes_key].dtype)
        }

        start = self.min_val if self.min_val is not None else precomputed.min_val
        end = self.max_val if self.max_val is not None else precomputed.max_val
        if self.bins is None:
            warning(
                f"No number of bins was specfied for geom_histogram, defaulting to {self.DEFAULT_BINS} bins"
            )
            bins = self.DEFAULT_BINS
        else:
            bins = self.bins
        return hl.agg.group_by(hl.struct(**grouping_variables),
                               hl.agg.hist(mapping["x"], start, end, bins))
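To avoid the default-bins warning above, pass bins (and optionally min_val/max_val) explicitly to geom_histogram. A minimal sketch, assuming the hail.ggplot interface (ggplot, aes, geom_histogram) that the snippet above comes from; the table and field are made up for illustration:

import hail as hl
from hail.ggplot import ggplot, aes, geom_histogram

ht = hl.utils.range_table(1000)
ht = ht.annotate(x=hl.rand_norm(0, 1))

# Explicit bins (and range), so make_agg does not fall back to DEFAULT_BINS.
fig = ggplot(ht, aes(x=ht.x)) + geom_histogram(bins=30, min_val=-4, max_val=4)
fig.show()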
    def maybe_load_from_saved_path(save_path: str) -> Optional[VariantDatasetCombiner]:
        if force:
            return None
        fs = hl.current_backend().fs
        if fs.exists(save_path):
            try:
                combiner = load_combiner(save_path)
                warning(f'found existing combiner plan at {save_path}, using it')
                # we overwrite these values as they are serialized, but not part of the
                # hash for an autogenerated name and we want users to be able to overwrite
                # these when resuming a combine (a common reason to need to resume a combine
                # is a failure due to branch factor being too large)
                combiner.branch_factor = branch_factor
                combiner.target_records = target_records
                combiner.gvcf_batch_size = batch_size
                return combiner
            except (ValueError, TypeError, OSError, KeyError):
                warning(f'file exists at {save_path}, but it is not a valid combiner plan, overwriting')
        return None
Example #6
def hail_metadata(t_path):
    """Create a metadata plot for a Hail Table or MatrixTable.

    Parameters
    ----------
    t_path : str
        Path to the Hail Table or MatrixTable files.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure` or :class:`bokeh.models.widgets.panels.Tabs` or :class:`bokeh.models.layouts.Column`
    """
    def get_rows_data(rows_files):
        file_sizes = []
        partition_bounds = []
        parts_file = [
            x['path'] for x in rows_files if x['path'].endswith('parts')
        ]
        if parts_file:
            parts = hadoop_ls(parts_file[0])
            for i, x in enumerate(parts):
                index = x['path'].split(f'{parts_file[0]}/part-')[1].split(
                    '-')[0]
                if i < len(parts) - 1:
                    test_index = parts[i + 1]['path'].split(
                        f'{parts_file[0]}/part-')[1].split('-')[0]
                    if test_index == index:
                        continue
                file_sizes.append(x['size_bytes'])
        metadata_file = [
            x['path'] for x in rows_files
            if x['path'].endswith('metadata.json.gz')
        ]
        if metadata_file:
            with hadoop_open(metadata_file[0], 'rb') as f:
                rows_meta = json.loads(f.read())
                try:
                    partition_bounds = [(x['start']['locus']['contig'],
                                         x['start']['locus']['position'],
                                         x['end']['locus']['contig'],
                                         x['end']['locus']['position'])
                                        for x in rows_meta['jRangeBounds']]
                except KeyError:
                    pass
        return partition_bounds, file_sizes

    def scale_file_sizes(file_sizes):
        min_file_size = min(file_sizes) * 1.1
        total_file_size = sum(file_sizes)
        all_scales = [('T', 1e12), ('G', 1e9), ('M', 1e6), ('K', 1e3),
                      ('', 1e0)]
        for overall_scale, overall_factor in all_scales:
            if total_file_size > overall_factor:
                total_file_size /= overall_factor
                break
        for scale, factor in all_scales:
            if min_file_size > factor:
                file_sizes = [x / factor for x in file_sizes]
                break
        total_file_size = f'{total_file_size:.1f} {overall_scale}B'
        return total_file_size, file_sizes, scale

    files = hadoop_ls(t_path)

    rows_file = [x['path'] for x in files if x['path'].endswith('rows')]
    entries_file = [x['path'] for x in files if x['path'].endswith('entries')]
    success_file = [
        x['modification_time'] for x in files if x['path'].endswith('SUCCESS')
    ]

    metadata_file = [
        x['path'] for x in files if x['path'].endswith('metadata.json.gz')
    ]
    if not metadata_file:
        raise FileNotFoundError('No metadata.json.gz file found.')

    with hadoop_open(metadata_file[0], 'rb') as f:
        overall_meta = json.loads(f.read())
        rows_per_partition = overall_meta['components']['partition_counts'][
            'counts']

    if not rows_file:
        raise FileNotFoundError('No rows directory found.')
    rows_files = hadoop_ls(rows_file[0])

    data_type = 'Table'
    if entries_file:
        data_type = 'MatrixTable'
        rows_file = [
            x['path'] for x in rows_files if x['path'].endswith('rows')
        ]
        rows_files = hadoop_ls(rows_file[0])
    row_partition_bounds, row_file_sizes = get_rows_data(rows_files)

    total_file_size, row_file_sizes, row_scale = scale_file_sizes(
        row_file_sizes)

    panel_size = 480
    subpanel_size = 120

    if not row_partition_bounds:
        warning('Table is not partitioned. Only plotting file sizes')
        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            row_file_sizes, bins=50)
        p_file_size = figure(plot_width=panel_size, plot_height=panel_size)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        p_file_size.yaxis.axis_label = f'File size ({row_scale}B)'
        return p_file_size

    all_data = {
        'partition_widths':
        [-1 if x[0] != x[2] else x[3] - x[1] for x in row_partition_bounds],
        'partition_bounds':
        [f'{x[0]}:{x[1]}-{x[2]}:{x[3]}' for x in row_partition_bounds],
        'spans_chromosome': [
            'Spans chromosomes' if x[0] != x[2] else 'Within chromosome'
            for x in row_partition_bounds
        ],
        'row_file_sizes':
        row_file_sizes,
        'row_file_sizes_human':
        [f'{x:.1f} {row_scale}B' for x in row_file_sizes],
        'rows_per_partition':
        rows_per_partition,
        'index':
        list(range(len(rows_per_partition)))
    }

    if entries_file:
        entries_rows_files = hadoop_ls(entries_file[0])
        entries_rows_file = [
            x['path'] for x in entries_rows_files if x['path'].endswith('rows')
        ]
        if entries_rows_file:
            entries_files = hadoop_ls(entries_rows_file[0])
            entry_partition_bounds, entry_file_sizes = get_rows_data(
                entries_files)
            total_entry_file_size, entry_file_sizes, entry_scale = scale_file_sizes(
                entry_file_sizes)
            all_data['entry_file_sizes'] = entry_file_sizes
            all_data['entry_file_sizes_human'] = [
                f'{x:.1f} {entry_scale}B' for x in entry_file_sizes
            ]

    title = f'{data_type}: {t_path}'

    msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_file_size}<br/>"
    if success_file[0]:
        msg += success_file[0]

    tools = "hover,save,pan,box_zoom,reset,wheel_zoom"

    source = ColumnDataSource(pd.DataFrame(all_data))
    p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
    p.title.text = title
    p.xaxis.axis_label = 'Number of rows'
    p.yaxis.axis_label = f'File size ({row_scale}B)'
    color_map = factor_cmap('spans_chromosome',
                            palette=Spectral8,
                            factors=list(set(all_data['spans_chromosome'])))
    p.scatter('rows_per_partition',
              'row_file_sizes',
              color=color_map,
              legend='spans_chromosome',
              source=source)
    p.legend.location = 'bottom_right'
    p.select_one(HoverTool).tooltips = [
        (x, f'@{x}') for x in ('rows_per_partition', 'row_file_sizes_human',
                               'partition_bounds', 'index')
    ]

    p_stats = Div(text=msg)
    p_rows_per_partition = figure(x_range=p.x_range,
                                  plot_width=panel_size,
                                  plot_height=subpanel_size)
    p_file_size = figure(y_range=p.y_range,
                         plot_width=subpanel_size,
                         plot_height=panel_size)

    rows_per_partition_hist, rows_per_partition_edges = np.histogram(
        all_data['rows_per_partition'], bins=50)
    p_rows_per_partition.quad(top=rows_per_partition_hist,
                              bottom=0,
                              left=rows_per_partition_edges[:-1],
                              right=rows_per_partition_edges[1:],
                              fill_color="#036564",
                              line_color="#033649")
    row_file_sizes_hist, row_file_sizes_edges = np.histogram(
        all_data['row_file_sizes'], bins=50)
    p_file_size.quad(right=row_file_sizes_hist,
                     left=0,
                     bottom=row_file_sizes_edges[:-1],
                     top=row_file_sizes_edges[1:],
                     fill_color="#036564",
                     line_color="#033649")

    rows_grid = gridplot([[p_rows_per_partition, p_stats], [p, p_file_size]])

    if 'entry_file_sizes' in all_data:
        title = f'Statistics for {data_type}: {t_path}'

        msg = f"Rows: {sum(all_data['rows_per_partition']):,}<br/>Partitions: {len(all_data['rows_per_partition']):,}<br/>Size: {total_entry_file_size}<br/>"
        if success_file[0]:
            msg += success_file[0]

        source = ColumnDataSource(pd.DataFrame(all_data))
        p = figure(tools=tools, plot_width=panel_size, plot_height=panel_size)
        p.title.text = title
        p.xaxis.axis_label = 'Number of rows'
        p.yaxis.axis_label = f'File size ({entry_scale}B)'
        color_map = factor_cmap('spans_chromosome',
                                palette=Spectral8,
                                factors=list(set(
                                    all_data['spans_chromosome'])))
        p.scatter('rows_per_partition',
                  'entry_file_sizes',
                  color=color_map,
                  legend='spans_chromosome',
                  source=source)
        p.legend.location = 'bottom_right'
        p.select_one(HoverTool).tooltips = [
            (x, f'@{x}')
            for x in ('rows_per_partition', 'entry_file_sizes_human',
                      'partition_bounds', 'index')
        ]

        p_stats = Div(text=msg)
        p_rows_per_partition = figure(x_range=p.x_range,
                                      plot_width=panel_size,
                                      plot_height=subpanel_size)
        p_rows_per_partition.quad(top=rows_per_partition_hist,
                                  bottom=0,
                                  left=rows_per_partition_edges[:-1],
                                  right=rows_per_partition_edges[1:],
                                  fill_color="#036564",
                                  line_color="#033649")
        p_file_size = figure(y_range=p.y_range,
                             plot_width=subpanel_size,
                             plot_height=panel_size)

        row_file_sizes_hist, row_file_sizes_edges = np.histogram(
            all_data['entry_file_sizes'], bins=50)
        p_file_size.quad(right=row_file_sizes_hist,
                         left=0,
                         bottom=row_file_sizes_edges[:-1],
                         top=row_file_sizes_edges[1:],
                         fill_color="#036564",
                         line_color="#033649")
        entries_grid = gridplot([[p_rows_per_partition, p_stats],
                                 [p, p_file_size]])

        return Tabs(tabs=[
            Panel(child=entries_grid, title='Entries'),
            Panel(child=rows_grid, title='Rows')
        ])
    else:
        return rows_grid
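A short usage sketch (paths hypothetical), assuming the function is exposed as hl.plot.hail_metadata: write a small table locally, then build and display the metadata plot with Bokeh.

import hail as hl
from bokeh.io import show

ht = hl.utils.range_table(10_000, n_partitions=16)
ht.write('/tmp/example.ht', overwrite=True)    # hypothetical output path

p = hl.plot.hail_metadata('/tmp/example.ht')   # Figure, Tabs, or Column
show(p)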
Example #7
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
         log=None,
         quiet=False,
         append=False,
         min_block_size=0,
         branching_factor=50,
         tmp_dir=None,
         default_reference='GRCh37',
         idempotent=False,
         global_seed=6348563392232659379,
         spark_conf=None,
         skip_logging_configuration=False,
         local_tmpdir=None,
         _optimizer_iterations=None):
    """Initialize Hail and Spark.

    Examples
    --------
    Import and initialize Hail using GRCh38 as the default reference genome:

    >>> import hail as hl
    >>> hl.init(default_reference='GRCh38')  # doctest: +SKIP

    Notes
    -----
    Hail is not only a Python library; most of Hail is written in Java/Scala
    and runs together with Apache Spark in the Java Virtual Machine (JVM).
    In order to use Hail, a JVM needs to run as well. The :func:`.init`
    function is used to initialize Hail and Spark.

    This function also sets global configuration parameters used for the Hail
    session, like the default reference genome and log file location.

    This function will be called automatically (with default parameters) if
    any Hail functionality requiring the backend (most of the library!) is used.
    To initialize Hail explicitly with non-default arguments, be sure to do so
    directly after importing the module, as in the above example.

    To facilitate the migration from Spark to the ServiceBackend, this method
    calls init_service when the environment variable HAIL_QUERY_BACKEND is set
    to "service".

    Note
    ----
    If a :class:`pyspark.SparkContext` is already running, then Hail must be
    initialized with it as an argument:

    >>> hl.init(sc=sc)  # doctest: +SKIP

    See Also
    --------
    :func:`.stop`

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :class:`str`
        Spark application name.
    master : :class:`str`, optional
        URL identifying the Spark leader (master) node or `local[N]` for local clusters.
    local : :class:`str`
       Local-mode core limit indicator. Must either be `local[N]` where N is a
       positive integer or `local[*]`. The latter indicates Spark should use all
       cores available. `local[*]` does not respect most containerization CPU
       limits. This option is only used if `master` is unset and `spark.master`
       is not set in the Spark configuration.
    log : :class:`str`
        Local path for Hail log file. Does not currently support distributed
        file systems like Google Storage, S3, or HDFS.
    quiet : :obj:`bool`
        Print fewer log messages.
    append : :obj:`bool`
        Append to the end of the log file.
    min_block_size : :obj:`int`
        Minimum file block size in MB.
    branching_factor : :obj:`int`
        Branching factor for tree aggregation.
    tmp_dir : :class:`str`, optional
        Networked temporary directory.  Must be a network-visible file
        path.  Defaults to /tmp in the default scheme.
    default_reference : :class:`str`
        Default reference genome. Either ``'GRCh37'``, ``'GRCh38'``,
        ``'GRCm38'``, or ``'CanFam3'``.
    idempotent : :obj:`bool`
        If ``True``, calling this function is a no-op if Hail has already been initialized.
    global_seed : :obj:`int`, optional
        Global random seed.
    spark_conf : :obj:`dict` of :class:`str` to :class`str`, optional
        Spark configuration parameters.
    skip_logging_configuration : :obj:`bool`
        Skip logging configuration in java and python.
    local_tmpdir : :class:`str`, optional
        Local temporary directory.  Used on driver and executor nodes.
        Must use the file scheme.  Defaults to TMPDIR, or /tmp.
    """
    if Env._hc:
        if idempotent:
            return
        else:
            warning(
                'Hail has already been initialized. If this call was intended to change configuration,'
                ' close the session with hl.stop() first.')

    if os.environ.get('HAIL_QUERY_BACKEND') == 'service':
        import asyncio
        # NB: do not use warning because that will initialize Env._hc, which we are trying to do right now.
        print(
            'When using the query service backend, use `await init_service()`',
            file=sys.stderr)
        return asyncio.get_event_loop().run_until_complete(
            init_service(
                log=log,
                quiet=quiet,
                append=append,
                tmpdir=tmp_dir,
                local_tmpdir=local_tmpdir,
                default_reference=default_reference,
                global_seed=global_seed,
                skip_logging_configuration=skip_logging_configuration))

    from hail.backend.spark_backend import SparkBackend

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmp_dir)
    local_tmpdir = _get_local_tmpdir(local_tmpdir)
    optimizer_iterations = get_env_or_default(_optimizer_iterations,
                                              'HAIL_OPTIMIZER_ITERATIONS', 3)

    backend = SparkBackend(idempotent, sc, spark_conf, app_name, master, local,
                           log, quiet, append, min_block_size,
                           branching_factor, tmpdir, local_tmpdir,
                           skip_logging_configuration, optimizer_iterations)

    if not backend.fs.exists(tmpdir):
        backend.fs.mkdir(tmpdir)

    HailContext.create(log, quiet, append, tmpdir, local_tmpdir,
                       default_reference, global_seed, backend)
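As the warning above indicates, changing configuration requires closing the current session first. A minimal sketch:

import hail as hl

hl.init(default_reference='GRCh37')   # first session
# ... work ...
hl.stop()                             # close the session before reconfiguring
hl.init(default_reference='GRCh38', min_block_size=16)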
Example #8
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
         log=None,
         quiet=False,
         append=False,
         min_block_size=0,
         branching_factor=50,
         tmp_dir='/tmp',
         default_reference='GRCh37',
         idempotent=False,
         global_seed=6348563392232659379,
         spark_conf=None,
         skip_logging_configuration=False,
         local_tmpdir=None,
         _optimizer_iterations=None):
    """Initialize Hail and Spark.

    Examples
    --------
    Import and initialize Hail using GRCh38 as the default reference genome:

    >>> import hail as hl
    >>> hl.init(default_reference='GRCh38')  # doctest: +SKIP

    Notes
    -----
    Hail is not only a Python library; most of Hail is written in Java/Scala
    and runs together with Apache Spark in the Java Virtual Machine (JVM).
    In order to use Hail, a JVM needs to run as well. The :func:`.init`
    function is used to initialize Hail and Spark.

    This function also sets global configuration parameters used for the Hail
    session, like the default reference genome and log file location.

    This function will be called automatically (with default parameters) if
    any Hail functionality requiring the backend (most of the library!) is used.
    To initialize Hail explicitly with non-default arguments, be sure to do so
    directly after importing the module, as in the above example.

    Note
    ----
    If a :class:`pyspark.SparkContext` is already running, then Hail must be
    initialized with it as an argument:

    >>> hl.init(sc=sc)  # doctest: +SKIP

    See Also
    --------
    :func:`.stop`

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :obj:`str`
        Spark application name.
    master : :obj:`str`, optional
        Spark master.
    local : :obj:`str`
       Local-mode master, used if `master` is not defined here or in the
       Spark configuration.
    log : :obj:`str`
        Local path for Hail log file. Does not currently support distributed
        file systems like Google Storage, S3, or HDFS.
    quiet : :obj:`bool`
        Print fewer log messages.
    append : :obj:`bool`
        Append to the end of the log file.
    min_block_size : :obj:`int`
        Minimum file block size in MB.
    branching_factor : :obj:`int`
        Branching factor for tree aggregation.
    tmp_dir : :obj:`str`, optional
        Networked temporary directory.  Must be a network-visible file
        path.  Defaults to /tmp in the default scheme.
    default_reference : :obj:`str`
        Default reference genome. Either ``'GRCh37'``, ``'GRCh38'``,
        ``'GRCm38'``, or ``'CanFam3'``.
    idempotent : :obj:`bool`
        If ``True``, calling this function is a no-op if Hail has already been initialized.
    global_seed : :obj:`int`, optional
        Global random seed.
    spark_conf : :obj:`dict[str, str]`, optional
        Spark configuration parameters.
    skip_logging_configuration : :obj:`bool`
        Skip logging configuration in java and python.
    local_tmpdir : :obj:`str`, optional
        Local temporary directory.  Used on driver and executor nodes.
        Must use the file scheme.  Defaults to TMPDIR, or /tmp.
    """
    from hail.backend.spark_backend import SparkBackend

    if Env._hc:
        if idempotent:
            return
        else:
            warning(
                'Hail has already been initialized. If this call was intended to change configuration,'
                ' close the session with hl.stop() first.')

    log = _get_log(log)
    tmpdir = _get_tmpdir(tmp_dir)
    local_tmpdir = _get_local_tmpdir(local_tmpdir)
    optimizer_iterations = get_env_or_default(_optimizer_iterations,
                                              'HAIL_OPTIMIZER_ITERATIONS', 3)

    backend = SparkBackend(idempotent, sc, spark_conf, app_name, master, local,
                           log, quiet, append, min_block_size,
                           branching_factor, tmpdir, local_tmpdir,
                           skip_logging_configuration, optimizer_iterations)

    HailContext(log, quiet, append, tmp_dir, local_tmpdir, default_reference,
                global_seed, backend)
def new_combiner(
    *,
    output_path: str,
    temp_path: str,
    save_path: Optional[str] = None,
    gvcf_paths: Optional[List[str]] = None,
    vds_paths: Optional[List[str]] = None,
    vds_sample_counts: Optional[List[int]] = None,
    intervals: Optional[List[Interval]] = None,
    import_interval_size: Optional[int] = None,
    use_genome_default_intervals: bool = False,
    use_exome_default_intervals: bool = False,
    gvcf_external_header: Optional[str] = None,
    gvcf_sample_names: Optional[List[str]] = None,
    gvcf_info_to_keep: Optional[Collection[str]] = None,
    gvcf_reference_entry_fields_to_keep: Optional[Collection[str]] = None,
    branch_factor: int = VariantDatasetCombiner.default_branch_factor,
    target_records: int = VariantDatasetCombiner.default_target_records,
    batch_size: int = VariantDatasetCombiner.default_gvcf_batch_size,
    reference_genome: Union[str, hl.ReferenceGenome] = 'default',
    contig_recoding: Optional[Dict[str, str]] = None,
    force: bool = False,
) -> VariantDatasetCombiner:
    if not (gvcf_paths or vds_paths):
        raise ValueError(
            "at least one  of 'gvcf_paths' or 'vds_paths' must be nonempty")
    if gvcf_paths is None:
        gvcf_paths = []
    if vds_paths is None:
        vds_paths = []
    if vds_sample_counts is not None and len(vds_paths) != len(
            vds_sample_counts):
        raise ValueError(
            "'vds_paths' and 'vds_sample_counts' (if present) must have the same length "
            f'{len(vds_paths)} != {len(vds_sample_counts)}')
    if (gvcf_sample_names is None) != (gvcf_external_header is None):
        raise ValueError(
            "both 'gvcf_sample_names' and 'gvcf_external_header' must be set or unset"
        )
    if gvcf_sample_names is not None and len(gvcf_sample_names) != len(
            gvcf_paths):
        raise ValueError(
            "'gvcf_sample_names' and 'gvcf_paths' must have the same length "
            f'{len(gvcf_sample_names)} != {len(gvcf_paths)}')

    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'new_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )

    def maybe_load_from_saved_path(
            save_path: str) -> Optional[VariantDatasetCombiner]:
        if force:
            return None
        fs = hl.current_backend().fs
        if fs.exists(save_path):
            try:
                combiner = load_combiner(save_path)
                warning(
                    f'found existing combiner plan at {save_path}, using it')
                # we overwrite these values as they are serialized, but not part of the
                # hash for an autogenerated name and we want users to be able to overwrite
                # these when resuming a combine (a common reason to need to resume a combine
                # is a failure due to branch factor being too large)
                combiner.branch_factor = branch_factor
                combiner.target_records = target_records
                combiner.gvcf_batch_size = batch_size
                return combiner
            except (ValueError, TypeError, OSError, KeyError):
                warning(
                    f'file exists at {save_path}, but it is not a valid combiner plan, overwriting'
                )
        return None

    # We do the first save_path check now after validating the arguments
    if save_path is not None:
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner

    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored."
        )

    if intervals is not None:
        pass
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
    elif use_genome_default_intervals:
        size = VariantDatasetCombiner.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    elif use_exome_default_intervals:
        size = VariantDatasetCombiner.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
    assert intervals is not None

    if isinstance(reference_genome, str):
        reference_genome = hl.get_reference(reference_genome)

    if gvcf_reference_entry_fields_to_keep is None and vds_paths:
        vds = hl.vds.read_vds(vds_paths[0])
        gvcf_reference_entry_fields_to_keep = set(
            vds.reference_data.entry) - {'END'}
    elif gvcf_reference_entry_fields_to_keep is None and gvcf_paths:
        mt = hl.import_vcf(gvcf_paths[0],
                           force_bgz=True,
                           reference_genome=reference_genome)
        mt = mt.filter_rows(hl.is_defined(mt.info.END))
        gvcf_reference_entry_fields_to_keep = defined_entry_fields(
            mt, 100_000) - {'GT', 'PGT', 'PL'}

    if save_path is None:
        sha = hashlib.sha256()
        sha.update(output_path.encode())
        sha.update(temp_path.encode())
        sha.update(str(reference_genome).encode())
        for path in vds_paths:
            sha.update(path.encode())
        for path in gvcf_paths:
            sha.update(path.encode())
        if gvcf_external_header is not None:
            sha.update(gvcf_external_header.encode())
        if gvcf_sample_names is not None:
            for name in gvcf_sample_names:
                sha.update(name.encode())
        if gvcf_info_to_keep is not None:
            for kept_info in sorted(gvcf_info_to_keep):
                sha.update(kept_info.encode())
        if gvcf_reference_entry_fields_to_keep is not None:
            for field in sorted(gvcf_reference_entry_fields_to_keep):
                sha.update(field.encode())
        if contig_recoding is not None:
            for key, value in sorted(contig_recoding.items()):
                sha.update(key.encode())
                sha.update(value.encode())
        for interval in intervals:
            sha.update(str(interval).encode())
        digest = sha.hexdigest()
        name = f'vds-combiner-plan_{digest}_{hl.__pip_version__}.json'
        save_path = os.path.join(temp_path, 'combiner-plans', name)
        saved_combiner = maybe_load_from_saved_path(save_path)
        if saved_combiner is not None:
            return saved_combiner
        else:
            warning(f'generated combiner save path of {save_path}')

    if vds_sample_counts:
        vdses = [
            VDSMetadata(path, n_samples)
            for path, n_samples in zip(vds_paths, vds_sample_counts)
        ]
    else:
        vdses = []
        for path in vds_paths:
            vds = hl.vds.read_vds(path)
            n_samples = vds.n_samples()
            vdses.append(VDSMetadata(path, n_samples))

    vdses.sort(key=lambda x: x.n_samples, reverse=True)

    return VariantDatasetCombiner(
        save_path=save_path,
        output_path=output_path,
        temp_path=temp_path,
        reference_genome=reference_genome,
        branch_factor=branch_factor,
        target_records=target_records,
        gvcf_batch_size=batch_size,
        contig_recoding=contig_recoding,
        vdses=vdses,
        gvcfs=gvcf_paths,
        gvcf_import_intervals=intervals,
        gvcf_external_header=gvcf_external_header,
        gvcf_sample_names=gvcf_sample_names,
        gvcf_info_to_keep=gvcf_info_to_keep,
        gvcf_reference_entry_fields_to_keep=gvcf_reference_entry_fields_to_keep
    )
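A usage sketch for new_combiner (all paths hypothetical), assuming the function is exposed as hl.vds.new_combiner: build a combiner plan from a couple of GVCFs with the whole-genome default intervals, run it, and read back the resulting VDS.

import hail as hl

combiner = hl.vds.new_combiner(
    output_path='gs://my-bucket/dataset.vds',              # hypothetical paths
    temp_path='gs://my-bucket/tmp/',
    gvcf_paths=['gs://my-bucket/gvcfs/sample1.g.vcf.gz',
                'gs://my-bucket/gvcfs/sample2.g.vcf.gz'],
    use_genome_default_intervals=True,
)
combiner.run()
vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')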
Example #10
def run_combiner(sample_paths: List[str],
                 out_file: str,
                 tmp_path: str,
                 *,
                 intervals: Optional[List[hl.utils.Interval]] = None,
                 import_interval_size: Optional[int] = None,
                 use_genome_default_intervals: bool = False,
                 use_exome_default_intervals: bool = False,
                 header: Optional[str] = None,
                 sample_names: Optional[List[str]] = None,
                 branch_factor: int = CombinerConfig.default_branch_factor,
                 batch_size: int = CombinerConfig.default_batch_size,
                 target_records: int = CombinerConfig.default_target_records,
                 overwrite: bool = False,
                 reference_genome: str = 'default',
                 contig_recoding: Optional[Dict[str, str]] = None,
                 key_by_locus_and_alleles: bool = False):
    """Run the Hail VCF combiner, performing a hierarchical merge to create a combined sparse matrix table.

    **Partitioning**

    The partitioning of input GVCFs, which determines the maximum parallelism per file,
    is determined by the four parameters below. One of these parameters must be passed to
    this function.

    - `intervals` -- User-supplied intervals.
    - `import_interval_size` -- Use intervals of this uniform size across the genome.
    - `use_genome_default_intervals` -- Use intervals of typical uniform size for whole
      genome GVCFs.
    - `use_exome_default_intervals` -- Use intervals of typical uniform size for exome
      GVCFs.

    It is recommended that new users include either `use_genome_default_intervals` or
    `use_exome_default_intervals`.

    Note also that the partitioning of the final, combined matrix table does not depend
    on the GVCF input partitioning.

    Parameters
    ----------
    sample_paths : :obj:`list` of :class:`str`
        Paths to individual GVCFs.
    out_file : :class:`str`
        Path to final combined matrix table.
    tmp_path : :class:`str`
        Path for intermediate output.
    intervals : list of :class:`.Interval` or None
        Import GVCFs with specified partition intervals.
    import_interval_size : :obj:`int` or None
        Import GVCFs with uniform partition intervals of specified size.
    use_genome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        whole-genome data.
    use_exome_default_intervals : :obj:`bool`
        Import GVCFs with uniform partition intervals of default size for
        exome data.
    header : :class:`str` or None
        External header file to use as GVCF header for all inputs. If defined, `sample_names` must be defined as well.
    sample_names: list of :class:`str` or None
        Sample names, to be used with `header`.
    branch_factor : :obj:`int`
        Combiner branch factor.
    batch_size : :obj:`int`
        Combiner batch size.
    target_records : :obj:`int`
        Target records per partition in each combiner phase after the first.
    overwrite : :obj:`bool`
        Overwrite output file, if it exists.
    reference_genome : :class:`str`
        Reference genome for GVCF import.
    contig_recoding: :obj:`dict` of (:class:`str`, :obj:`str`), optional
        Mapping from contig name in gVCFs to contig name in the reference
        genome.  All contigs must be present in the
        `reference_genome`, so this is useful for mapping
        differently-formatted data onto known references.
    key_by_locus_and_alleles : :obj:`bool`
        Key by both locus and alleles in the final output.

    Returns
    -------
    None

    """
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    if header is not None:
        assert sample_names is not None
        assert len(sample_names) == len(sample_paths)

    n_partition_args = (int(intervals is not None) +
                        int(import_interval_size is not None) +
                        int(use_genome_default_intervals) +
                        int(use_exome_default_intervals))

    if n_partition_args == 0:
        raise ValueError(
            "'run_combiner': require one argument from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals' to choose GVCF partitioning"
        )
    if n_partition_args > 1:
        warning(
            "'run_combiner': multiple colliding arguments found from 'intervals', 'import_interval_size', "
            "'use_genome_default_intervals', or 'use_exome_default_intervals'."
            "\n  The argument found first in the list in this warning will be used, and others ignored."
        )

    if intervals is not None:
        info(
            f"Using {len(intervals)} user-supplied intervals as partitioning for GVCF import"
        )
    elif import_interval_size is not None:
        intervals = calculate_even_genome_partitioning(reference_genome,
                                                       import_interval_size)
        info(f"Using {len(intervals)} intervals with user-supplied size"
             f" {import_interval_size} as partitioning for GVCF import")
    elif use_genome_default_intervals:
        size = CombinerConfig.default_genome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default whole-genome size"
             f" {size} as partitioning for GVCF import")
    elif use_exome_default_intervals:
        size = CombinerConfig.default_exome_interval_size
        intervals = calculate_even_genome_partitioning(reference_genome, size)
        info(f"Using {len(intervals)} intervals with default exome size"
             f" {size} as partitioning for GVCF import")

    assert intervals is not None

    config = CombinerConfig(branch_factor=branch_factor,
                            batch_size=batch_size,
                            target_records=target_records)
    plan = config.plan(len(sample_paths))

    files_to_merge = sample_paths
    n_phases = len(plan.phases)
    total_ops = len(files_to_merge) * n_phases
    total_work_done = 0
    for phase_i, phase in enumerate(plan.phases):
        phase_i += 1  # used for info messages, 1-indexed for readability

        n_jobs = len(phase.jobs)
        merge_str = 'input GVCFs' if phase_i == 1 else 'intermediate sparse matrix tables'
        job_str = hl.utils.misc.plural('job', n_jobs)
        info(
            f"Starting phase {phase_i}/{n_phases}, merging {len(files_to_merge)} {merge_str} in {n_jobs} {job_str}."
        )

        if phase_i > 1:
            intervals = calculate_new_intervals(
                hl.read_matrix_table(files_to_merge[0]).rows(),
                config.target_records,
                reference_genome=reference_genome)

        new_files_to_merge = []

        for job_i, job in enumerate(phase.jobs):
            job_i += 1  # used for info messages, 1-indexed for readability

            n_merges = len(job.merges)
            merge_str = hl.utils.misc.plural('file', n_merges)
            pct_total = 100 * job.input_total_size / total_ops
            info(
                f"Starting phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)} to create {n_merges} merged {merge_str}, corresponding to ~{pct_total:.1f}% of total I/O."
            )
            merge_mts: List[MatrixTable] = []
            for merge in job.merges:
                inputs = [files_to_merge[i] for i in merge.inputs]

                if phase_i == 1:
                    mts = [
                        transform_gvcf(vcf) for vcf in hl.import_gvcfs(
                            inputs,
                            intervals,
                            array_elements_required=False,
                            _external_header=header,
                            _external_sample_ids=[[sample_names[i]]
                                                  for i in merge.inputs]
                            if header is not None else None,
                            reference_genome=reference_genome,
                            contig_recoding=contig_recoding)
                    ]
                else:
                    mts = [
                        hl.read_matrix_table(path, _intervals=intervals)
                        for path in inputs
                    ]

                merge_mts.append(combine_gvcfs(mts))

            if phase_i == n_phases:  # final merge!
                assert n_jobs == 1
                assert len(merge_mts) == 1
                [final_mt] = merge_mts

                if key_by_locus_and_alleles:
                    final_mt = MatrixTable(
                        MatrixKeyRowsBy(final_mt._mir, ['locus', 'alleles'],
                                        is_sorted=True))
                final_mt.write(out_file, overwrite=overwrite)
                new_files_to_merge = [out_file]
                info(
                    f"Finished phase {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, 100% of total I/O finished."
                )
                break

            tmp = f'{tmp_path}_phase{phase_i}_job{job_i}/'
            hl.experimental.write_matrix_tables(merge_mts, tmp, overwrite=True)
            pad = len(str(len(merge_mts)))
            new_files_to_merge.extend(tmp + str(n).zfill(pad) + '.mt'
                                      for n in range(len(merge_mts)))
            total_work_done += job.input_total_size
            info(
                f"Finished {phase_i}/{n_phases}, job {job_i}/{len(phase.jobs)}, {100 * total_work_done / total_ops:.1f}% of total I/O finished."
            )

        info(f"Finished phase {phase_i}/{n_phases}.")

        files_to_merge = new_files_to_merge

    assert files_to_merge == [out_file]

    info("Finished!")