def rename_duplicates(dataset, name='unique_id') -> MatrixTable:
    """Add a column field holding a de-duplicated copy of the column key.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    >>> renamed = hl.rename_duplicates(dataset).cols()
    >>> duplicate_samples = (renamed.filter(renamed.s != renamed.unique_id)
    ...                             .select()
    ...                             .collect())

    Notes
    -----

    A new column field is derived from the string column key; a unique
    suffix ``_N`` is appended whenever that is needed to make the value
    unique. For instance, when the column key "NA12878" occurs three times
    in the dataset, the first occurrence yields "NA12878", the second
    yields "NA12878_1", and the third yields "NA12878_2".

    The new field is called `name`.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name of new field.

    Returns
    -------
    :class:`.MatrixTable`
    """
    # The renaming itself is delegated to the Java-side matrix table.
    deduplicated = dataset._jmt.renameDuplicates(name)
    return MatrixTable._from_java(deduplicated)
def read_multiple_matrix_tables(self, paths: 'List[str]', intervals: 'List[hl.Interval]', intervals_type):
    """Read several matrix tables through the Java backend in one request.

    The arguments are packed into a JSON document with the keys ``paths``,
    ``intervals`` and ``intervalPointType`` and handed to the backend's
    ``pyReadMultipleMatrixTables``.

    Parameters
    ----------
    paths : list of :obj:`str`
        Paths forwarded verbatim under the ``paths`` key.
    intervals : list of :class:`.Interval`
        Intervals, serialized with ``intervals_type._convert_to_json``.
    intervals_type
        Type object describing `intervals`; it must provide
        ``_convert_to_json`` and ``element_type.point_type``
        (presumably an array-of-interval type -- confirm against callers).

    Returns
    -------
    list of :class:`.MatrixTable`
        One matrix table per object returned by the backend.
    """
    request = json.dumps({
        'paths': paths,
        'intervals': intervals_type._convert_to_json(intervals),
        'intervalPointType': intervals_type.element_type.point_type._parsable_string(),
    })
    backend = self._jhc.backend()
    java_tables = backend.pyReadMultipleMatrixTables(request)
    return [MatrixTable._from_java(java_table) for java_table in java_tables]
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Build a matrix table whose columns are trios and whose entries hold
    the genotypes of each trio member.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    The result has one column per trio. When `complete_trios` is ``True``,
    only trios satisfying :meth:`.Trio.is_complete` are kept. Columns of the
    new dataset are identified by the sample ID of the trio's proband.

    The new column fields are three structs (`proband`, `father`,
    `mother`), two string fields, and a Boolean field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    The new entry fields are:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset whose columns are regrouped into trios.
    pedigree : :class:`.Pedigree`
        Pedigree describing the trios.
    complete_trios : :obj:`bool`
        If ``True``, include only trios that satisfy
        :meth:`.Trio.is_complete`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    # The regrouping is performed by the Java-side matrix table.
    java_trios = dataset._jmt.trioMatrix(pedigree._jrep, complete_trios)
    return MatrixTable._from_java(java_trios)
def window_by_locus(mt: MatrixTable, bp_window_size: int) -> MatrixTable:
    """Gather row and entry values from preceding loci into arrays.

    .. include:: ../_templates/req_tlocus.rst

    .. include:: ../_templates/experimental.rst

    Examples
    --------

    >>> ds_result = hl.window_by_locus(ds, 3)

    Notes
    -----

    Each row (variant) is grouped with the rows that precede it within a
    window of `bp_window_size` base pairs. The row values of those earlier
    variants are stored in `prev_rows` (row field of type
    ``array<struct>``) and their entry values in `prev_entries` (entry
    field of type ``array<struct>``).

    The `bp_window_size` argument is inclusive; if `bp_window_size` is 2
    and the loci are

    .. code-block:: text

        1:100
        1:100
        1:102
        1:102
        1:103
        2:100
        2:101

    then the size of `prev_rows` is 0, 1, 2, 3, 2, 0, and 1, respectively
    (and likewise for the size of `prev_entries`).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Input dataset.
    bp_window_size : :obj:`int`
        Base pairs to include in the backwards window (inclusive).

    Returns
    -------
    :class:`.MatrixTable`
    """
    # Validate the key before handing off to the Java implementation.
    require_first_key_field_locus(mt, 'window_by_locus')
    windowed = mt._jmt.windowVariants(bp_window_size)
    return MatrixTable._from_java(windowed)
def trio_matrix(dataset, pedigree, complete_trios=False) -> MatrixTable:
    """Return a matrix table with one column per trio, where each entry
    carries the genotypes of the trio's members.

    .. include:: ../_templates/req_tstring.rst

    Examples
    --------

    Create a trio matrix:

    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    Notes
    -----

    A new matrix table is built with one column per trio. If
    `complete_trios` is ``True``, trios that do not satisfy
    :meth:`.Trio.is_complete` are left out. The column identifiers of the
    new dataset are the sample IDs of the trio probands.

    Column fields of the result -- three structs (`proband`, `father`,
    `mother`), two string fields, and a Boolean field:

    - **proband** (:class:`.tstruct`) - Column fields on the proband.
    - **father** (:class:`.tstruct`) - Column fields on the father.
    - **mother** (:class:`.tstruct`) - Column fields on the mother.
    - **id** (:py:data:`.tstr`) - Column key for the proband.
    - **is_female** (:py:data:`.tbool`) - Proband is female.
      ``True`` for female, ``False`` for male, missing if unknown.
    - **fam_id** (:py:data:`.tstr`) - Family ID.

    Entry fields of the result:

    - **proband_entry** (:class:`.tstruct`) - Proband entry fields.
    - **father_entry** (:class:`.tstruct`) - Father entry fields.
    - **mother_entry** (:class:`.tstruct`) - Mother entry fields.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset whose columns are regrouped into trios.
    pedigree : :class:`.Pedigree`
        Pedigree describing the trios.
    complete_trios : :obj:`bool`
        If ``True``, include only trios that satisfy
        :meth:`.Trio.is_complete`.

    Returns
    -------
    :class:`.MatrixTable`
    """
    java_pedigree = pedigree._jrep
    trio_jmt = dataset._jmt.trioMatrix(java_pedigree, complete_trios)
    return MatrixTable._from_java(trio_jmt)
def unpersist_matrix_table(self, mt):
    """Unpersist `mt` via its Java IR and return the resulting matrix table.

    The matrix IR of `mt` (``mt._mir``) is converted with
    ``self._to_java_ir`` and ``pyUnpersist`` is invoked on the result.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table to unpersist.

    Returns
    -------
    :class:`.MatrixTable`
        Wrapper around the object returned by ``pyUnpersist``.
    """
    java_ir = self._to_java_ir(mt._mir)
    unpersisted = java_ir.pyUnpersist()
    return MatrixTable._from_java(unpersisted)
def persist_matrix_table(self, mt, storage_level):
    """Persist `mt` via its Java IR and return the resulting matrix table.

    The matrix IR of `mt` (``mt._mir``) is converted with
    ``self._to_java_ir`` and ``pyPersist`` is invoked on the result with
    `storage_level`.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table to persist.
    storage_level
        Storage level, passed through unchanged to ``pyPersist``.

    Returns
    -------
    :class:`.MatrixTable`
        Wrapper around the object returned by ``pyPersist``.
    """
    java_ir = self._to_java_ir(mt._mir)
    persisted = java_ir.pyPersist(storage_level)
    return MatrixTable._from_java(persisted)
def unpersist_matrix_table(self, mt, storage_level=None):
    """Unpersist `mt` and return the resulting matrix table.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table to unpersist; its Java counterpart (``mt._jmt``) does
        the work.
    storage_level : optional
        Ignored. Unpersisting does not use a storage level; the parameter
        is kept (now optional) only so existing callers that still pass it
        keep working, while new callers can use the same ``(mt)`` call
        shape as the other ``unpersist_matrix_table`` definitions.

    Returns
    -------
    :class:`.MatrixTable`
        Wrapper around the object returned by the Java ``unpersist`` call.
    """
    unpersisted = mt._jmt.unpersist()
    return MatrixTable._from_java(unpersisted)
def persist_matrix_table(self, mt, storage_level):
    """Persist `mt` through the Java backend and return the result.

    The matrix IR of `mt` (``mt._mir``) is converted with
    ``self._to_java_matrix_ir`` and passed, together with `storage_level`,
    to the backend's ``pyPersistMatrix``.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table to persist.
    storage_level
        Storage level, passed through unchanged to ``pyPersistMatrix``.

    Returns
    -------
    :class:`.MatrixTable`
        Wrapper around the object returned by ``pyPersistMatrix``.
    """
    java_matrix_ir = self._to_java_matrix_ir(mt._mir)
    persisted = self._jbackend.pyPersistMatrix(storage_level, java_matrix_ir)
    return MatrixTable._from_java(persisted)
def unpersist_matrix_table(self, mt):
    """Unpersist `mt` and return the resulting matrix table.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Matrix table to unpersist; its Java counterpart (``mt._jmt``) does
        the work.

    Returns
    -------
    :class:`.MatrixTable`
        Wrapper around the object returned by the Java ``unpersist`` call.
    """
    unpersisted = mt._jmt.unpersist()
    return MatrixTable._from_java(unpersisted)
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Keep or remove the rows that fall inside a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Depending on ``keep``, the result is either restricted to points inside
    the supplied interval ranges, or has every row inside those ranges
    removed.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all. This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`. If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the interval point type matches neither the first key field nor
        a struct prefix of the key, or if `intervals` contains a missing
        value.
    """
    is_matrix = isinstance(ds, MatrixTable)
    if is_matrix:
        k_type = ds.row_key.dtype
    else:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # Field names must match the leading fields of `full`, in order ...
        if list(partial) != list(full)[:len(partial)]:
            return False
        # ... and each of those fields must have the same type.
        return not any(full[field] != field_type
                       for field, field_type in partial.items())

    # A bare point (equal to the first key field) has to be wrapped in a
    # struct before it is passed on; a struct prefix of the key is used as is.
    needs_wrapper = bool(point_type == k_type[0])
    if not needs_wrapper:
        if not (isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type)):
            raise TypeError("The point type is incompatible with key type of the dataset ('{}', '{}')".format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        if not needs_wrapper:
            return interval
        return Interval(Struct(foo=interval.start),
                        Struct(foo=interval.end),
                        interval.includes_start,
                        interval.includes_end)

    java_intervals = [wrap_input(interval)._jrep for interval in hl.eval(intervals)]

    if is_matrix:
        filtered = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, java_intervals, keep)
        return MatrixTable._from_java(filtered)

    filtered = Env.hail().methods.TableFilterIntervals.apply(ds._jt, java_intervals, keep)
    return Table._from_java(filtered)
def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]:
    """Filter the rows of a dataset using a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    The ``keep`` argument selects between restricting the dataset to points
    in the supplied interval ranges and removing all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all. This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable` or :class:`.Table`
        Dataset to filter.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. The point type of the interval must
        be a prefix of the key or equal to the first field of the key.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`. If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`

    Raises
    ------
    TypeError
        If the interval point type matches neither the first key field nor
        a struct prefix of the key, or if `intervals` contains a missing
        value.
    """
    ds_is_matrix_table = isinstance(ds, MatrixTable)
    if not ds_is_matrix_table:
        assert isinstance(ds, Table)
        k_type = ds.key.dtype
    else:
        k_type = ds.row_key.dtype

    point_type = intervals.dtype.element_type.point_type

    def is_struct_prefix(partial, full):
        # `partial` is a prefix of `full` when its field names are the
        # leading field names of `full` and the field types agree.
        if list(partial) != list(full)[:len(partial)]:
            return False
        for field_name, field_type in partial.items():
            if full[field_name] != field_type:
                return False
        return True

    if point_type == k_type[0]:
        # Bare point type: each endpoint gets wrapped in a one-field struct.
        needs_wrapper = True
    elif isinstance(point_type, tstruct) and is_struct_prefix(point_type, k_type):
        needs_wrapper = False
    else:
        message = "The point type is incompatible with key type of the dataset ('{}', '{}')"
        raise TypeError(message.format(repr(point_type), repr(k_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError("'filter_intervals' does not allow missing values in 'intervals'.")
        if needs_wrapper:
            wrapped_start = Struct(foo=interval.start)
            wrapped_end = Struct(foo=interval.end)
            return Interval(wrapped_start,
                            wrapped_end,
                            interval.includes_start,
                            interval.includes_end)
        return interval

    java_intervals = []
    for evaluated in hl.eval(intervals):
        java_intervals.append(wrap_input(evaluated)._jrep)

    if ds_is_matrix_table:
        jmt = Env.hail().methods.MatrixFilterIntervals.apply(ds._jmt, java_intervals, keep)
        return MatrixTable._from_java(jmt)

    jt = Env.hail().methods.TableFilterIntervals.apply(ds._jt, java_intervals, keep)
    return Table._from_java(jt)