class BlockMatrixFilter(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR, indices_to_keep=sequenceof(sequenceof(int)))
    def __init__(self, child, indices_to_keep):
        super().__init__(child)
        self.child = child
        self.indices_to_keep = indices_to_keep

    def head_str(self):
        return _serialize_list([_serialize_list(idxs) for idxs in self.indices_to_keep])

    def _eq(self, other):
        return self.indices_to_keep == other.indices_to_keep

    def _compute_type(self):
        assert len(self.indices_to_keep) == 2
        # An empty index list for a dimension means "keep that dimension whole".
        shape = [len(idxs) if len(idxs) != 0 else self.child.typ.shape[i]
                 for i, idxs in enumerate(self.indices_to_keep)]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(shape[0], shape[1])
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.child.typ.block_size)
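# Illustrative sketch (not part of the IR module): the public BlockMatrix
# filtering API is what constructs this node. Shapes below are hypothetical.
#
# >>> from hail.linalg import BlockMatrix
# >>> bm = BlockMatrix.random(10, 10, block_size=4)                             # doctest: +SKIP
# >>> bm.filter([0, 1, 2], [5, 6]).shape                                        # doctest: +SKIP
# (3, 2)
# >>> bm.filter_rows([0, 1, 2]).shape   # empty column list keeps all columns   # doctest: +SKIP
# (3, 10)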
class ValueToBlockMatrix(BlockMatrixIR):
    @typecheck_method(child=IR,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        return '(ValueToBlockMatrix {} {} {} {})'.format(
            _serialize_ints(self.shape),
            self.block_size,
            _serialize_ints(self.dims_partitioned),
            r(self.child))

    def _compute_type(self):
        child_type = self.child.typ
        if isinstance(child_type, tarray):
            element_type = child_type._element_type
        else:
            element_type = child_type

        assert len(self.shape) == 2
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(self.shape[0], self.shape[1])
        self._type = tblockmatrix(element_type, tensor_shape, is_row_vector,
                                  self.block_size, self.dims_partitioned)
class BlockMatrixBroadcast(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR,
                      in_index_expr=sequenceof(int),
                      shape=sequenceof(int),
                      block_size=int)
    def __init__(self, child, in_index_expr, shape, block_size):
        super().__init__(child)
        self.child = child
        self.in_index_expr = in_index_expr
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        return '{} {} {}'.format(_serialize_list(self.in_index_expr),
                                 _serialize_list(self.shape),
                                 self.block_size)

    def _eq(self, other):
        return self.in_index_expr == other.in_index_expr and \
               self.shape == other.shape and \
               self.block_size == other.block_size

    def _compute_type(self):
        assert len(self.shape) == 2
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(self.shape[0], self.shape[1])
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.block_size)
class BlockMatrixBroadcast(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR,
                      broadcast_kind=str,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, broadcast_kind, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.broadcast_kind = broadcast_kind
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        return '(BlockMatrixBroadcast {} ({}) {} ({}) {})' \
            .format(escape_str(self.broadcast_kind),
                    ' '.join([str(x) for x in self.shape]),
                    self.block_size,
                    ' '.join([str(b) for b in self.dims_partitioned]),
                    r(self.child))

    def _compute_type(self):
        self._type = tblockmatrix(self.child.typ.element_type,
                                  self.shape, self.block_size, self.dims_partitioned)
class ValueToBlockMatrix(BlockMatrixIR):
    @typecheck_method(child=IR,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        return '(ValueToBlockMatrix ({}) {} ({}) {})'.format(
            ' '.join([str(x) for x in self.shape]),
            self.block_size,
            ' '.join([str(b) for b in self.dims_partitioned]),
            r(self.child))

    def _compute_type(self):
        child_type = self.child.typ
        if isinstance(child_type, tarray):
            element_type = child_type._element_type
        else:
            element_type = child_type
        self._type = tblockmatrix(element_type, self.shape,
                                  self.block_size, self.dims_partitioned)
class BlockMatrixBroadcast(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR,
                      in_index_expr=sequenceof(int),
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, in_index_expr, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.in_index_expr = in_index_expr
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        return '(BlockMatrixBroadcast {} {} {} {} {})' \
            .format(_serialize_ints(self.in_index_expr),
                    _serialize_ints(self.shape),
                    self.block_size,
                    _serialize_ints(self.dims_partitioned),
                    r(self.child))

    def _compute_type(self):
        assert len(self.shape) == 2
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(self.shape[0], self.shape[1])
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.block_size, self.dims_partitioned)
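# Illustrative note: broadcast nodes arise from the public API when a
# BlockMatrix is combined with a scalar or a row/column vector; in_index_expr
# encodes how the child's dimensions map into the output shape. A sketch
# (shapes hypothetical):
#
# >>> row = BlockMatrix.random(1, 10)    # doctest: +SKIP
# >>> full = BlockMatrix.random(10, 10)  # doctest: +SKIP
# >>> (full + row).shape                 # row is broadcast across all rows  # doctest: +SKIP
# (10, 10)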
class BlockMatrixAgg(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__(child)
        self.child = child
        self.out_index_expr = out_index_expr

    def head_str(self):
        return _serialize_list(self.out_index_expr)

    def _eq(self, other):
        return self.out_index_expr == other.out_index_expr

    def _compute_type(self):
        child_matrix_shape = tensor_shape_to_matrix_shape(self.child)
        if self.out_index_expr == [0, 1]:
            is_row_vector = False
            shape = []
        elif self.out_index_expr == [0]:
            is_row_vector = True
            shape = [child_matrix_shape[1]]
        elif self.out_index_expr == [1]:
            is_row_vector = False
            shape = [child_matrix_shape[0]]
        else:
            raise ValueError("Invalid out_index_expr")

        self._type = tblockmatrix(self.child.typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
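# Worked example of the out_index_expr -> shape mapping above, for a
# hypothetical 4 x 3 child matrix:
#
#   [0, 1] -> full aggregation: scalar, tensor shape []
#   [0]    -> one entry per column: row vector, shape [3]
#   [1]    -> one entry per row: column vector, shape [4]
#
# A sketch of the public entry point, BlockMatrix.sum:
#
# >>> bm = BlockMatrix.random(4, 3)  # doctest: +SKIP
# >>> bm.sum(axis=0).shape           # doctest: +SKIP
# (1, 3)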
class ValueToBlockMatrix(BlockMatrixIR):
    @typecheck_method(child=IR, shape=sequenceof(int), block_size=int)
    def __init__(self, child, shape, block_size):
        super().__init__(child)
        self.child = child
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        return '{} {}'.format(_serialize_list(self.shape), self.block_size)

    def _eq(self, other):
        return self.shape == other.shape and \
               self.block_size == other.block_size

    def _compute_type(self):
        child_type = self.child.typ
        if isinstance(child_type, tarray):
            element_type = child_type._element_type
        else:
            element_type = child_type

        assert len(self.shape) == 2
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(self.shape[0], self.shape[1])
        self._type = tblockmatrix(element_type, tensor_shape, is_row_vector, self.block_size)
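# Trace of _compute_type above (a sketch): for a child of type
# tarray<float64> and shape == [1, 5], element_type resolves to float64 and,
# assuming the usual convention of _matrix_shape_to_tensor_shape, the unit
# dimension is dropped, yielding tensor shape [5] with is_row_vector=True.
# A scalar child (e.g. tfloat64) is used as the element type directly.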
class BlockMatrixRandom(BlockMatrixIR):
    @typecheck_method(seed=int, gaussian=bool, shape=sequenceof(int), block_size=int)
    def __init__(self, seed, gaussian, shape, block_size):
        super().__init__()
        self.seed = seed
        self.gaussian = gaussian
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        return '{} {} {} {}'.format(self.seed, self.gaussian,
                                    _serialize_list(self.shape), self.block_size)

    def _eq(self, other):
        return self.seed == other.seed and \
               self.gaussian == other.gaussian and \
               self.shape == other.shape and \
               self.block_size == other.block_size

    def _compute_type(self):
        assert len(self.shape) == 2
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(self.shape[0], self.shape[1])
        self._type = tblockmatrix(hl.tfloat64, tensor_shape, is_row_vector, self.block_size)
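# Illustrative sketch: BlockMatrix.random is the public constructor that
# lowers to this node (argument values here are hypothetical):
#
# >>> from hail.linalg import BlockMatrix
# >>> bm = BlockMatrix.random(5, 5, block_size=2, seed=42, gaussian=True)  # doctest: +SKIP
# >>> bm.shape  # doctest: +SKIP
# (5, 5)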
class TableNativeReader(TableReader):
    @typecheck_method(path=str,
                      intervals=nullable(sequenceof(anytype)),
                      filter_intervals=bool)
    def __init__(self, path, intervals, filter_intervals):
        if intervals is not None:
            t = hl.expr.impute_type(intervals)
            # 'intervals' must impute to an array of intervals; 'or' (not 'and')
            # is required here, otherwise a non-array type slips through and
            # fails below when its point type is inspected.
            if not isinstance(t, hl.tarray) or not isinstance(t.element_type, hl.tinterval):
                raise TypeError("'intervals' must be an array of tintervals")
            pt = t.element_type.point_type
            if isinstance(pt, hl.tstruct):
                self._interval_type = t
            else:
                self._interval_type = hl.tarray(hl.tinterval(hl.tstruct(__point=pt)))

        self.path = path
        self.filter_intervals = filter_intervals
        if intervals is not None and t != self._interval_type:
            self.intervals = [hl.Interval(hl.Struct(__point=i.start),
                                          hl.Struct(__point=i.end),
                                          i.includes_start,
                                          i.includes_end)
                              for i in intervals]
        else:
            self.intervals = intervals

    def render(self):
        reader = {'name': 'TableNativeReader',
                  'path': self.path}
        if self.intervals is not None:
            assert self._interval_type is not None
            reader['options'] = {
                'name': 'NativeReaderOptions',
                'intervals': self._interval_type._convert_to_json(self.intervals),
                'intervalPointType': self._interval_type.element_type.point_type._parsable_string(),
                'filterIntervals': self.filter_intervals
            }
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        return isinstance(other, TableNativeReader) and \
               other.path == self.path and \
               other.intervals == self.intervals and \
               other.filter_intervals == self.filter_intervals
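# Illustrative sketch: hl.read_table constructs this reader; _intervals and
# _filter_intervals are internal, unstable keyword arguments, and the path
# below is hypothetical:
#
# >>> intervals = [hl.Interval(1, 100, includes_start=True, includes_end=False)]  # doctest: +SKIP
# >>> ht = hl.read_table('data/example.ht', _intervals=intervals)                 # doctest: +SKIP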
class BlockMatrixAgg(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__()
        self.child = child
        self.out_index_expr = out_index_expr

    def render(self, r):
        return '(BlockMatrixAgg {} {})' \
            .format(_serialize_list(self.out_index_expr), r(self.child))

    def _compute_type(self):
        shape = [self.child.typ.shape[i] for i in self.out_index_expr]
        is_row_vector = self.out_index_expr == [1]
        self._type = tblockmatrix(self.child.typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
class StringTableReader(TableReader):
    @typecheck_method(paths=oneof(str, sequenceof(str)), min_partitions=nullable(int))
    def __init__(self, paths, min_partitions):
        self.paths = paths
        self.min_partitions = min_partitions

    def render(self):
        reader = {'name': 'StringTableReader',
                  'files': self.paths,
                  'minPartitions': self.min_partitions}
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        # The attribute is 'paths' (plural); comparing 'path' would raise
        # AttributeError.
        return isinstance(other, StringTableReader) and \
               other.paths == self.paths and \
               other.min_partitions == self.min_partitions
class BlockMatrixSlice(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR, slices=sequenceof(slice))
    def __init__(self, child, slices):
        super().__init__(child)
        self.child = child
        self.slices = slices

    def head_str(self):
        return '{}'.format(_serialize_list([f'({s.start} {s.stop} {s.step})' for s in self.slices]))

    def _eq(self, other):
        return self.slices == other.slices

    def _compute_type(self):
        assert len(self.slices) == 2
        matrix_shape = [1 + (s.stop - s.start - 1) // s.step for s in self.slices]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(matrix_shape[0], matrix_shape[1])
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.child.typ.block_size)
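# Worked example of the shape formula above: slice(2, 11, 3) keeps indices
# 2, 5, 8, and 1 + (11 - 2 - 1) // 3 == 1 + 8 // 3 == 3. NumPy-style indexing
# on a BlockMatrix lowers to this node (a sketch; shapes hypothetical):
#
# >>> bm = BlockMatrix.random(20, 20)  # doctest: +SKIP
# >>> bm[2:11:3, 0:4].shape            # doctest: +SKIP
# (3, 4)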
class BlockMatrixAgg(BlockMatrixIR):
    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__(child)
        self.child = child
        self.out_index_expr = out_index_expr

    def head_str(self):
        return _serialize_list(self.out_index_expr)

    def _eq(self, other):
        return self.out_index_expr == other.out_index_expr

    def _compute_type(self):
        shape = [tensor_shape_to_matrix_shape(self.child)[i] for i in self.out_index_expr]
        is_row_vector = self.out_index_expr == [1]
        self._type = tblockmatrix(self.child.typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
import hail as hl
from hail.expr.expressions import expr_float64, expr_numeric, analyze
from hail.typecheck import typecheck, oneof, sequenceof, nullable
from hail.table import Table
from hail.matrixtable import MatrixTable
from hail.utils import wrap_to_list, new_temp_file
import numpy as np


@typecheck(weight_expr=expr_float64,
           ld_score_expr=expr_numeric,
           chi_sq_exprs=oneof(expr_float64, sequenceof(expr_float64)),
           n_samples_exprs=oneof(expr_numeric, sequenceof(expr_numeric)),
           n_blocks=int,
           two_step_threshold=int,
           n_reference_panel_variants=nullable(int))
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    where :math:`N` is the number of samples, :math:`M` is the number of
    variants, :math:`h_g^2` is the SNP-heritability, :math:`a` captures
    confounding biases, and :math:`l_j` is the LD score of variant :math:`j`.
from functools import reduce

import hail as hl
from hail.expr.functions import _ndarray
from hail.expr.functions import array as aarray
from hail.expr.types import HailType, tfloat64, ttuple, tndarray
from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof
from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple, expr_any,
                                   expr_array, expr_ndarray, expr_numeric,
                                   Int64Expression, cast_expr, construct_expr)
from hail.expr.expressions.typed_expressions import NDArrayNumericExpression
from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat

tsequenceof_nd = oneof(sequenceof(expr_ndarray()), tupleof(expr_ndarray()),
                       expr_array(expr_ndarray()))
shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple())


def array(input_array, dtype=None):
    """Construct an :class:`.NDArrayExpression`

    Examples
    --------

    >>> hl.eval(hl.nd.array([1, 2, 3, 4]))
    array([1, 2, 3, 4], dtype=int32)

    >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]]))
    array([[1, 2, 3],
           [4, 5, 6]], dtype=int32)
from typing import List

from hail import MatrixTable
from hail.ir import MatrixMultiWrite, MatrixNativeMultiWriter
from hail.typecheck import sequenceof, typecheck
from hail.utils.java import Env


@typecheck(mts=sequenceof(MatrixTable), prefix=str, overwrite=bool, stage_locally=bool)
def write_matrix_tables(mts: List[MatrixTable], prefix: str, overwrite: bool = False,
                        stage_locally: bool = False):
    writer = MatrixNativeMultiWriter(prefix, overwrite, stage_locally)
    Env.backend().execute(MatrixMultiWrite([mt._mir for mt in mts], writer))
import numpy as np

import hail as hl
from hail.table import Table
from hail.linalg import BlockMatrix
from hail.typecheck import typecheck, nullable, sequenceof, oneof
from hail.expr.expressions import expr_float64, expr_numeric, expr_locus
from hail.utils import new_temp_file, wrap_to_list


@typecheck(entry_expr=expr_float64,
           locus_expr=expr_locus(),
           radius=oneof(int, float),
           coord_expr=nullable(expr_float64),
           annotation_exprs=nullable(oneof(expr_numeric, sequenceof(expr_numeric))),
           block_size=nullable(int))
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')
class Pedigree(object):
    """Class containing a list of trios, with extra functionality.

    :param trios: list of trio objects to include in pedigree
    :type trios: list of :class:`.Trio`
    """

    @typecheck_method(trios=sequenceof(Trio))
    def __init__(self, trios):
        self._trios = tuple(trios)

    def __eq__(self, other):
        return isinstance(other, Pedigree) and self._trios == other._trios

    def __hash__(self):
        return hash(self._trios)

    def __iter__(self):
        return self._trios.__iter__()

    @classmethod
    @typecheck_method(fam_path=str, delimiter=str)
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        Notes
        -------
        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter.

        :rtype: :class:`.Pedigree`
        """
        trios = []
        missing_sex_count = 0
        missing_sex_values = set()
        with Env.fs().open(fam_path) as file:
            for line in file:
                split_line = re.split(delimiter, line.strip())
                num_fields = len(split_line)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}"
                        .format(num_fields, line))
                (fam, kid, dad, mom, sex, _) = tuple(split_line)
                # 1 is male, 2 is female, 0 is unknown.
                is_female = sex == "2" if sex == "1" or sex == "2" else None
                if is_female is None:
                    missing_sex_count += 1
                    missing_sex_values.add(kid)

                trio = Trio(kid,
                            fam if fam != "0" else None,
                            dad if dad != "0" else None,
                            mom if mom != "0" else None,
                            is_female)
                trios.append(trio)

        only_ids = [trio.s for trio in trios]
        duplicate_ids = [id for id, count in Counter(only_ids).items() if count > 1]
        if duplicate_ids:
            raise FatalError("Invalid pedigree: found duplicate proband IDs\n{}"
                             .format(duplicate_ids))

        if missing_sex_count > 0:
            warning("Found {} samples with missing sex information (not 1 or 2).\n Missing samples: [{}]"
                    .format(missing_sex_count, missing_sex_values))

        return Pedigree(trios)

    @property
    def trios(self):
        """List of trio objects in this pedigree.

        :rtype: list of :class:`.Trio`
        """
        return self._trios

    def complete_trios(self):
        """List of trio objects that have a defined father and mother.

        :rtype: list of :class:`.Trio`
        """
        return list(filter(lambda t: t.is_complete(), self.trios))

    @typecheck_method(samples=sequenceof(nullable(str)))
    def filter_to(self, samples):
        """Filter the pedigree to a given list of sample IDs.

        **Notes**

        For any trio, the following steps will be applied:

        - If the proband is not in the list of samples provided, the trio is removed.
        - If the father is not in the list of samples provided, `pat_id` is set to ``None``.
        - If the mother is not in the list of samples provided, `mat_id` is set to ``None``.

        :param samples: list of sample IDs to keep
        :type samples: list of str

        :rtype: :class:`.Pedigree`
        """
        sample_set = set(samples)

        filtered_trios = []
        for trio in self._trios:
            restricted_trio = trio._restrict_to(sample_set)
            if restricted_trio is not None:
                filtered_trios.append(restricted_trio)

        return Pedigree(filtered_trios)

    @typecheck_method(path=str)
    def write(self, path):
        """Write a .fam file to the given path.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')
        >>> ped.write('output/out.fam')

        **Notes**

        This method writes a `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_.

        .. caution::

            Phenotype information is not preserved in the Pedigree data structure in Hail.
            Reading and writing a PLINK .fam file will result in loss of this information.
            Use the key table method :meth:`~hail.KeyTable.import_fam` to manipulate this
            information.

        :param path: output path
        :type path: str
        """
        lines = [t._to_fam_file_line() for t in self._trios]

        with Env.fs().open(path, mode="w") as file:
            for line in lines:
                file.write(line + "\n")
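# Illustrative sketch: a round trip through the API above, restricting a
# pedigree to its complete trios (paths reuse the doctest fixtures from the
# docstrings):
#
# >>> ped = hl.Pedigree.read('data/test.fam')                               # doctest: +SKIP
# >>> complete = ped.complete_trios()                                       # doctest: +SKIP
# >>> ped.filter_to([t.s for t in complete]).write('output/complete.fam')   # doctest: +SKIP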
from typing import List, Optional

from hail import MatrixTable
from hail.linalg import BlockMatrix
from hail.ir import (MatrixMultiWrite, MatrixNativeMultiWriter, BlockMatrixMultiWrite,
                     BlockMatrixBinaryMultiWriter, BlockMatrixTextMultiWriter,
                     BlockMatrixNativeMultiWriter)
from hail.typecheck import nullable, sequenceof, typecheck, enumeration
from hail.utils.java import Env


@typecheck(mts=sequenceof(MatrixTable), prefix=str, overwrite=bool, stage_locally=bool)
def write_matrix_tables(mts: List[MatrixTable], prefix: str, overwrite: bool = False,
                        stage_locally: bool = False):
    length = len(str(len(mts) - 1))
    paths = [f"{prefix}{str(i).rjust(length, '0')}.mt" for i in range(len(mts))]
    writer = MatrixNativeMultiWriter(paths, overwrite, stage_locally)
    Env.backend().execute(MatrixMultiWrite([mt._mir for mt in mts], writer))


@typecheck(bms=sequenceof(BlockMatrix), prefix=str, overwrite=bool)
def block_matrices_tofiles(bms: List[BlockMatrix], prefix: str, overwrite: bool = False):
    writer = BlockMatrixBinaryMultiWriter(prefix, overwrite)
    # The original text was truncated mid-call; completed here following the
    # write_matrix_tables pattern above.
    Env.backend().execute(BlockMatrixMultiWrite([bm._bmir for bm in bms], writer))
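# Worked example of the path naming above: with 12 tables and prefix
# 'out/batch_', length == len('11') == 2, so the writer produces
# 'out/batch_00.mt' through 'out/batch_11.mt'. A sketch of a call (the
# tables and prefix are hypothetical):
#
# >>> write_matrix_tables([mt1, mt2], 'out/batch_', overwrite=True)  # doctest: +SKIP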
class ttable(object):
    @staticmethod
    def _from_java(jtt):
        return ttable(dtype(jtt.globalType().toString()),
                      dtype(jtt.rowType().toString()),
                      jiterable_to_list(jtt.key()))

    @staticmethod
    def _from_json(json):
        return ttable(dtype(json['global']),
                      dtype(json['row']),
                      json['row_key'])

    @typecheck_method(global_type=tstruct, row_type=tstruct, row_key=sequenceof(str))
    def __init__(self, global_type, row_type, row_key):
        self.global_type = global_type
        self.row_type = row_type
        self.row_key = row_key

    def __eq__(self, other):
        return (isinstance(other, ttable)
                and self.global_type == other.global_type
                and self.row_type == other.row_type
                and self.row_key == other.row_key)

    def __hash__(self):
        return 43 + hash(str(self))

    def __repr__(self):
        return f'ttable(global_type={self.global_type!r}, row_type={self.row_type!r}, row_key={self.row_key!r})'

    def _key_str(self):
        return ', '.join([escape_parsable(k) for k in self.row_key])

    def __str__(self):
        return f'table {{global: {self.global_type}, row: {self.row_type}, row_key: [{self._key_str()}]}}'

    def pretty(self, indent=0, increment=4):
        l = []
        l.append(' ' * indent)
        l.append('table {\n')
        indent += increment

        l.append(' ' * indent)
        l.append('global: ')
        self.global_type._pretty(l, indent, increment)
        l.append(',\n')

        l.append(' ' * indent)
        l.append('row: ')
        self.row_type._pretty(l, indent, increment)
        l.append(',\n')

        l.append(' ' * indent)
        l.append(f'row_key: [{self._key_str()}]\n')

        indent -= increment
        l.append(' ' * indent)
        l.append('}')

        return ''.join(l)

    @property
    def key_type(self):
        return self.row_type._select_fields(self.row_key)

    @property
    def value_type(self):
        return self.row_type._drop_fields(set(self.row_key))

    def _rename(self, global_map, row_map):
        return ttable(self.global_type._rename(global_map),
                      self.row_type._rename(row_map),
                      [row_map.get(k, k) for k in self.row_key])

    def row_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type, 'row': self.row_type}
        else:
            return {'global': default_value, 'row': default_value}

    def global_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type}
        else:
            return {'global': default_value}
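# Illustrative sketch of constructing a ttable directly, mirroring what
# _from_json produces (the field types here are hypothetical):
#
# >>> from hail.expr.types import tstruct, tint32, tstr
# >>> t = ttable(tstruct(), tstruct(idx=tint32, s=tstr), ['idx'])  # doctest: +SKIP
# >>> t.key_type    # struct{idx: int32}                           # doctest: +SKIP
# >>> t.value_type  # struct{s: str}                               # doctest: +SKIP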
                includes_start=True,
                includes_end=True,
                reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht


@typecheck(gene_symbols=nullable(sequenceof(str)),
           gene_ids=nullable(sequenceof(str)),
           transcript_ids=nullable(sequenceof(str)),
           verbose=bool,
           reference_genome=nullable(reference_genome_type),
           gtf_file=nullable(str))
def get_gene_intervals(gene_symbols=None, gene_ids=None, transcript_ids=None,
                       verbose=True, reference_genome=None, gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick
    filtering of a Table or MatrixTable.
class tblockmatrix(object):
    @staticmethod
    def _from_java(jtbm):
        return tblockmatrix(dtype(jtbm.elementType().toString()),
                            jiterable_to_list(jtbm.shape()),
                            jtbm.isRowVector(),
                            jtbm.blockSize())

    @staticmethod
    def _from_json(json):
        return tblockmatrix(dtype(json['element_type']),
                            json['shape'],
                            json['is_row_vector'],
                            json['block_size'])

    @typecheck_method(element_type=hail_type, shape=sequenceof(int),
                      is_row_vector=bool, block_size=int)
    def __init__(self, element_type, shape, is_row_vector, block_size):
        self.element_type = element_type
        self.shape = shape
        self.is_row_vector = is_row_vector
        self.block_size = block_size

    def __eq__(self, other):
        return isinstance(other, tblockmatrix) and \
               self.element_type == other.element_type and \
               self.shape == other.shape and \
               self.is_row_vector == other.is_row_vector and \
               self.block_size == other.block_size

    def __hash__(self):
        return 43 + hash(str(self))

    def __repr__(self):
        return f'tblockmatrix(element_type={self.element_type!r}, shape={self.shape!r}, ' \
               f'is_row_vector={self.is_row_vector!r}, block_size={self.block_size!r})'

    def __str__(self):
        # The closing brace was mistyped as ')' in the original.
        return f'blockmatrix {{element_type: {self.element_type}, shape: {self.shape}, ' \
               f'is_row_vector: {self.is_row_vector}, block_size: {self.block_size}}}'

    def pretty(self, indent=0, increment=4):
        l = []
        l.append(' ' * indent)
        l.append('blockmatrix {\n')
        indent += increment

        l.append(' ' * indent)
        l.append('element_type: ')
        self.element_type._pretty(l, indent, increment)
        l.append(',\n')

        l.append(' ' * indent)
        l.append(f'shape: [{self.shape}],\n')

        # is_row_vector and block_size are plain Python values, not Hail
        # types, so format them directly rather than via _pretty.
        l.append(' ' * indent)
        l.append(f'is_row_vector: {self.is_row_vector},\n')

        l.append(' ' * indent)
        l.append(f'block_size: {self.block_size},\n')

        indent -= increment
        l.append(' ' * indent)
        l.append('}')

        return ''.join(l)
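# Illustrative sketch of constructing a tblockmatrix directly (the values
# here are hypothetical):
#
# >>> from hail.expr.types import tfloat64
# >>> t = tblockmatrix(tfloat64, [8, 8], False, 4)  # doctest: +SKIP
# >>> str(t)  # doctest: +SKIP
# 'blockmatrix {element_type: float64, shape: [8, 8], is_row_vector: False, block_size: 4}'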
            **tm.proband_entry,
            **{phased_call_field: tm.__phased_GT[0]}
        ),
        father_entry=hl.struct(
            **tm.father_entry,
            **{phased_call_field: tm.__phased_GT[1]}
        ),
        mother_entry=hl.struct(
            **tm.mother_entry,
            **{phased_call_field: tm.__phased_GT[2]}
        )
    )


@typecheck(tm=MatrixTable,
           col_keys=sequenceof(str),
           keep_trio_cols=bool,
           keep_trio_entries=bool)
def explode_trio_matrix(tm: hl.MatrixTable,
                        col_keys: List[str] = ['s'],
                        keep_trio_cols: bool = True,
                        keep_trio_entries: bool = False) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------

    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    })


def localize(mt):
    if isinstance(mt, MatrixTable):
        return mt._localize_entries('__entries', '__cols')
    return mt


def unlocalize(mt):
    if isinstance(mt, Table):
        return mt._unlocalize_entries('__entries', '__cols', ['s'])
    return mt


@typecheck(mt=oneof(Table, MatrixTable), info_to_keep=sequenceof(str))
def transform_gvcf(mt, info_to_keep=[]) -> Table:
    """Transforms a gvcf into a sparse matrix table

    The input to this should be some result of either :func:`.import_vcf` or
    :func:`.import_gvcfs` with ``array_elements_required=False``.

    There is an assumption that this function will be called on a matrix table
    with one column (or a localized table version of the same).

    Parameters
    ----------
    mt : :obj:`Union[Table, MatrixTable]`
        The gvcf being transformed, if it is a table, then it must be a
        localized matrix table with the entries array named ``__entries``
    info_to_keep : :obj:`List[str]`
class tmatrix(object):
    @staticmethod
    def _from_java(jtt):
        return tmatrix(dtype(jtt.globalType().toString()),
                       dtype(jtt.colType().toString()),
                       jiterable_to_list(jtt.colKey()),
                       dtype(jtt.rowType().toString()),
                       jiterable_to_list(jtt.rowKey()),
                       dtype(jtt.entryType().toString()))

    @staticmethod
    def _from_json(json):
        return tmatrix(dtype(json['global']),
                       dtype(json['col']),
                       json['col_key'],
                       dtype(json['row']),
                       json['row_key'],
                       dtype(json['entry']))

    @typecheck_method(global_type=tstruct,
                      col_type=tstruct,
                      col_key=sequenceof(str),
                      row_type=tstruct,
                      row_key=sequenceof(str),
                      entry_type=tstruct)
    def __init__(self, global_type, col_type, col_key, row_type, row_key, entry_type):
        self.global_type = global_type
        self.col_type = col_type
        self.col_key = col_key
        self.row_type = row_type
        self.row_key = row_key
        self.entry_type = entry_type

    def __eq__(self, other):
        return (isinstance(other, tmatrix)
                and self.global_type == other.global_type
                and self.col_type == other.col_type
                and self.col_key == other.col_key
                and self.row_type == other.row_type
                and self.row_key == other.row_key
                and self.entry_type == other.entry_type)

    def __hash__(self):
        return 43 + hash(str(self))

    def __repr__(self):
        return f'tmatrix(global_type={self.global_type!r}, col_type={self.col_type!r}, col_key={self.col_key!r}, row_type={self.row_type!r}, row_key={self.row_key!r}, entry_type={self.entry_type!r})'

    def _row_key_str(self):
        return ', '.join([escape_parsable(k) for k in self.row_key])

    def _col_key_str(self):
        return ', '.join([escape_parsable(k) for k in self.col_key])

    def __str__(self):
        return f'matrix {{global: {self.global_type}, col: {self.col_type}, col_key: {self._col_key_str()}, row: {self.row_type}, row_key: [{self._row_key_str()}], entry: {self.entry_type}}}'

    def pretty(self, indent=0, increment=4):
        b = []
        b.append(' ' * indent)
        b.append('matrix {\n')
        indent += increment

        b.append(' ' * indent)
        b.append('global: ')
        self.global_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append('row: ')
        self.row_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append(f'row_key: [{self._row_key_str()}],\n')

        b.append(' ' * indent)
        b.append('col: ')
        self.col_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append(f'col_key: [{self._col_key_str()}],\n')

        b.append(' ' * indent)
        b.append('entry: ')
        self.entry_type._pretty(b, indent, increment)
        b.append('\n')

        indent -= increment
        b.append(' ' * indent)
        b.append('}')

        return ''.join(b)

    @property
    def col_key_type(self):
        return self.col_type._select_fields(self.col_key)

    @property
    def col_value_type(self):
        return self.col_type._drop_fields(set(self.col_key))

    @property
    def row_key_type(self):
        return self.row_type._select_fields(self.row_key)

    @property
    def row_value_type(self):
        return self.row_type._drop_fields(set(self.row_key))

    def _rename(self, global_map, col_map, row_map, entry_map):
        return tmatrix(self.global_type._rename(global_map),
                       self.col_type._rename(col_map),
                       [col_map.get(k, k) for k in self.col_key],
                       self.row_type._rename(row_map),
                       [row_map.get(k, k) for k in self.row_key],
                       self.entry_type._rename(entry_map))

    def global_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type}
        else:
            return {'global': default_value}

    def row_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type, 'va': self.row_type}
        else:
            return {'global': default_value, 'va': default_value}

    def col_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type, 'sa': self.col_type}
        else:
            return {'global': default_value, 'sa': default_value}

    def entry_env(self, default_value=None):
        if default_value is None:
            return {'global': self.global_type,
                    'va': self.row_type,
                    'sa': self.col_type,
                    'g': self.entry_type}
        else:
            return {'global': default_value,
                    'va': default_value,
                    'sa': default_value,
                    'g': default_value}
class ReferenceGenome(object):
    """An object that represents a `reference genome
    <https://en.wikipedia.org/wiki/Reference_genome>`__.

    Examples
    --------

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [("X", 60001, 2699521)]
    >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)

    Notes
    -----
    Hail comes with predefined reference genomes (case sensitive!):

     - GRCh37, Genome Reference Consortium Human Build 37
     - GRCh38, Genome Reference Consortium Human Build 38
     - GRCm38, Genome Reference Consortium Mouse Build 38
     - CanFam3, Canis lupus familiaris (dog)

    You can access these reference genome objects using :func:`~hail.get_reference`:

    >>> rg = hl.get_reference('GRCh37')
    >>> rg = hl.get_reference('GRCh38')
    >>> rg = hl.get_reference('GRCm38')
    >>> rg = hl.get_reference('CanFam3')

    Note that constructing a new reference genome, either by using the class
    constructor or by using `read` will add the reference genome to the list
    of known references; it is possible to access the reference genome using
    :func:`~hail.get_reference` anytime afterwards.

    Note
    ----
    Reference genome names must be unique. It is not possible to overwrite the
    built-in reference genomes.

    Parameters
    ----------
    name : :class:`str`
        Name of reference. Must be unique and NOT one of Hail's
        predefined references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``,
        ``'CanFam3'`` and ``'default'``.
    contigs : :obj:`list` of :class:`str`
        Contig names.
    lengths : :obj:`dict` of :class:`str` to :obj:`int`
        Dict of contig names to contig lengths.
    x_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)
    """

    _references = {}

    @classmethod
    def _from_config(cls, config, _builtin=False):
        def par_tuple(p):
            assert p['start']['contig'] == p['end']['contig']
            return (p['start']['contig'], p['start']['position'], p['end']['position'])

        contigs = config['contigs']
        return ReferenceGenome(config['name'],
                               [c['name'] for c in contigs],
                               {c['name']: c['length'] for c in contigs},
                               config['xContigs'],
                               config['yContigs'],
                               config['mtContigs'],
                               [par_tuple(p) for p in config['par']],
                               _builtin)

    @typecheck_method(name=str,
                      contigs=sequenceof(str),
                      lengths=dictof(str, int),
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)),
                      _builtin=bool)
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[],
                 mt_contigs=[], par=[], _builtin=False):
        super(ReferenceGenome, self).__init__()

        contigs = wrap_to_list(contigs)
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        self._config = {
            'name': name,
            'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': [{'start': {'contig': c, 'position': s},
                     'end': {'contig': c, 'position': e}} for (c, s, e) in par]
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
                     for (c, s, e) in par]
        self._global_positions = None

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        self._sequence_files = None
        self._liftovers = dict()

    def __str__(self):
        return self._config['name']

    def __repr__(self):
        return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple)

    def __eq__(self, other):
        return isinstance(other, ReferenceGenome) and self._config == other._config

    def __hash__(self):
        return hash(self.name)

    @property
    def name(self):
        """Name of reference genome.

        Returns
        -------
        :class:`str`
        """
        return self._config['name']

    @property
    def contigs(self):
        """Contig names.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        Returns
        -------
        :obj:`dict` of :class:`str` to :obj:`int`
        """
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['xContigs']

    @property
    def y_contigs(self):
        """Y contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['yContigs']

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['mtContigs']

    @property
    def par(self):
        """Pseudoautosomal regions.

        Returns
        -------
        :obj:`list` of :class:`.Interval`
        """
        return self._par

    @typecheck_method(contig=str)
    def contig_length(self, contig):
        """Contig length.

        Parameters
        ----------
        contig : :class:`str`
            Contig name.

        Returns
        -------
        :obj:`int`
            Length of contig.
""" if contig in self.lengths: return self.lengths[contig] else: raise KeyError( "Contig `{}' is not in reference genome.".format(contig)) @typecheck_method(contig=str) def _contig_global_position(self, contig): if self._global_positions is None: gp = {} lengths = self._lengths x = 0 for c in self.contigs: gp[c] = x x += lengths[c] self._global_positions = gp return self._global_positions[contig] @classmethod @typecheck_method(path=str) def read(cls, path): """Load reference genome from a JSON file. Notes ----- The JSON file must have the following format: .. code-block:: text {"name": "my_reference_genome", "contigs": [{"name": "1", "length": 10000000}, {"name": "2", "length": 20000000}, {"name": "X", "length": 19856300}, {"name": "Y", "length": 78140000}, {"name": "MT", "length": 532}], "xContigs": ["X"], "yContigs": ["Y"], "mtContigs": ["MT"], "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}}, {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}] } `name` must be unique and not overlap with Hail's pre-instantiated references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'``, and ``'default'``. The contig names in `xContigs`, `yContigs`, and `mtContigs` must be present in `contigs`. The intervals listed in `par` must have contigs in either `xContigs` or `yContigs` and must have positions between 0 and the contig length given in `contigs`. Parameters ---------- path : :class:`str` Path to JSON file. Returns ------- :class:`.ReferenceGenome` """ with hl.hadoop_open(path) as f: return ReferenceGenome._from_config(json.load(f)) @typecheck_method(output=str) def write(self, output): """"Write this reference genome to a file in JSON format. Examples -------- >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200}) >>> my_rg.write(f"output/new_reference.json") Notes ----- Use :meth:`~hail.genetics.ReferenceGenome.read` to reimport the exported reference genome in a new HailContext session. Parameters ---------- output : :class:`str` Path of JSON file to write. """ with hl.utils.hadoop_open(output, 'w') as f: json.dump(self._config, f) @typecheck_method(fasta_file=str, index_file=nullable(str)) def add_sequence(self, fasta_file, index_file=None): """Load the reference sequence from a FASTA file. Examples -------- Access the GRCh37 reference genome using :func:`~hail.get_reference`: >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP Add a sequence file: >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz', ... 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP Add a sequence file with the default index location: >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP Notes ----- This method can only be run once per reference genome. Use :meth:`~has_sequence` to test whether a sequence is loaded. FASTA and index files are hosted on google cloud for some of Hail's built-in references: **GRCh37** - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz`` - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai`` **GRCh38** - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz`` - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai`` Public download links are available `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__. Parameters ---------- fasta_file : :class:`str` Path to FASTA file. 
            Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :class:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, replace
            the fasta_file's extension with `fai`.
        """
        if index_file is None:
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        self._sequence_files = (fasta_file, index_file)

    def has_sequence(self):
        """True if the reference sequence has been loaded.

        Returns
        -------
        :obj:`bool`
        """
        return self._sequence_files is not None

    def remove_sequence(self):
        """Remove the reference sequence."""
        self._sequence_files = None
        Env.backend().remove_sequence(self.name)

    @classmethod
    @typecheck_method(name=str,
                      fasta_file=str,
                      index_file=str,
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)))
    def from_fasta_file(cls, name, fasta_file, index_file,
                        x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        """Create reference genome from a FASTA file.

        Parameters
        ----------
        name: :class:`str`
            Name for new reference genome.
        fasta_file : :class:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :class:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        par_strings = ["{}:{}-{}".format(contig, start, end)
                       for (contig, start, end) in par]
        Env.backend().from_fasta_file(name, fasta_file, index_file,
                                      x_contigs, y_contigs, mt_contigs, par_strings)

        rg = ReferenceGenome._from_config(Env.backend().get_reference(name), _builtin=True)
        rg._sequence_files = (fasta_file, index_file)
        return rg

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def has_liftover(self, dest_reference_genome):
        """``True`` if a liftover chain file is available from this reference
        genome to the destination reference.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`

        Returns
        -------
        :obj:`bool`
        """
        return dest_reference_genome.name in self._liftovers

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
        """
        if dest_reference_genome.name in self._liftovers:
            del self._liftovers[dest_reference_genome.name]
            Env.backend().remove_liftover(self.name, dest_reference_genome.name)

    @typecheck_method(chain_file=str, dest_reference_genome=reference_genome_type)
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`~hail.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_liftover` to test whether a chain file has been registered.
        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :class:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.
        """
        # Check for an existing liftover before mutating backend state; the
        # original raised only after the backend call had already registered
        # the chain file.
        if dest_reference_genome.name in self._liftovers:
            raise KeyError(
                f"Liftover already exists from {self.name} to {dest_reference_genome.name}.")
        Env.backend().add_liftover(self.name, chain_file, dest_reference_genome.name)
        self._liftovers[dest_reference_genome.name] = chain_file
from functools import reduce

import hail as hl
from hail.expr.functions import _ndarray
from hail.expr.functions import array as aarray
from hail.expr.types import HailType, tfloat64, ttuple, tndarray
from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof
from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple, expr_any,
                                   expr_array, expr_ndarray, expr_numeric,
                                   Int64Expression, cast_expr, construct_expr)
from hail.expr.expressions.typed_expressions import NDArrayNumericExpression
from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat, NDArraySVD, Apply

tsequenceof_nd = oneof(sequenceof(expr_ndarray()), expr_array(expr_ndarray()))
shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple())


def array(input_array, dtype=None):
    """Construct an :class:`.NDArrayExpression`

    Examples
    --------

    >>> hl.eval(hl.nd.array([1, 2, 3, 4]))
    array([1, 2, 3, 4], dtype=int32)

    >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]]))
    array([[1, 2, 3],
           [4, 5, 6]], dtype=int32)
        _col_val=hl.array([hl.array([field, ht[field]]) for field in fields]))
    ht = ht.drop(*fields)
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)


@typecheck(ht=Table, field=str, value=str, key=nullable(oneof(str, sequenceof(str))))
def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in R's
    `tidyr` package. This is a way to turn "long" format data into "wide"
    format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to
    the ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row
    field for each unique value of ``field``, where the row field values are
    given by the corresponding ``value`` in the original ``ht``.