class ReferenceGenome(object):
    """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__.

    Examples
    --------

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [("X", 60001, 2699521)]
    >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)

    Notes
    -----
    Hail comes with predefined reference genomes (case sensitive!):

     - GRCh37
     - GRCh38
     - GRCm38

    You can access these reference genome objects using :func:`.get_reference`:

    >>> rg = hl.get_reference('GRCh37')

    Note that constructing a new reference genome, either by using the class
    constructor or by using :meth:`.ReferenceGenome.read` will add the
    reference genome to the list of known references; it is possible to access
    the reference genome using :func:`.get_reference` anytime afterwards.

    Note
    ----
    Reference genome names must be unique. It is not possible to overwrite the
    built-in reference genomes.

    Parameters
    ----------
    name : :obj:`str`
        Name of reference. Must be unique and NOT one of Hail's
        predefined references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and
        ``'default'``.
    contigs : :obj:`list` of :obj:`str`
        Contig names.
    lengths : :obj:`dict` of :obj:`str` to :obj:`int`
        Dict of contig names to contig lengths.
    x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)
    """

    # Registry of every reference genome constructed in this process, by name.
    _references = {}

    @classmethod
    def _from_config(cls, config, _builtin=False):
        """Build a reference genome from a config dict.

        The dict is read with the keys ``name``, ``contigs``, ``xContigs``,
        ``yContigs``, ``mtContigs`` and ``par`` (the same layout that
        :meth:`write` emits).
        """
        def par_tuple(p):
            # A PAR interval is expected to start and end on the same contig.
            assert p['start']['contig'] == p['end']['contig']
            return (p['start']['contig'], p['start']['position'], p['end']['position'])

        contigs = config['contigs']
        return ReferenceGenome(config['name'],
                               [c['name'] for c in contigs],
                               {c['name']: c['length'] for c in contigs},
                               config['xContigs'],
                               config['yContigs'],
                               config['mtContigs'],
                               [par_tuple(p) for p in config['par']],
                               _builtin)

    @typecheck_method(name=str,
                      contigs=sequenceof(str),
                      lengths=dictof(str, int),
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)),
                      _builtin=bool)
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
        super(ReferenceGenome, self).__init__()

        # Copy the list arguments: they are stored on the instance and handed
        # out by properties, so aliasing the shared default lists (or the
        # caller's lists) would let a later mutation leak across instances.
        contigs = list(wrap_to_list(contigs))
        x_contigs = list(wrap_to_list(x_contigs))
        y_contigs = list(wrap_to_list(y_contigs))
        mt_contigs = list(wrap_to_list(mt_contigs))
        par = list(par)

        missing = [c for c in contigs if c not in lengths]
        if missing:
            raise ValueError(
                "No length given for contig(s) {} of reference genome '{}'.".format(missing, name))

        self._config = {
            'name': name,
            # Emit contigs in the order of `contigs` (not dict order of
            # `lengths`) so the backend agrees with `self._contigs`, which is
            # what `_contig_global_position` accumulates over.
            'contigs': [{'name': c, 'length': lengths[c]} for c in contigs],
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': [{'start': {'contig': c, 'position': s}, 'end': {'contig': c, 'position': e}}
                    for (c, s, e) in par]
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self)) for (c, s, e) in par]
        # Lazily computed by `_contig_global_position`.
        self._global_positions = None

        ReferenceGenome._references[name] = self

        # Built-in references are not re-registered with the backend.
        if not _builtin:
            Env.backend().add_reference(self._config)

        hl.ir.register_reference_genome_functions(name)

        self._has_sequence = False
        self._liftovers = set()

    def __str__(self):
        return self._config['name']

    def __repr__(self):
        return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple)

    def __eq__(self, other):
        return isinstance(other, ReferenceGenome) and self._config == other._config

    def __hash__(self):
        return hash(self.name)

    @property
    def name(self):
        """Name of reference genome.

        Returns
        -------
        :obj:`str`
        """
        return self._config['name']

    @property
    def contigs(self):
        """Contig names.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        Returns
        -------
        :obj:`dict` of :obj:`str` to :obj:`int`
        """
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        return self._config['xContigs']

    @property
    def y_contigs(self):
        """Y contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        return self._config['yContigs']

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        return self._config['mtContigs']

    @property
    def par(self):
        """Pseudoautosomal regions.

        Returns
        -------
        :obj:`list` of :class:`.Interval`
        """
        return self._par

    @typecheck_method(contig=str)
    def contig_length(self, contig):
        """Contig length.

        Parameters
        ----------
        contig : :obj:`str`
            Contig name.

        Returns
        -------
        :obj:`int`
            Length of contig.

        Raises
        ------
        KeyError
            If `contig` is not a contig of this reference genome.
        """
        if contig in self.lengths:
            return self.lengths[contig]
        else:
            raise KeyError("Contig `{}' is not in reference genome.".format(contig))

    @typecheck_method(contig=str)
    def _contig_global_position(self, contig):
        """Return the summed length of all contigs preceding `contig`.

        The offsets are computed once, in `self.contigs` order, and cached.
        """
        if self._global_positions is None:
            gp = {}
            lengths = self._lengths
            x = 0
            for c in self.contigs:
                gp[c] = x
                x += lengths[c]
            self._global_positions = gp
        return self._global_positions[contig]

    @classmethod
    @typecheck_method(path=str)
    def read(cls, path):
        """Load reference genome from a JSON file.

        Notes
        -----

        The JSON file must have the following format:

        .. code-block:: text

            {"name": "my_reference_genome",
             "contigs": [{"name": "1", "length": 10000000},
                         {"name": "2", "length": 20000000},
                         {"name": "X", "length": 19856300},
                         {"name": "Y", "length": 78140000},
                         {"name": "MT", "length": 532}],
             "xContigs": ["X"],
             "yContigs": ["Y"],
             "mtContigs": ["MT"],
             "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                     {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
            }

        `name` must be unique and not overlap with Hail's pre-instantiated
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and ``'default'``.
        The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
        present in `contigs`. The intervals listed in `par` must have contigs
        in either `xContigs` or `yContigs` and must have positions between 0
        and the contig length given in `contigs`.

        Parameters
        ----------
        path : :obj:`str`
            Path to JSON file.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        with hl.hadoop_open(path) as f:
            return ReferenceGenome._from_config(json.load(f))

    @typecheck_method(output=str)
    def write(self, output):
        """Write this reference genome to a file in JSON format.

        Examples
        --------

        >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200})
        >>> my_rg.write("output/new_reference.json")

        Notes
        -----

        Use :class:`~hail.ReferenceGenome.read` to reimport the exported
        reference genome in a new HailContext session.

        Parameters
        ----------
        output : :obj:`str`
            Path of JSON file to write.
        """
        with hl.utils.hadoop_open(output, 'w') as f:
            json.dump(self._config, f)

    @typecheck_method(fasta_file=str,
                      index_file=nullable(str))
    def add_sequence(self, fasta_file, index_file=None):
        """Load the reference sequence from a FASTA file.

        Examples
        --------
        Access the GRCh37 reference genome using :func:`.get_reference`:

        >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

        Add a sequence file:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
        ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

        Add a sequence file with the default index location:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP

        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on google cloud for some of Hail's
        built-in references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :obj:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, replace
            the fasta_file's extension with `fai`.
        """
        if index_file is None:
            # Swap the final extension (e.g. ".gz") for ".fai".
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        self._has_sequence = True

    def has_sequence(self):
        """True if the reference sequence has been loaded.

        Returns
        -------
        :obj:`bool`
        """
        return self._has_sequence

    def remove_sequence(self):
        """Remove the reference sequence."""
        # Clear the flag only after the backend call succeeded, mirroring
        # `add_sequence`, so a backend failure does not leave the flag wrong.
        Env.backend().remove_sequence(self.name)
        self._has_sequence = False

    @classmethod
    @typecheck_method(name=str,
                      fasta_file=str,
                      index_file=str,
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)))
    def from_fasta_file(cls, name, fasta_file, index_file,
                        x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        """Create reference genome from a FASTA file.

        Parameters
        ----------
        name : :obj:`str`
            Name for new reference genome.
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        par_strings = ["{}:{}-{}".format(contig, start, end) for (contig, start, end) in par]
        Env.backend().from_fasta_file(name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par_strings)

        # The backend call above already created the reference, so build the
        # Python object with `_builtin=True` to skip registering it again.
        rg = ReferenceGenome._from_config(Env.backend().get_reference(name), _builtin=True)
        rg._has_sequence = True
        return rg

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def has_liftover(self, dest_reference_genome):
        """``True`` if a liftover chain file is available from this reference
        genome to the destination reference.

        Parameters
        ----------
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`

        Returns
        -------
        :obj:`bool`
        """
        return dest_reference_genome.name in self._liftovers

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Does nothing if no liftover to `dest_reference_genome` is registered.

        Parameters
        ----------
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        """
        if dest_reference_genome.name in self._liftovers:
            self._liftovers.remove(dest_reference_genome.name)
            Env.backend().remove_liftover(self.name, dest_reference_genome.name)

    @typecheck_method(chain_file=str,
                      dest_reference_genome=reference_genome_type)
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_liftover` to test whether a chain file has been registered.

        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :obj:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.
        """
        Env.backend().add_liftover(self.name, chain_file, dest_reference_genome.name)
        self._liftovers.add(dest_reference_genome.name)
        hl.ir.register_liftover_functions(self.name, dest_reference_genome.name)
class tndarray(HailType):
    """Hail type for n-dimensional arrays.

    .. include:: _templates/experimental.rst

    In Python, these are represented as NumPy :obj:`ndarray`.

    Notes
    -----

    NDArrays contain elements of only one type, which is parameterized by
    `element_type`.

    Parameters
    ----------
    element_type : :class:`.HailType`
        Element type of array.
    ndim : int32
        Number of dimensions.

    See Also
    --------
    :class:`.NDArrayExpression`, :func:`.ndarray`
    """

    @typecheck_method(element_type=hail_type, ndim=oneof(NatBase, int))
    def __init__(self, element_type, ndim):
        self._element_type = element_type
        # Plain ints are wrapped so `_ndim` is always a Nat object.
        self._ndim = NatLiteral(ndim) if isinstance(ndim, int) else ndim
        super(tndarray, self).__init__()

    @property
    def element_type(self):
        """NDArray element type.

        Returns
        -------
        :class:`.HailType`
            Element type.
        """
        return self._element_type

    @property
    def ndim(self):
        """NDArray number of dimensions.

        Returns
        -------
        :obj:`int`
            Number of dimensions.
        """
        assert isinstance(
            self._ndim, NatLiteral
        ), "tndarray must be realized with a concrete number of dimensions"
        return self._ndim.n

    def _traverse(self, obj, f):
        # Visit the array itself, then (if `f` says to continue) each element.
        if f(self, obj):
            for elt in np.nditer(obj, ['zerosize_ok']):
                self.element_type._traverse(elt.item(), f)

    def _typecheck_one_level(self, annotation):
        if annotation is not None and not isinstance(annotation, np.ndarray):
            raise TypeError(
                "type 'ndarray' expected Python 'numpy.ndarray', but found type '%s'" % type(annotation))

    def __str__(self):
        return "ndarray<{}, {}>".format(self.element_type, self.ndim)

    def _eq(self, other):
        # Two ndarray types are equal only when both the element type and the
        # number of dimensions agree; comparing the element type alone would
        # make e.g. a vector type equal to a matrix type.
        return (isinstance(other, tndarray)
                and self.element_type == other.element_type
                and self.ndim == other.ndim)

    def _pretty(self, l, indent, increment):
        l.append('ndarray<')
        self._element_type._pretty(l, indent, increment)
        l.append(', ')
        l.append(str(self.ndim))
        l.append('>')

    def _parsable_string(self):
        return f'NDArray[{self._element_type._parsable_string()},{self.ndim}]'

    def _convert_from_json(self, x):
        """Rebuild a NumPy array from a dict with ``shape``, ``strides`` and ``data``."""
        np_type = self.element_type.to_numpy()
        return np.ndarray(shape=x['shape'],
                          buffer=np.array(x['data'], dtype=np_type),
                          strides=x['strides'],
                          dtype=np_type)

    def _convert_to_json(self, x):
        """Serialize a NumPy array to a dict with ``shape``, ``strides`` and ``data``."""
        # Data is emitted in column-major ("F") order, and the byte strides
        # below are built to match: the first axis steps by one item, and each
        # later axis steps by the product of the preceding dimension sizes.
        data = x.flatten("F").tolist()

        strides = []
        axis_one_step_byte_size = x.itemsize
        for dimension_size in x.shape:
            strides.append(axis_one_step_byte_size)
            # A zero-length dimension must not collapse later strides to 0.
            axis_one_step_byte_size *= (dimension_size if dimension_size > 0 else 1)

        json_dict = {"shape": x.shape, "strides": strides, "data": data}
        return json_dict

    def clear(self):
        self._element_type.clear()
        self._ndim.clear()

    def unify(self, t):
        return isinstance(t, tndarray) and \
            self._element_type.unify(t._element_type) and \
            self._ndim.unify(t._ndim)

    def subst(self):
        return tndarray(self._element_type.subst(), self._ndim.subst())

    def _get_context(self):
        return self.element_type.get_context()
class Locus(object):
    """An object that represents a location in the genome.

    Parameters
    ----------
    contig : :class:`str`
        Chromosome identifier.
    position : :obj:`int`
        Chromosomal position (1-indexed).
    reference_genome : :class:`str` or :class:`.ReferenceGenome`
        Reference genome to use.

    Note
    ----
    This object refers to the Python value returned by taking or collecting
    Hail expressions, e.g. ``mt.locus.take(5)``. This is rare; it is much
    more common to manipulate the :class:`.LocusExpression` object, which is
    constructed using the following functions:

     - :func:`.locus`
     - :func:`.parse_locus`
     - :func:`.locus_from_global_position`
    """

    @typecheck_method(contig=oneof(str, int),
                      position=int,
                      reference_genome=reference_genome_type)
    def __init__(self, contig, position, reference_genome='default'):
        # Integer contigs are accepted for convenience and stored as strings.
        if isinstance(contig, int):
            contig = str(contig)

        self._contig = contig
        self._position = position
        self._rg = reference_genome

    def __str__(self):
        return f'{self._contig}:{self._position}'

    def __repr__(self):
        return 'Locus(contig=%s, position=%s, reference_genome=%s)' % (
            self.contig, self.position, self._rg)

    def __eq__(self, other):
        return (isinstance(other, Locus)
                and self._contig == other._contig
                and self._position == other._position
                and self._rg == other._rg)

    def __hash__(self):
        return hash(self._contig) ^ hash(self._position) ^ hash(self._rg)

    @classmethod
    @typecheck_method(string=str,
                      reference_genome=reference_genome_type)
    def parse(cls, string, reference_genome='default'):
        """Parses a locus object from a CHR:POS string.

        **Examples**

        >>> l1 = hl.Locus.parse('1:101230')
        >>> l2 = hl.Locus.parse('X:4201230')

        The position may also be the literal ``end`` (case-insensitive), which
        resolves to the length of the contig in `reference_genome`.

        :param str string: String to parse.
        :param reference_genome: Reference genome to use. Default is :func:`~hail.default_reference`.
        :type reference_genome: :class:`str` or :class:`.ReferenceGenome`

        :rtype: :class:`.Locus`
        """
        # Split on the LAST colon only: contig names may themselves contain
        # colons, and the position never does.
        contig, pos = string.rsplit(':', 1)
        if pos.lower() == 'end':
            pos = reference_genome.contig_length(contig)
        else:
            pos = int(pos)
        return Locus(contig, pos, reference_genome)

    @property
    def contig(self):
        """
        Chromosome identifier.

        :rtype: str
        """
        return self._contig

    @property
    def position(self):
        """
        Chromosomal position (1-based).

        :rtype: int
        """
        return self._position

    @property
    def reference_genome(self):
        """Reference genome.

        :return: :class:`.ReferenceGenome`
        """
        return self._rg
}) def localize(mt): if isinstance(mt, MatrixTable): return mt._localize_entries('__entries', '__cols') return mt def unlocalize(mt): if isinstance(mt, Table): return mt._unlocalize_entries('__entries', '__cols', ['s']) return mt @typecheck(mt=oneof(Table, MatrixTable), info_to_keep=sequenceof(str)) def transform_gvcf(mt, info_to_keep=[]) -> Table: """Transforms a gvcf into a sparse matrix table The input to this should be some result of either :func:`.import_vcf` or :func:`.import_gvcfs` with ``array_elements_required=False``. There is an assumption that this function will be called on a matrix table with one column (or a localized table version of the same). Parameters ---------- mt : :obj:`Union[Table, MatrixTable]` The gvcf being transformed, if it is a table, then it must be a localized matrix table with the entries array named ``__entries`` info_to_keep : :obj:`List[str]`
class tstruct(HailType, Mapping):
    """Hail type for structured groups of heterogeneous fields.

    In Python, these are represented as :class:`.Struct`.

    Parameters
    ----------
    field_types : keyword args of :class:`.HailType`
        Fields.

    See Also
    --------
    :class:`.StructExpression`, :class:`.Struct`
    """

    @typecheck_method(field_types=hail_type)
    def __init__(self, **field_types):
        self._field_types = field_types
        # Field names in declaration order; used for positional indexing.
        self._fields = tuple(field_types)
        super(tstruct, self).__init__()

    @property
    def types(self):
        """Struct field types.

        Returns
        -------
        :obj:`tuple` of :class:`.HailType`
        """
        return tuple(self._field_types.values())

    @property
    def fields(self):
        """Struct field names.

        Returns
        -------
        :obj:`tuple` of :obj:`str`
            Tuple of struct field names.
        """
        return self._fields

    def _traverse(self, obj, f):
        # Visit the struct itself, then (if `f` says to continue) each field.
        if f(self, obj):
            for k, v in obj.items():
                t = self[k]
                t._traverse(v, f)

    def _typecheck_one_level(self, annotation):
        # Only `None` (missing) is exempt. Testing truthiness here would let
        # falsy non-mapping values such as 0, '' or [] pass unchecked.
        if annotation is not None:
            if isinstance(annotation, Mapping):
                s = set(self)
                for f in annotation:
                    if f not in s:
                        raise TypeError(
                            "type '%s' expected fields '%s', but found fields '%s'" %
                            (self, list(self), list(annotation)))
            else:
                raise TypeError(
                    "type 'struct' expected type Mapping (e.g. dict or hail.utils.Struct), but found '%s'" %
                    type(annotation))

    @typecheck_method(item=oneof(int, str))
    def __getitem__(self, item):
        # Integer items index fields by position; strings index by name.
        if not isinstance(item, str):
            item = self._fields[item]
        return self._field_types[item]

    def __iter__(self):
        return iter(self._field_types)

    def __len__(self):
        return len(self._fields)

    def __str__(self):
        return "struct{{{}}}".format(', '.join(
            '{}: {}'.format(escape_parsable(f), str(t)) for f, t in self.items()))

    def _eq(self, other):
        return (isinstance(other, tstruct)
                and self._fields == other._fields
                and all(self[f] == other[f] for f in self._fields))

    def _pretty(self, l, indent, increment):
        if not self._fields:
            l.append('struct {}')
            return

        pre_indent = indent
        indent += increment
        l.append('struct {')
        for i, (f, t) in enumerate(self.items()):
            if i > 0:
                l.append(', ')
            l.append('\n')
            l.append(' ' * indent)
            l.append('{}: '.format(escape_parsable(f)))
            t._pretty(l, indent, increment)
        l.append('\n')
        l.append(' ' * pre_indent)
        l.append('}')

    def _parsable_string(self):
        return "Struct{{{}}}".format(','.join(
            '{}:{}'.format(escape_parsable(f), t._parsable_string()) for f, t in self.items()))

    def _convert_from_json(self, x):
        from hail.utils import Struct
        return Struct(
            **{f: t._convert_from_json_na(x.get(f)) for f, t in self.items()})

    def _convert_to_json(self, x):
        return {f: t._convert_to_json_na(x[f]) for f, t in self.items()}

    def _is_prefix_of(self, other):
        # Compares field TYPES positionally; field names are not compared.
        return (isinstance(other, tstruct)
                and len(self._fields) <= len(other._fields)
                and all(x == y for x, y in zip(self._field_types.values(),
                                               other._field_types.values())))

    def _concat(self, other):
        new_field_types = {}
        new_field_types.update(self._field_types)
        new_field_types.update(other._field_types)
        return tstruct(**new_field_types)

    def _insert(self, path, t):
        """Return the type obtained by placing `t` at the nested field `path`."""
        if not path:
            return t

        key = path[0]
        keyt = self.get(key)
        # A missing, empty or non-struct field is replaced by an empty struct
        # before recursing into the rest of the path.
        if not (keyt and isinstance(keyt, tstruct)):
            keyt = tstruct()
        return self._insert_fields(**{key: keyt._insert(path[1:], t)})

    def _insert_field(self, field, typ):
        return self._insert_fields(**{field: typ})

    def _insert_fields(self, **new_fields):
        new_field_types = {}
        new_field_types.update(self._field_types)
        new_field_types.update(new_fields)
        return tstruct(**new_field_types)

    def _drop_fields(self, fields):
        return tstruct(**{f: t for f, t in self.items() if f not in fields})

    def _select_fields(self, fields):
        return tstruct(**{f: self[f] for f in fields})

    def _index_path(self, path):
        t = self
        for p in path:
            t = t[p]
        return t

    def _rename(self, map):
        """Return a struct type with fields renamed according to `map`.

        Fields absent from `map` keep their name. Raises :class:`ValueError`
        if two fields would end up with the same name.
        """
        seen = {}
        new_field_types = {}

        for f0, t in self.items():
            f = map.get(f0, f0)
            if f in seen:
                raise ValueError(
                    "Cannot rename two fields to the same name: attempted to rename {} and {} both to {}"
                    .format(repr(seen[f]), repr(f0), repr(f)))
            else:
                seen[f] = f0
                new_field_types[f] = t

        return tstruct(**new_field_types)

    def unify(self, t):
        if not (isinstance(t, tstruct) and len(self) == len(t)):
            return False
        for (f1, t1), (f2, t2) in zip(self.items(), t.items()):
            if not (f1 == f2 and t1.unify(t2)):
                return False
        return True

    def subst(self):
        return tstruct(**{f: t.subst() for f, t in self.items()})

    def clear(self):
        for t in self.values():
            t.clear()

    def _get_context(self):
        return HailTypeContext.union(*self.values())
y_contigs, mt_contigs, par)) def _init_from_java(self, jrep): self._jrep = jrep @classmethod def _from_java(cls, jrep): gr = ReferenceGenome.__new__(cls) gr._init_from_java(jrep) gr._name = None gr._contigs = None gr._lengths = None gr._x_contigs = None gr._y_contigs = None gr._mt_contigs = None gr._par = None gr._par_tuple = None super(ReferenceGenome, gr).__init__() ReferenceGenome._references[gr.name] = gr return gr def _check_locus(self, l_jrep): self._jrep.checkLocus(l_jrep) def _check_interval(self, interval_jrep): self._jrep.checkInterval(interval_jrep) reference_genome_type = oneof( transformed((str, lambda x: hl.get_reference(x))), ReferenceGenome)
import hail as hl from hail.expr.expressions import expr_float64, expr_numeric, analyze from hail.typecheck import typecheck, oneof, sequenceof, nullable from hail.table import Table from hail.matrixtable import MatrixTable from hail.utils import wrap_to_list, new_temp_file import numpy as np @typecheck(weight_expr=expr_float64, ld_score_expr=expr_numeric, chi_sq_exprs=oneof(expr_float64, sequenceof(expr_float64)), n_samples_exprs=oneof(expr_numeric, sequenceof(expr_numeric)), n_blocks=int, two_step_threshold=int, n_reference_panel_variants=nullable(int)) def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model:
start : int or :class:`.Expression` of type :py:data:`.tint32` Start of range. stop : int or :class:`.Expression` of type :py:data:`.tint32` End of range. step : int or :class:`.Expression` of type :py:data:`.tint32` Step of range. Returns ------- :class:`.NDArrayNumericExpression` A 1-dimensional ndarray from `start` to `stop` by `step`. """ return array(hl.range(start, stop, step)) @typecheck(shape=oneof(expr_int64, tupleof(expr_int64), expr_tuple()), value=expr_any, dtype=nullable(HailType)) def full(shape, value, dtype=None): """Creates a hail :class:`.NDArrayNumericExpression` full of the specified value. Examples -------- Create a 5 by 7 NDArray of type :py:data:`.tfloat64` 9s. >>> hl.nd.full((5, 7), 9) It is possible to specify a type other than :py:data:`.tfloat64` with the `dtype` argument. >>> hl.nd.full((5, 7), 9, dtype=hl.tint32)
import hail as hl from hail.linalg import BlockMatrix from hail.typecheck import typecheck, nullable, sequenceof, oneof from hail.expr.expressions import expr_float64, expr_numeric, expr_locus from hail.utils import new_temp_file, wrap_to_list @typecheck(entry_expr=expr_float64, locus_expr=expr_locus(), radius=oneof(int, float), coord_expr=nullable(expr_float64), annotation_exprs=nullable( oneof(expr_numeric, sequenceof(expr_numeric))), block_size=nullable(int)) def ld_score(entry_expr, locus_expr, radius, coord_expr=None, annotation_exprs=None, block_size=None) -> hl.Table: """Calculate LD scores. Example ------- >>> # Load genetic data into MatrixTable >>> mt = hl.import_plink(bed='data/ldsc.bed', ... bim='data/ldsc.bim', ... fam='data/ldsc.fam') >>> # Create locus-keyed Table with numeric variant annotations
""" import hail as hl from hail.expr.expressions import expr_int32, expr_int64, expr_float32, expr_float64 from hail.typecheck import typecheck, oneof, nullable from hail.matrixtable import MatrixTable import re from datetime import datetime, timedelta from hail.utils.java import Env import numpy as np import pandas as pd import os @typecheck(mt=MatrixTable, genotype=oneof(expr_int32, expr_int64, expr_float32, expr_float64), h2=oneof(nullable(float), nullable(int)), pi=oneof(float, int), is_annot_inf=bool, annot_coef_dict=nullable(dict), annot_regex=nullable(str), h2_normalize=bool, is_popstrat=bool, cov_coef_dict=nullable(dict), cov_regex=nullable(str), path_to_save=nullable(str)) def simulate_phenotypes(mt, genotype, h2=None, pi=1, is_annot_inf=False,
from functools import reduce import hail as hl from hail.expr.functions import _ndarray from hail.expr.functions import array as aarray from hail.expr.types import HailType, tfloat64, tfloat32, ttuple, tndarray from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple, expr_any, expr_array, expr_ndarray, expr_numeric, Int64Expression, cast_expr, construct_expr, expr_bool) from hail.expr.expressions.typed_expressions import NDArrayNumericExpression from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat, NDArraySVD, Apply tsequenceof_nd = oneof(sequenceof(expr_ndarray()), expr_array(expr_ndarray())) shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple()) def array(input_array, dtype=None): """Construct an :class:`.NDArrayExpression` Examples -------- >>> hl.eval(hl.nd.array([1, 2, 3, 4])) array([1, 2, 3, 4], dtype=int32) >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]])) array([[1, 2, 3], [4, 5, 6]], dtype=int32)
require_col_key_str(dataset, 'rename_duplicates') ids = dataset.col_key[0].collect() mapping, new_ids = deduplicate(ids) if mapping: info( f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:' + ''.join(f'\n "{pre}" => "{post}"' for pre, post in mapping)) else: info('No duplicate sample IDs found.') return dataset.annotate_cols( **{name: hl.literal(new_ids)[hl.int(hl.scan.count())]}) @typecheck(ds=oneof(Table, MatrixTable), intervals=expr_array(expr_interval(expr_any)), keep=bool) def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]: """Filter rows with a list of intervals. Examples -------- Filter to loci falling within one interval: >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')]) Remove all loci within list of intervals: >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
import numpy as np import hail as hl from hail.typecheck import typecheck, oneof, nullable from hail.expr.expressions import expr_locus, expr_float64, check_row_indexed from hail.utils.java import Env @typecheck(a=np.ndarray, radius=oneof(int, float)) def array_windows(a, radius): """Returns start and stop indices for window around each array value. Examples -------- >>> hl.linalg.utils.array_windows(np.array([1, 2, 4, 4, 6, 8]), 2) (array([0, 0, 1, 1, 2, 4]), array([2, 4, 5, 5, 6, 6])) >>> hl.linalg.utils.array_windows(np.array([-10.0, -2.5, 0.0, 0.0, 1.2, 2.3, 3.0]), 2.5) (array([0, 1, 1, 1, 2, 2, 4]), array([1, 4, 6, 6, 7, 7, 7])) Notes ----- For an array ``a`` in ascending order, the resulting ``starts`` and ``stops`` arrays have the same length as ``a`` and the property that, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal range of indices ``j`` such that ``a[i] - radius <= a[j] <= a[i] + radius``. Index ranges are start-inclusive and stop-exclusive. This function is especially useful in conjunction with :meth:`.BlockMatrix.sparsify_row_intervals`.
import json import re from hail.typecheck import * from hail.utils import wrap_to_list from hail.utils.java import jiterable_to_list, Env, joption from hail.typecheck import oneof, transformed import hail as hl rg_type = lazy() reference_genome_type = oneof(transformed((str, lambda x: hl.get_reference(x))), rg_type) class ReferenceGenome(object): """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__. Examples -------- >>> contigs = ["1", "X", "Y", "MT"] >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569} >>> par = [("X", 60001, 2699521)] >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par) Notes ----- Hail comes with predefined reference genomes (case sensitive!): - GRCh37 - GRCh38 - GRCm38
from functools import reduce import hail as hl from hail.expr.functions import _ndarray from hail.expr.functions import array as aarray from hail.expr.types import HailType, tfloat64, ttuple, tndarray from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple, expr_any, expr_array, expr_ndarray, expr_numeric, Int64Expression, cast_expr, construct_expr) from hail.expr.expressions.typed_expressions import NDArrayNumericExpression from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat, NDArraySVD, Apply tsequenceof_nd = oneof(sequenceof(expr_ndarray()), expr_array(expr_ndarray())) shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple()) def array(input_array, dtype=None): """Construct an :class:`.NDArrayExpression` Examples -------- >>> hl.eval(hl.nd.array([1, 2, 3, 4])) array([1, 2, 3, 4], dtype=int32) >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]])) array([[1, 2, 3], [4, 5, 6]], dtype=int32)
ldsc simulation framework @author: nbaya """ import hail as hl from hail.expr.expressions import expr_int32, expr_int64, expr_float32, expr_float64 from hail.typecheck import typecheck, oneof, nullable from hail.matrixtable import MatrixTable import re from datetime import datetime, timedelta @typecheck(mt=MatrixTable, genotype=oneof(expr_int32, expr_int64, expr_float32, expr_float64), h2=oneof(nullable(float), nullable(int)), pi=oneof(float,int), is_annot_inf=bool, annot_coef_dict=nullable(dict), annot_regex=nullable(str), h2_normalize=bool, is_popstrat=bool, cov_coef_dict=nullable(dict), cov_regex=nullable(str), path_to_save=nullable(str)) def simulate_phenotypes(mt, genotype, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None, annot_regex=None,h2_normalize=True, is_popstrat=False, cov_coef_dict=None, cov_regex=None, path_to_save=None):
class LinearMixedModel(object):
    r"""Class representing a linear mixed model.

    .. include:: ../_templates/experimental.rst

    :class:`LinearMixedModel` represents a linear model of the form

    .. math::

        y \sim \mathrm{N}(X \beta, \, \sigma^2 K + \tau^2 I)

    where

    - :math:`\mathrm{N}` is a :math:`n`-dimensional normal distribution.
    - :math:`y` is a known vector of :math:`n` observations.
    - :math:`X` is a known :math:`n \times p` design matrix for :math:`p` fixed effects.
    - :math:`K` is a known :math:`n \times n` positive semi-definite kernel.
    - :math:`I` is the :math:`n \times n` identity matrix.
    - :math:`\beta` is a :math:`p`-parameter vector of fixed effects.
    - :math:`\sigma^2` is the variance parameter on :math:`K`.
    - :math:`\tau^2` is the variance parameter on :math:`I`.

    In particular, the residuals for the :math:`i^\mathit{th}` and :math:`j^\mathit{th}`
    observations have covariance :math:`\sigma^2 K_{ij}` for :math:`i \neq j`.

    This model is equivalent to a
    `mixed model <https://en.wikipedia.org/wiki/Mixed_model>`__
    of the form

    .. math::

        y = X \beta + Z u + \epsilon

    by setting :math:`K = ZZ^T` where

    - :math:`Z` is a known :math:`n \times r` design matrix for :math:`r` random effects.
    - :math:`u` is a :math:`r`-vector of random effects drawn from :math:`\mathrm{N}(0, \sigma^2 I)`.
    - :math:`\epsilon` is a :math:`n`-vector of random errors drawn from :math:`\mathrm{N}(0, \tau^2 I)`.

    However, :class:`LinearMixedModel` does not itself realize :math:`K` as a linear kernel
    with respect to random effects, nor does it take :math:`K` explicitly as input. Rather,
    via the eigendecomposition :math:`K = U S U^T`, the class leverages a third, decorrelated
    form of the model

    .. math::

        Py \sim \mathrm{N}(PX \beta, \, \sigma^2 (\gamma S + I))

    where

    - :math:`P = U^T: \mathbb{R}^n \rightarrow \mathbb{R}^n` is an orthonormal transformation
      that decorrelates the observations. The rows of :math:`P` are an eigenbasis for :math:`K`.
    - :math:`S` is the :math:`n \times n` diagonal matrix of corresponding eigenvalues.
    - :math:`\gamma = \frac{\sigma^2}{\tau^2}` is the ratio of variance parameters.

    Hence, the triple :math:`(Py, PX, S)` determines the probability
    of the observations for any choice of model parameters, and is
    therefore sufficient for inference.
    This triple, with S encoded as a vector, is the default
    ("full-rank") initialization of the class.

    :class:`LinearMixedModel` also provides an efficient strategy to fit the
    model above with :math:`K` replaced by its rank-:math:`r` approximation
    :math:`K_r = P_r^T S_r P_r` where

    - :math:`P_r: \mathbb{R}^n \rightarrow \mathbb{R}^r` has orthonormal rows
      consisting of the top :math:`r` eigenvectors of :math:`K`.
    - :math:`S_r` is the :math:`r \times r` diagonal matrix of corresponding
      non-zero eigenvalues.

    For this low-rank model, the quintuple :math:`(P_r y, P_r X, S_r, y, X)`
    is similarly sufficient for inference and corresponds to the "low-rank"
    initialization of the class. Morally, :math:`y` and :math:`X` are
    required for low-rank inference because the diagonal :math:`\gamma S + I`
    is always full-rank.

    If :math:`K` actually has rank :math:`r`, then :math:`K = K_r`
    and the low-rank and full-rank models are equivalent.
    Hence low-rank inference provides a more efficient, equally-exact
    algorithm for fitting the full-rank model.
    This situation arises, for example, when :math:`K` is the linear kernel
    of a mixed model with fewer random effects than observations.

    Even when :math:`K` has full rank, using a lower-rank approximation may
    be an effective form of regularization, in addition to boosting
    computational efficiency.

    **Initialization**

    The class may be initialized directly or with one of two methods:

    - :meth:`from_kinship` takes :math:`y`, :math:`X`, and :math:`K` as ndarrays.
      The model is always full-rank.

    - :meth:`from_random_effects` takes :math:`y` and :math:`X` as ndarrays and
      :math:`Z` as an ndarray or block matrix. The model is full-rank if and
      only if :math:`n \leq m`.

    Direct full-rank initialization takes :math:`Py`, :math:`PX`, and :math:`S`
    as ndarrays. The following class attributes are set:

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `low_rank`
        - bool
        - ``False``
      * - `n`
        - int
        - Number of observations :math:`n`
      * - `f`
        - int
        - Number of fixed effects :math:`p`
      * - `r`
        - int
        - Effective number of random effects, must equal :math:`n`
      * - `py`
        - ndarray
        - Rotated response vector :math:`P y` with shape :math:`(n)`
      * - `px`
        - ndarray
        - Rotated design matrix :math:`P X` with shape :math:`(n, p)`
      * - `s`
        - ndarray
        - Eigenvalues vector :math:`S` of :math:`K` with shape :math:`(n)`
      * - `p_path`
        - str
        - Path at which :math:`P` is stored as a block matrix

    Direct low-rank initialization takes :math:`P_r y`, :math:`P_r X`, :math:`S_r`,
    :math:`y`, and :math:`X` as ndarrays. The following class attributes are set:

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `low_rank`
        - bool
        - ``True``
      * - `n`
        - int
        - Number of observations :math:`n`
      * - `f`
        - int
        - Number of fixed effects :math:`p`
      * - `r`
        - int
        - Effective number of random effects, must be less than :math:`n`
      * - `py`
        - ndarray
        - Projected response vector :math:`P_r y` with shape :math:`(r)`
      * - `px`
        - ndarray
        - Projected design matrix :math:`P_r X` with shape :math:`(r, p)`
      * - `s`
        - ndarray
        - Eigenvalues vector :math:`S_r` of :math:`K_r` with shape :math:`(r)`
      * - `y`
        - ndarray
        - Response vector with shape :math:`(n)`
      * - `x`
        - ndarray
        - Design matrix with shape :math:`(n, p)`
      * - `p_path`
        - str
        - Path at which :math:`P` is stored as a block matrix

    **Fitting the model**

    :meth:`fit` uses `restricted maximum likelihood
    <https://en.wikipedia.org/wiki/Restricted_maximum_likelihood>`__ (REML)
    to estimate :math:`(\beta, \sigma^2, \tau^2)`.

    This is done by numerical optimization of the univariate function
    :meth:`compute_neg_log_reml`, which itself optimizes REML constrained to a
    fixed ratio of variance parameters. Each evaluation of
    :meth:`compute_neg_log_reml` has computational complexity

    .. math::

      \mathit{O}(rp^2 + p^3).

    :meth:`fit` adds the following attributes at this estimate.

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `beta`
        - ndarray
        - :math:`\beta`
      * - `sigma_sq`
        - float
        - :math:`\sigma^2`
      * - `tau_sq`
        - float
        - :math:`\tau^2`
      * - `gamma`
        - float
        - :math:`\gamma = \frac{\sigma^2}{\tau^2}`
      * - `log_gamma`
        - float
        - :math:`\log{\gamma}`
      * - `h_sq`
        - float
        - :math:`\mathit{h}^2 = \frac{\sigma^2}{\sigma^2 + \tau^2}`
      * - `h_sq_standard_error`
        - float
        - asymptotic estimate of :math:`\mathit{h}^2` standard error

    **Testing alternative models**

    The model is also equivalent to its augmentation

    .. math::

        y \sim \mathrm{N}\left(x_\star\beta_\star + X \beta, \, \sigma^2 K + \tau^2 I\right)

    by an additional covariate of interest :math:`x_\star` under the
    null hypothesis that the corresponding fixed effect parameter
    :math:`\beta_\star` is zero. Similarly to initialization, full-rank testing
    of the alternative hypothesis :math:`\beta_\star \neq 0` requires
    :math:`P x_\star`, whereas the low-rank testing requires :math:`P_r x_\star`
    and :math:`x_\star`.

    After running :meth:`fit` to fit the null model, one can test each of a
    collection of alternatives using either of two implementations of the
    likelihood ratio test:

    - :meth:`fit_alternatives_numpy` takes one or two ndarrays. It is a pure Python
      method that evaluates alternatives serially on master.

    - :meth:`fit_alternatives` takes one or two paths to block matrices. It
      evaluates alternatives in parallel on the workers.

    Per alternative, both have computational complexity

    .. math::

      \mathit{O}(rp + p^3).

    Parameters
    ----------
    py: :class:`ndarray`
        Projected response vector :math:`P_r y` with shape :math:`(r)`.
    px: :class:`ndarray`
        Projected design matrix :math:`P_r X` with shape :math:`(r, p)`.
    s: :class:`ndarray`
        Eigenvalues vector :math:`S` with shape :math:`(r)`.
    y: :class:`ndarray`, optional
        Response vector with shape :math:`(n)`.
        Include for low-rank inference.
    x: :class:`ndarray`, optional
        Design matrix with shape :math:`(n, p)`.
        Include for low-rank inference.
    p_path: :obj:`str`, optional
        Path at which :math:`P` has been stored as a block matrix.
    """

    @typecheck_method(py=np.ndarray,
                      px=np.ndarray,
                      s=np.ndarray,
                      y=nullable(np.ndarray),
                      x=nullable(np.ndarray),
                      p_path=nullable(str))
    def __init__(self, py, px, s, y=None, x=None, p_path=None):
        # y and x must be supplied together: both present selects the
        # low-rank model, both absent selects the full-rank model.
        if y is None and x is None:
            low_rank = False
        elif y is not None and x is not None:
            low_rank = True
        else:
            raise ValueError(
                'for low-rank, set both y and x; for full-rank, do not set y or x.'
            )

        _check_dims(py, 'py', 1)
        _check_dims(px, 'px', 2)
        _check_dims(s, 's', 1)

        r = s.size
        f = px.shape[1]

        if py.size != r:
            raise ValueError("py and s must have the same size")
        if px.shape[0] != r:
            raise ValueError(
                "px must have the same number of rows as the size of s")

        if low_rank:
            _check_dims(y, 'y', 1)
            _check_dims(x, 'x', 2)
            n = y.size
            if n <= r:
                raise ValueError("size of y must be larger than the size of s")
            if x.shape[0] != n:
                raise ValueError(
                    "x must have the same number of rows as the size of y")
            if x.shape[1] != f:
                raise ValueError("px and x must have the same number columns")
        else:
            n = r

        if p_path is not None:
            # The stored P must be r x n to be consistent with py and y.
            n_rows, n_cols = BlockMatrix.read(p_path).shape
            if n_cols != n:
                raise ValueError(
                    "LinearMixedModel: Number of columns in the block "
                    f"matrix at 'p_path' ({n_cols}) must equal "
                    f"the size of 'y' ({n})")
            if n_rows != r:
                raise ValueError(
                    "LinearMixedModel: Number of rows in the block "
                    f"matrix at 'p_path' ({n_rows}) must equal "
                    f"the size of 'py' ({r})")

        self.low_rank = low_rank
        self.n = n
        self.f = f
        self.r = r
        self.py = py
        self.px = px
        self.s = s
        self.y = y
        self.x = x
        self.p_path = p_path

        self._check_dof()

        # Attributes populated by fit().
        self.beta = None
        self.sigma_sq = None
        self.tau_sq = None
        self.gamma = None
        self.log_gamma = None
        self.h_sq = None
        self.h_sq_standard_error = None
        self.optimize_result = None

        self._fitted = False

        if low_rank:
            # Gamma-independent products reused on every REML evaluation.
            self._yty = y @ y
            self._xty = x.T @ y
            self._xtx = x.T @ x

        # Null-model intermediates cached by compute_neg_log_reml.
        self._dof = n - f
        self._d = None
        self._ydy = None
        self._xdy = None
        self._xdx = None

        # Alternative-model buffers; slot 0 is filled per alternative.
        self._dof_alt = n - (f + 1)
        self._d_alt = None
        self._ydy_alt = None
        self._xdy_alt = np.zeros(f + 1)
        self._xdx_alt = np.zeros((f + 1, f + 1))

        self._residual_sq = None

        self._scala_model = None

    def _reset(self):
        """Clear all results of a previous :meth:`fit`."""
        self._fitted = False

        self.beta = None
        self.sigma_sq = None
        self.tau_sq = None
        self.gamma = None
        self.log_gamma = None
        self.h_sq = None
        self.h_sq_standard_error = None
        self.optimize_result = None

        # The cached JVM model is built from gamma and the null residual, so
        # it must be rebuilt after a re-fit; otherwise fit_alternatives would
        # silently test against the previous fit.
        self._scala_model = None

    def compute_neg_log_reml(self, log_gamma, return_parameters=False):
        r"""Compute negative log REML constrained to a fixed value
        of :math:`\log{\gamma}`.

        This function computes the triple :math:`(\beta, \sigma^2, \tau^2)` with
        :math:`\gamma = \frac{\sigma^2}{\tau^2}` at which the restricted
        likelihood is maximized and returns the negative of the restricted log
        likelihood at these parameters (shifted by the constant defined below).

        The implementation has complexity :math:`\mathit{O}(rp^2 + p^3)` and is
        inspired by `FaST linear mixed models for genome-wide association studies (2011)
        <https://www.nature.com/articles/nmeth.1681>`__.

        The formulae follow from `Bayesian Inference for Variance Components Using Only Error Contrasts (1974)
        <http://faculty.dbmi.pitt.edu/day/Bioinf2132-advanced-Bayes-and-R/previousDocuments/Bioinf2132-documents-2016/2016-11-22/Harville-1974.pdf>`__.
        Harville derives that for fixed covariance :math:`V`, the restricted
        likelihood of the variance parameter :math:`V` in the model

        .. math::

          y \sim \mathrm{N}(X \beta, \, V)

        is given by

        .. math::

          (2\pi)^{-\frac{1}{2}(n - p)}
          \det(X^T X)^\frac{1}{2}
          \det(V)^{-\frac{1}{2}}
          \det(X^T V^{-1} X)^{-\frac{1}{2}}
          e^{-\frac{1}{2}(y - X\hat\beta)^T V^{-1}(y - X\hat\beta)}.

        with

        .. math::

          \hat\beta = (X^T V^{-1} X)^{-1} X^T V^{-1} y.

        In our case, the variance is

        .. math::

          V = \sigma^2 K + \tau^2 I = \sigma^2 (K + \gamma^{-1} I)

        which is determined up to scale by any fixed value of the ratio
        :math:`\gamma`. So for input :math:`\log \gamma`, the
        negative restricted log likelihood is minimized at
        :math:`(\hat\beta, \hat\sigma^2)` with :math:`\hat\beta` as above and

        .. math::

          \hat\sigma^2 = \frac{1}{n - p}(y - X\hat\beta)^T (K + \gamma^{-1} I)^{-1}(y - X\hat\beta).

        For :math:`\hat V` at this :math:`(\hat\beta, \hat\sigma^2, \gamma)`,
        the exponent in the likelihood reduces to :math:`-\frac{1}{2}(n-p)`, so
        the negative restricted log likelihood may be expressed as

        .. math::

          \frac{1}{2}\left(\log \det(\hat V) + \log\det(X^T \hat V^{-1} X)\right) + C

        where

        .. math::

          C = \frac{1}{2}\left(n - p + (n - p)\log(2\pi) - \log\det(X^T X)\right)

        only depends on :math:`X`. :meth:`compute_neg_log_reml` returns the value of
        the first term, omitting the constant term.

        Parameters
        ----------
        log_gamma: :obj:`float`
            Value of :math:`\log{\gamma}`.
        return_parameters:
            If ``True``, also return :math:`\beta`, :math:`\sigma^2`,
            and :math:`\tau^2`.

        Returns
        -------
        :obj:`float` or (:obj:`float`, :class:`ndarray`, :obj:`float`, :obj:`float`)
            If `return_parameters` is ``False``, returns (shifted) negative log REML.
            Otherwise, returns (shifted) negative log REML, :math:`\beta`, :math:`\sigma^2`,
            and :math:`\tau^2`.

        Raises
        ------
        Exception
            If the linear solve for :math:`\hat\beta` fails.
        """
        from scipy.linalg import solve, LinAlgError

        gamma = np.exp(log_gamma)
        d = 1 / (self.s + 1 / gamma)
        # The n - r directions outside the span of P each contribute
        # log(gamma) to the log-determinant.
        logdet_d = np.sum(np.log(d)) + (self.n - self.r) * log_gamma

        if self.low_rank:
            # Split the weights into the gamma * I part (handled through the
            # precomputed y'y, x'y, x'x) and the correction on the top r
            # eigen-directions.
            d -= gamma
            dpy = d * self.py
            ydy = self.py @ dpy + gamma * self._yty
            xdy = self.px.T @ dpy + gamma * self._xty
            xdx = (self.px.T * d) @ self.px + gamma * self._xtx
        else:
            dpy = d * self.py
            ydy = self.py @ dpy
            xdy = self.px.T @ dpy
            xdx = (self.px.T * d) @ self.px

        try:
            beta = solve(xdx, xdy, assume_a='pos')
            residual_sq = ydy - xdy.T @ beta
            sigma_sq = residual_sq / self._dof
            tau_sq = sigma_sq / gamma
            neg_log_reml = (np.linalg.slogdet(xdx)[1] - logdet_d +
                            self._dof * np.log(sigma_sq)) / 2

            self._d, self._ydy, self._xdy, self._xdx = d, ydy, xdy, xdx  # used in fit

            if return_parameters:
                return neg_log_reml, beta, sigma_sq, tau_sq
            else:
                return neg_log_reml
        except LinAlgError as e:
            raise Exception(
                'linear algebra error while solving for REML estimate') from e

    @typecheck_method(log_gamma=nullable(numeric),
                      bounds=tupleof(numeric),
                      tol=float,
                      maxiter=int)
    def fit(self, log_gamma=None, bounds=(-8.0, 8.0), tol=1e-8, maxiter=500):
        r"""Find the triple :math:`(\beta, \sigma^2, \tau^2)` maximizing REML.

        This method sets the attributes `beta`, `sigma_sq`, `tau_sq`, `gamma`,
        `log_gamma`, `h_sq`, and `h_sq_standard_error` as described in the
        top-level class documentation.

        If `log_gamma` is provided, :meth:`fit` finds the REML solution
        with :math:`\log{\gamma}` constrained to this value. In this case,
        `h_sq_standard_error` is ``None`` since `h_sq` is not estimated.

        Otherwise, :meth:`fit` searches for the value of :math:`\log{\gamma}`
        that minimizes :meth:`compute_neg_log_reml`, and also sets the attribute
        `optimize_result` of type `scipy.optimize.OptimizeResult
        <https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html>`__.

        Parameters
        ----------
        log_gamma: :obj:`float`, optional
            If provided, the solution is constrained to have this value of
            :math:`\log{\gamma}`.
        bounds: :obj:`float`, :obj:`float`
            Lower and upper bounds for :math:`\log{\gamma}`.
        tol: :obj:`float`
            Absolute tolerance for optimizing :math:`\log{\gamma}`.
        maxiter: :obj:`int`
            Maximum number of iterations for optimizing :math:`\log{\gamma}`.

        Raises
        ------
        Exception
            If the optimizer fails or its optimum lies within 0.001 of either
            bound.
        """
        if self._fitted:
            self._reset()

        fit_log_gamma = log_gamma is None

        if fit_log_gamma:
            from scipy.optimize import minimize_scalar

            self.optimize_result = minimize_scalar(self.compute_neg_log_reml,
                                                   method='bounded',
                                                   bounds=bounds,
                                                   options={
                                                       'xatol': tol,
                                                       'maxiter': maxiter
                                                   })

            if self.optimize_result.success:
                # An optimum hugging a bound means the true optimum is
                # probably outside the search interval.
                if self.optimize_result.x - bounds[0] < 0.001:
                    raise Exception(
                        "failed to fit log_gamma: optimum within 0.001 of lower bound."
                    )
                elif bounds[1] - self.optimize_result.x < 0.001:
                    raise Exception(
                        "failed to fit log_gamma: optimum within 0.001 of upper bound."
                    )
                else:
                    self.log_gamma = self.optimize_result.x
            else:
                raise Exception(
                    f'failed to fit log_gamma:\n {self.optimize_result}')
        else:
            self.log_gamma = log_gamma

        _, self.beta, self.sigma_sq, self.tau_sq = self.compute_neg_log_reml(
            self.log_gamma, return_parameters=True)

        self.gamma = np.exp(self.log_gamma)
        self.h_sq = self.sigma_sq / (self.sigma_sq + self.tau_sq)

        self._residual_sq = self.sigma_sq * self._dof

        # Seed the alternative-model buffers with the null-model values;
        # index 0 is reserved for the covariate under test.
        self._d_alt = self._d
        self._ydy_alt = self._ydy
        self._xdy_alt[1:] = self._xdy
        self._xdx_alt[1:, 1:] = self._xdx

        if fit_log_gamma:
            self.h_sq_standard_error = self._estimate_h_sq_standard_error()

        self._fitted = True

    def _estimate_h_sq_standard_error(self):
        """Estimate the standard error of `h_sq` by fitting a parabola to the
        negative log REML at three points around the fitted `log_gamma`."""
        epsilon = 1e-4  # parabolic interpolation radius in log_gamma space
        lg = self.log_gamma + np.array([-epsilon, 0.0, epsilon])
        h2 = 1 / (1 + np.exp(-lg))
        nll = [self.compute_neg_log_reml(lgi) for lgi in lg]

        if nll[1] > nll[0] or nll[1] > nll[2]:
            i = 0 if nll[1] > nll[0] else 2
            raise Exception(
                f'Minimum of negative log likelihood fit as {nll[1]} at log_gamma={lg[1]},'
                f'\n but found smaller value of {nll[i]} at log_gamma={lg[i]}.'
                f'\n Investigate by plotting the negative log likelihood function.'
            )

        # Asymptotically near MLE, nLL = a * h2^2 + b * h2 + c with a = 1 / (2 * se^2)
        # By Lagrange interpolation:
        a = ((h2[2] * (nll[1] - nll[0]) + h2[1] * (nll[0] - nll[2]) + h2[0] *
              (nll[2] - nll[1])) / ((h2[1] - h2[0]) * (h2[0] - h2[2]) *
                                    (h2[2] - h2[1])))

        return 1 / np.sqrt(2 * a)

    def h_sq_normalized_lkhd(self):
        r"""Estimate the normalized likelihood of :math:`\mathit{h}^2` over the
        discrete grid of percentiles.

        Examples
        --------
        Plot the estimated normalized likelihood function:

        >>> import matplotlib.pyplot as plt                     # doctest: +SKIP
        >>> plt.plot(range(101), model.h_sq_normalized_lkhd())  # doctest: +SKIP

        Notes
        -----
        This method may be used to visualize the approximate posterior on
        :math:`\mathit{h}^2` under a flat prior.

        The resulting ndarray ``a`` has length 101 with ``a[i]`` equal to the
        maximum likelihood over all :math:`\beta` and :math:`\sigma^2` with
        :math:`\mathit{h}^2` constrained to ``i / 100``. The values for
        ``1 <= i <= 99`` are normalized to sum to 1, and ``a[0]`` and ``a[100]``
        are set to ``nan``.

        Returns
        -------
        :class:`ndarray` of :obj:`float64`
            Normalized likelihood values for :math:`\mathit{h}^2`.
        """
        log_lkhd = np.zeros(101, dtype=np.float64)
        log_lkhd[0], log_lkhd[100] = np.nan, np.nan

        for h2 in range(1, 100):
            # h2 = gamma / (1 + gamma) with h2 expressed in percent.
            gamma = h2 / (100.0 - h2)
            log_lkhd[h2] = -self.compute_neg_log_reml(np.log(gamma))

        # Shift by the maximum before exponentiating to avoid underflow.
        log_lkhd -= np.max(log_lkhd[1:-1])
        lkhd = np.exp(log_lkhd)
        lkhd /= np.sum(lkhd[1:-1])
        return lkhd

    @typecheck_method(pa_t_path=str,
                      a_t_path=nullable(str),
                      partition_size=nullable(int))
    def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
        r"""Fit and test alternative model for each augmented design matrix in parallel.

        Notes
        -----
        The alternative model is fit using REML constrained to the value of
        :math:`\gamma` set by :meth:`fit`.

        The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
        uses (non-restricted) maximum likelihood:

        .. math::

          \chi^2 = 2 \log\left(\frac{
          \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
          (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I))}
          {\max_{\beta, \sigma^2} \mathrm{N}
          (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I))}
          \right)

        The p-value is given by the tail probability under a chi-squared
        distribution with one degree of freedom.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number
        of rows (augmentations). These rows are grouped into partitions for
        parallel processing. The number of partitions equals the ceiling of
        ``n_rows / partition_size``, and should be at least the number of cores
        to make use of all cores. By default, there is one partition per row of
        blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
        (rather than approximate) divisor or multiple of the block size reduces
        superfluous shuffling of data.

        The number of columns in each block matrix must be less than :math:`2^{31}`.

        Warning
        -------
        The block matrices must be stored in row-major format, as results
        from :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
        :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method
        will produce an error message.

        Parameters
        ----------
        pa_t_path: :obj:`str`
            Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
            Each row is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a_t_path: :obj:`str`, optional
            Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
            Each row is an augmentation :math:`x_\star` of :math:`X`.
            Include for low-rank inference.
        partition_size: :obj:`int`, optional
            Number of rows to process per partition.
            Default given by block size of :math:`(P_r A)^T`.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.
        """
        from hail.table import Table

        self._check_dof(self.f + 1)

        if self.low_rank and a_t_path is None:
            raise ValueError('model is low-rank so a_t is required.')
        elif not (self.low_rank or a_t_path is None):
            raise ValueError('model is full-rank so a_t must not be set.')

        if self._scala_model is None:
            self._set_scala_model()

        backend = Env.spark_backend('LinearMixedModel.fit_alternatives')
        jfs = backend.fs._jfs

        if partition_size is None:
            block_size = Env.hail().linalg.BlockMatrix.readMetadata(
                jfs, pa_t_path).blockSize()
            partition_size = block_size
        elif partition_size <= 0:
            raise ValueError(
                f'partition_size must be positive, found {partition_size}')

        jpa_t = Env.hail().linalg.RowMatrix.readBlockMatrix(
            jfs, pa_t_path, partition_size)

        if a_t_path is None:
            maybe_ja_t = None
        else:
            maybe_ja_t = Env.hail().linalg.RowMatrix.readBlockMatrix(
                jfs, a_t_path, partition_size)

        return Table._from_java(
            backend._jbackend.pyFitLinearMixedModel(self._scala_model, jpa_t,
                                                    maybe_ja_t))

    @typecheck_method(pa=np.ndarray,
                      a=nullable(np.ndarray),
                      return_pandas=bool)
    def fit_alternatives_numpy(self, pa, a=None, return_pandas=False):
        r"""Fit and test alternative model for each augmented design matrix.

        Notes
        -----
        This Python-only implementation runs serially on master. See
        the scalable implementation :meth:`fit_alternatives` for documentation
        of the returned table.

        Parameters
        ----------
        pa: :class:`ndarray`
            Projected matrix :math:`P_r A` of alternatives with shape :math:`(r, m)`.
            Each column is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a: :class:`ndarray`, optional
            Matrix :math:`A` of alternatives with shape :math:`(n, m)`.
            Each column is an augmentation :math:`x_\star` of :math:`X`.
            Required for low-rank inference.
        return_pandas: :obj:`bool`
            If true, return pandas dataframe. If false, return Hail table.

        Returns
        -------
        :class:`.Table` or :class:`.pandas.DataFrame`
            Table of results for each augmented design matrix.

        Raises
        ------
        ValueError
            If the shapes of `pa` or `a` are inconsistent with the model, or
            if `a` is missing for a low-rank model.
        Exception
            If the null model has not been fit.
        """
        self._check_dof(self.f + 1)

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        n_cols = pa.shape[1]
        # Explicit exceptions rather than asserts: asserts are stripped under
        # `python -O` and would let malformed input through.
        if pa.shape[0] != self.r:
            raise ValueError(
                f"fit_alternatives_numpy: 'pa' must have {self.r} rows, "
                f"found {pa.shape[0]}")

        if self.low_rank:
            if a is None:
                raise ValueError('model is low-rank so a is required.')
            if a.shape[0] != self.n or a.shape[1] != n_cols:
                raise ValueError(
                    f"fit_alternatives_numpy: 'a' must have shape "
                    f"({self.n}, {n_cols}), found {a.shape}")
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], a[:, i])
                    for i in range(n_cols)]
        else:
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], None)
                    for i in range(n_cols)]

        df = pd.DataFrame.from_records(
            data, columns=['idx', 'beta', 'sigma_sq', 'chi_sq', 'p_value'])

        if return_pandas:
            return df
        else:
            return Table.from_pandas(df, key='idx')

    def _fit_alternative_numpy(self, pa, a):
        """Fit one alternative and return ``(beta_star, sigma_sq, chi_sq, p_value)``.

        Returns a tuple of four ``nan`` if the linear solve fails.
        """
        from scipy.linalg import solve, LinAlgError
        from scipy.stats.distributions import chi2

        gamma = self.gamma
        dpa = self._d_alt * pa

        # single thread => no need to copy
        ydy = self._ydy_alt
        xdy = self._xdy_alt
        xdx = self._xdx_alt

        # Only row/entry 0 depends on the alternative; the rest was filled
        # in by fit().
        if self.low_rank:
            xdy[0] = self.py @ dpa + gamma * (self.y @ a)
            xdx[0, 0] = pa @ dpa + gamma * (a @ a)
            xdx[0, 1:] = self.px.T @ dpa + gamma * (self.x.T @ a)
        else:
            xdy[0] = self.py @ dpa
            xdx[0, 0] = pa @ dpa
            xdx[0, 1:] = self.px.T @ dpa

        try:
            beta = solve(xdx, xdy,
                         assume_a='pos')  # only uses upper triangle
            residual_sq = ydy - xdy.T @ beta
            sigma_sq = residual_sq / self._dof_alt
            chi_sq = self.n * np.log(
                self._residual_sq / residual_sq)  # division => precision
            p_value = chi2.sf(chi_sq, 1)

            return beta[0], sigma_sq, chi_sq, p_value
        except LinAlgError:
            return tuple(4 * [float('nan')])

    def _set_scala_model(self):
        """Build and cache the JVM-side model from the fitted null model."""
        from hail.utils.java import Env
        from hail.linalg import _jarray_from_ndarray, _breeze_from_ndarray

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        self._scala_model = Env.hail().stats.LinearMixedModel.pyApply(
            self.gamma, self._residual_sq, _jarray_from_ndarray(self.py),
            _breeze_from_ndarray(self.px), _jarray_from_ndarray(self._d_alt),
            self._ydy_alt, _jarray_from_ndarray(self._xdy_alt),
            _breeze_from_ndarray(self._xdx_alt),
            _jarray_from_ndarray(self.y) if self.low_rank else None,
            _breeze_from_ndarray(self.x) if self.low_rank else None)

    def _check_dof(self, f=None):
        """Raise ``ValueError`` unless ``n - f`` is positive.

        `f` defaults to the model's number of fixed effects.
        """
        if f is None:
            f = self.f
        dof = self.n - f
        if dof <= 0:
            raise ValueError(
                f"{self.n} {plural('observation', self.n)} with {f} fixed {plural('effect', f)} "
                f"implies {dof} {plural('degree', dof)} of freedom. Must be positive."
            )

    @classmethod
    @typecheck_method(y=np.ndarray,
                      x=np.ndarray,
                      k=np.ndarray,
                      p_path=nullable(str),
                      overwrite=bool)
    def from_kinship(cls, y, x, k, p_path=None, overwrite=False):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`K`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> k = np.array([[ 1.        , -0.8727875 ,  0.96397335,  0.94512946],
        ...               [-0.8727875 ,  1.        , -0.93036112, -0.97320323],
        ...               [ 0.96397335, -0.93036112,  1.        ,  0.98294169],
        ...               [ 0.94512946, -0.97320323,  0.98294169,  1.        ]])
        >>> model, p = LinearMixedModel.from_kinship(y, x, k)
        >>> model.fit()
        >>> model.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.2525148830695317

        >>> model.s  # doctest: +SKIP_OUTPUT_CHECK
        array([3.83501295, 0.13540343, 0.02454114, 0.00504248])

        Truncate to a rank :math:`r=2` model:

        >>> r = 2
        >>> s_r = model.s[:r]
        >>> p_r = p[:r, :]
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)
        >>> model_r.fit()
        >>> model_r.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.25193197591429695

        Notes
        -----
        This method eigendecomposes :math:`K = P^T S P` on the master and
        returns ``LinearMixedModel(p @ y, p @ x, s)`` and ``p``.

        The performance of eigendecomposition depends critically on the number
        of master cores and the NumPy / SciPy configuration, viewable with
        ``np.show_config()``. For Intel machines, we recommend installing the
        `MKL <https://anaconda.org/anaconda/mkl>`__ package for Anaconda.

        `k` must be positive semi-definite; symmetry is not checked as only the
        lower triangle is used.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects.
        k: :class:`ndarray`
            :math:`n \times n` positive semi-definite kernel :math:`K`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`K`.
        p: :class:`ndarray`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.

        Raises
        ------
        ValueError
            If the shapes of `y`, `x`, and `k` are inconsistent.
        Exception
            If `k` has an eigenvalue that is negative beyond round-off.
        """
        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(k, "k", 2)

        n = k.shape[0]
        if k.shape[1] != n:
            raise ValueError("from_kinship: 'k' must be a square matrix")
        if y.shape[0] != n:
            raise ValueError("from_kinship: 'y' and 'k' must have the same "
                             "number of rows")
        if x.shape[0] != n:
            raise ValueError("from_kinship: 'x' and 'k' must have the same "
                             "number of rows")

        # Eigenvalues come back in ascending order: s[0] is the smallest and
        # s[-1] the largest.
        s, u = hl.linalg._eigh(k)
        # Tolerate round-off: reject only eigenvalues that are negative
        # relative to the largest one (relative tolerance 1e-12).
        if s[0] < -1e-12 * s[-1]:
            raise Exception("from_kinship: smallest eigenvalue of 'k' is "
                            f"negative: {s[0]}")

        # flip singular values to descending order
        s = np.flip(s, axis=0)
        u = np.fliplr(u)
        p = u.T
        if p_path:
            BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)

        model = LinearMixedModel(p @ y, p @ x, s, p_path=p_path)
        return model, p

    @classmethod
    @typecheck_method(y=np.ndarray,
                      x=np.ndarray,
                      z=oneof(np.ndarray, hl.linalg.BlockMatrix),
                      p_path=nullable(str),
                      overwrite=bool,
                      max_condition_number=float,
                      complexity_bound=int)
    def from_random_effects(cls,
                            y,
                            x,
                            z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be greater than 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.

        Raises
        ------
        ValueError
            If `p_path` is missing for a block matrix `z`, if
            `max_condition_number` is below 1e-16, or if the shapes of `y`,
            `x`, and `z` are inconsistent.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z' "
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError(
                "from_random_effects: 'max_condition_number' must "
                f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            # svd may return U as either an ndarray or a block matrix.
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        # Eigenvalues of K = Z Z^T are the squared singular values of Z.
        s = s0**2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            # s is descending, so negate to binary-search for the first
            # eigenvalue at or below the conditioning cutoff.
            r = int(np.searchsorted(-s, -max_condition_number * s[0]))
            if r < m:
                info(
                    f'from_random_effects: model rank reduced from {m} to {r} '
                    f'due to ill-condition.'
                    f'\n Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                # Write then re-read so later products use the stored matrix.
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (
                p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p

    # checks agreement of model initialization
    def _same(self, other, tol=1e-6, up_to_sign=True):
        """Return whether `other` was initialized with the same data as this
        model, printing each field that differs.

        With `up_to_sign`, rows of `py`, `px`, `y`, and `x` may differ by a
        factor of -1 (eigenvectors are only defined up to sign).
        """
        def same_rows_up_to_sign(a, b, atol):
            assert a.shape[0] == b.shape[0]
            return all(
                np.allclose(a[i], b[i], atol=atol)
                or np.allclose(-a[i], b[i], atol=atol)
                for i in range(a.shape[0]))

        close = same_rows_up_to_sign if up_to_sign else np.allclose

        if self.low_rank != other.low_rank:
            print(f'different low_rank: {self.low_rank}, {other.low_rank}')
            return False

        same = True
        if not close(self.py, other.py, atol=tol):
            print(f'different py:\n{self.py}\n{other.py}')
            same = False
        if not close(self.px, other.px, atol=tol):
            print(f'different px:\n{self.px}\n{other.px}')
            same = False
        if not np.allclose(self.s, other.s, atol=tol):
            print(f'different s:\n{self.s}\n{other.s}')
            same = False
        if self.low_rank and not close(self.y, other.y, atol=tol):
            print(f'different y:\n{self.y}\n{other.y}')
            same = False
        if self.low_rank and not close(self.x, other.x, atol=tol):
            print(f'different x\n{self.x}\n{other.x}')
            same = False
        if self.p_path != other.p_path:
            print(f'different p_path:\n{self.p_path}\n{other.p_path}')
            same = False
        return same
s_ = fmt(s, i) if s_ != s: mapping.append((s, s_)) uniques.add(s_) new_ids.append(s_) if mapping: info(f'Renamed {len(mapping)} duplicate {plural("sample ID", len(mapping))}. Mangled IDs as follows:' + ''.join(f'\n "{pre}" => "{post}"' for pre, post in mapping)) else: info('No duplicate sample IDs found.') return dataset.annotate_cols(**{name: hl.literal(new_ids)[hl.int(hl.scan.count())]}) @typecheck(ds=oneof(Table, MatrixTable), intervals=expr_array(expr_interval(expr_any)), keep=bool) def filter_intervals(ds, intervals, keep=True) -> Union[Table, MatrixTable]: """Filter rows with a list of intervals. Examples -------- Filter to loci falling within one interval: >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')]) Remove all loci within list of intervals: >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
class ReferenceGenome(object):
    """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__.

    Instances are thin Python wrappers around a JVM ``ReferenceGenome``
    object (``self._jrep``). Attribute values are cached on the Python side
    and lazily fetched from the JVM object when the instance was created
    from an existing Java object. Every instance registers itself in
    ``ReferenceGenome._references`` under its name.

    Examples
    --------

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [("X", 60001, 2699521)]
    >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)

    Parameters
    ----------
    name : :obj:`str`
        Name of reference. Must be unique and NOT one of Hail's
        predefined references: ``'GRCh37'``, ``'GRCh38'``, and ``'default'``.
    contigs : :obj:`list` of :obj:`str`
        Contig names.
    lengths : :obj:`dict` of :obj:`str` to :obj:`int`
        Dict of contig names to contig lengths.
    x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)
    """

    # Registry of every reference genome created in this session, by name.
    _references = {}

    @typecheck_method(name=str,
                      contigs=listof(str),
                      lengths=dictof(str, int),
                      x_contigs=oneof(str, listof(str)),
                      y_contigs=oneof(str, listof(str)),
                      mt_contigs=oneof(str, listof(str)),
                      par=listof(sized_tupleof(str, int, int)))
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        # The ``[]`` defaults are shared between calls and the normalized
        # values are stored on the instance, so take copies: mutating e.g.
        # ``rg.x_contigs`` must never leak into later constructions or into
        # caller-owned lists.
        # NOTE(review): presumably wrap_to_list returns a list argument
        # unchanged and wraps a bare str -- confirm against its definition.
        contigs = list(wrap_to_list(contigs))
        x_contigs = list(wrap_to_list(x_contigs))
        y_contigs = list(wrap_to_list(y_contigs))
        mt_contigs = list(wrap_to_list(mt_contigs))
        par = list(par)

        # The JVM constructor receives each PAR as a "contig:start-end" string.
        par_strings = ["{}:{}-{}".format(contig, start, end) for (contig, start, end) in par]

        jrep = (Env.hail().variant.ReferenceGenome
                .apply(name, contigs, lengths, x_contigs, y_contigs, mt_contigs, par_strings))

        self._init_from_java(jrep)
        self._name = name
        self._contigs = contigs
        self._lengths = lengths
        self._x_contigs = x_contigs
        self._y_contigs = y_contigs
        self._mt_contigs = mt_contigs
        # Interval objects are built lazily by the ``par`` property.
        self._par = None
        self._par_tuple = par

        super(ReferenceGenome, self).__init__()

        ReferenceGenome._references[name] = self

    def __str__(self):
        return self._jrep.toString()

    def __repr__(self):
        # Instances created from a Java object start without cached tuples;
        # derive them from the ``par`` intervals on first use.
        if not self._par_tuple:
            self._par_tuple = [(x.start.contig, x.start.position, x.end.position) for x in self.par]
        return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple)

    def __eq__(self, other):
        # Equality and hashing are both delegated to the JVM object.
        return isinstance(other, ReferenceGenome) and self._jrep.equals(other._jrep)

    def __hash__(self):
        return self._jrep.hashCode()

    @property
    def name(self):
        """Name of reference genome.

        Returns
        -------
        :obj:`str`
        """
        if self._name is None:
            self._name = self._jrep.name()
        return self._name

    @property
    def contigs(self):
        """Contig names.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        if self._contigs is None:
            self._contigs = [str(x) for x in self._jrep.contigs()]
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        Returns
        -------
        :obj:`dict` of :obj:`str` to :obj:`int`
        """
        if self._lengths is None:
            # Each JVM element exposes ``_1()`` (name) and ``_2()`` (length).
            self._lengths = {str(x._1()): int(x._2()) for x in jiterable_to_list(self._jrep.lengths())}
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        if self._x_contigs is None:
            self._x_contigs = [str(x) for x in jiterable_to_list(self._jrep.xContigs())]
        return self._x_contigs

    @property
    def y_contigs(self):
        """Y contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        if self._y_contigs is None:
            self._y_contigs = [str(x) for x in jiterable_to_list(self._jrep.yContigs())]
        return self._y_contigs

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        Returns
        -------
        :obj:`list` of :obj:`str`
        """
        if self._mt_contigs is None:
            self._mt_contigs = [str(x) for x in jiterable_to_list(self._jrep.mtContigs())]
        return self._mt_contigs

    @property
    def par(self):
        """Pseudoautosomal regions.

        Returns
        -------
        :obj:`list` of :class:`.Interval`
        """
        # Imported here rather than at module level, as in the original code.
        from hail.utils.interval import Interval
        if self._par is None:
            self._par = [Interval._from_java(jrep, hl.tlocus(self)) for jrep in self._jrep.par()]
        return self._par

    @typecheck_method(contig=str)
    def contig_length(self, contig):
        """Contig length.

        Parameters
        ----------
        contig : :obj:`str`
            Contig name.

        Returns
        -------
        :obj:`int`
            Length of contig.

        Raises
        ------
        KeyError
            If `contig` is not a contig of this reference genome.
        """
        if contig in self.lengths:
            return self.lengths[contig]
        else:
            raise KeyError("Contig `{}' is not in reference genome.".format(contig))

    @classmethod
    @typecheck_method(path=str)
    def read(cls, path):
        """Load reference genome from a JSON file.

        Notes
        -----

        The JSON file must have the following format:

        .. code-block:: text

            {"name": "my_reference_genome",
             "contigs": [{"name": "1", "length": 10000000},
                         {"name": "2", "length": 20000000},
                         {"name": "X", "length": 19856300},
                         {"name": "Y", "length": 78140000},
                         {"name": "MT", "length": 532}],
             "xContigs": ["X"],
             "yContigs": ["Y"],
             "mtContigs": ["MT"],
             "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                     {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
            }

        `name` must be unique and not overlap with Hail's pre-instantiated
        references: ``'GRCh37'``, ``'GRCh38'``, and ``'default'``.
        The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
        present in `contigs`. The intervals listed in `par` must have contigs
        in either `xContigs` or `yContigs` and must have positions between 0
        and the contig length given in `contigs`.

        Parameters
        ----------
        path : :obj:`str`
            Path to JSON file.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        return ReferenceGenome._from_java(Env.hail().variant.ReferenceGenome.fromFile(Env.hc()._jhc, path))

    @typecheck_method(output=str)
    def write(self, output):
        """Write this reference genome to a file in JSON format.

        Examples
        --------

        >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200})
        >>> my_rg.write("output/new_reference.json")

        Notes
        -----

        Use :meth:`~hail.ReferenceGenome.read` to reimport the exported
        reference genome in a new HailContext session.

        Parameters
        ----------
        output : :obj:`str`
            Path of JSON file to write.
        """
        self._jrep.write(Env.hc()._jhc, output)

    @typecheck_method(fasta_file=str,
                      index_file=str)
    def add_sequence(self, fasta_file, index_file):
        """Load the reference sequence from a FASTA file.

        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on google cloud for Hail's built-in
        references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`str`
            Path to FASTA index file. Must be uncompressed.
        """
        self._jrep.addSequence(Env.hc()._jhc, fasta_file, index_file)

    def has_sequence(self):
        """True if the reference sequence has been loaded.

        Returns
        -------
        :obj:`bool`
        """
        return self._jrep.hasSequence()

    @classmethod
    @typecheck_method(name=str,
                      fasta_file=str,
                      index_file=str,
                      x_contigs=oneof(str, listof(str)),
                      y_contigs=oneof(str, listof(str)),
                      mt_contigs=oneof(str, listof(str)),
                      par=listof(sized_tupleof(str, int, int)))
    def from_fasta_file(cls, name, fasta_file, index_file, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        """Create reference genome from a FASTA file.

        Parameters
        ----------
        name: :obj:`str`
            Name for new reference genome.
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        # Normalize the arguments exactly as the constructor does before they
        # cross into the JVM: a bare contig name becomes a one-element list
        # and each PAR tuple becomes a "contig:start-end" string.
        # NOTE(review): assumes fromFASTAFile takes the same list-of-string
        # inputs as ReferenceGenome.apply -- confirm against the JVM signature.
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)
        par_strings = ["{}:{}-{}".format(contig, start, end) for (contig, start, end) in par]

        return ReferenceGenome._from_java(
            Env.hail().variant.ReferenceGenome.fromFASTAFile(
                Env.hc()._jhc, name, fasta_file, index_file,
                x_contigs, y_contigs, mt_contigs, par_strings))

    def _init_from_java(self, jrep):
        self._jrep = jrep

    @classmethod
    def _from_java(cls, jrep):
        """Wrap an existing JVM reference genome object without calling ``__init__``.

        All cached attributes start as ``None`` so the properties fetch them
        from `jrep` on first access. The new instance is registered in
        ``_references`` under its name.
        """
        gr = ReferenceGenome.__new__(cls)
        gr._init_from_java(jrep)
        gr._name = None
        gr._contigs = None
        gr._lengths = None
        gr._x_contigs = None
        gr._y_contigs = None
        gr._mt_contigs = None
        gr._par = None
        gr._par_tuple = None
        super(ReferenceGenome, gr).__init__()
        ReferenceGenome._references[gr.name] = gr
        return gr

    def _check_locus(self, l_jrep):
        # Validation is performed by the JVM object.
        self._jrep.checkLocus(l_jrep)

    def _check_interval(self, interval_jrep):
        # Validation is performed by the JVM object.
        self._jrep.checkInterval(interval_jrep)
class tstruct(HailType, Mapping):
    """Hail type for structured groups of heterogeneous fields.

    In Python, values of this type are represented as :class:`.Struct`.

    A :class:`.tstruct` is an ordered mapping from unique field names to
    field types, and may contain any combination of types. It is commonly
    used to compose types into nested structures.

    Each component of a :class:`.Table` and :class:`.MatrixTable` is a
    struct:

     - :meth:`.Table.row`
     - :meth:`.Table.globals`
     - :meth:`.MatrixTable.row`
     - :meth:`.MatrixTable.col`
     - :meth:`.MatrixTable.entry`
     - :meth:`.MatrixTable.globals`

    Structs also appear below the top-level component types. Consider the
    following join:

    >>> new_table = table1.annotate(table2_fields = table2.index(table1.key))

    This adds a field called ``table2_fields`` to ``table1``. In the new
    table, ``table2_fields`` is a struct containing all the non-key fields
    from ``table2``.

    Parameters
    ----------
    field_types : keyword args of :class:`.HailType`
        Fields.

    See Also
    --------
    :class:`.StructExpression`, :class:`.Struct`
    """

    @typecheck_method(field_types=hail_type)
    def __init__(self, **field_types):
        self._field_types = field_types
        # Field names, in keyword order.
        self._fields = tuple(field_types)
        super(tstruct, self).__init__()

    @property
    def types(self):
        """Struct field types.

        Returns
        -------
        :obj:`tuple` of :class:`.HailType`
        """
        field_types = self._field_types.values()
        return tuple(field_types)

    @property
    def fields(self):
        """Struct field names.

        Returns
        -------
        :obj:`tuple` of :class:`str`
            Tuple of struct field names.
        """
        return self._fields

    def _traverse(self, obj, f):
        # Descend into the fields of `obj` only when `f` accepts this level.
        if not f(self, obj):
            return
        for name, value in obj.items():
            self[name]._traverse(value, f)

    def _typecheck_one_level(self, annotation):
        # Falsy annotations are not checked.
        if not annotation:
            return
        if not isinstance(annotation, Mapping):
            raise TypeError("type 'struct' expected type Mapping (e.g. dict or hail.utils.Struct), but found '%s'" %
                            type(annotation))
        known = set(self)
        for name in annotation:
            if name not in known:
                raise TypeError("type '%s' expected fields '%s', but found fields '%s'" %
                                (self, list(self), list(annotation)))

    @typecheck_method(item=oneof(int, str))
    def __getitem__(self, item):
        # Anything that is not a name is treated as a position.
        name = item if isinstance(item, str) else self._fields[item]
        return self._field_types[name]

    def __iter__(self):
        return iter(self._field_types)

    def __len__(self):
        return len(self._fields)

    def __str__(self):
        rendered = ['{}: {}'.format(escape_parsable(name), str(typ))
                    for name, typ in self.items()]
        return "struct{{{}}}".format(', '.join(rendered))

    def _eq(self, other):
        if not isinstance(other, tstruct):
            return False
        if self._fields != other._fields:
            return False
        return all(self[name] == other[name] for name in self._fields)

    def _pretty(self, b, indent, increment):
        # `b` collects the output pieces; the caller joins them.
        if not self._fields:
            b.append('struct {}')
            return

        outer_indent = indent
        inner_indent = indent + increment
        b.append('struct {')
        for position, (name, typ) in enumerate(self.items()):
            if position > 0:
                b.append(', ')
            b.append('\n')
            b.append(' ' * inner_indent)
            b.append('{}: '.format(escape_parsable(name)))
            typ._pretty(b, inner_indent, increment)
        b.append('\n')
        b.append(' ' * outer_indent)
        b.append('}')

    def _parsable_string(self):
        rendered = ['{}:{}'.format(escape_parsable(name), typ._parsable_string())
                    for name, typ in self.items()]
        return "Struct{{{}}}".format(','.join(rendered))

    def _convert_from_json(self, x):
        from hail.utils import Struct
        converted = {name: typ._convert_from_json_na(x.get(name))
                     for name, typ in self.items()}
        return Struct(**converted)

    def _convert_to_json(self, x):
        return {name: typ._convert_to_json_na(x[name])
                for name, typ in self.items()}

    def _is_prefix_of(self, other):
        if not isinstance(other, tstruct):
            return False
        if len(self._fields) > len(other._fields):
            return False
        # Only the field types are compared pairwise, not the field names.
        pairs = zip(self._field_types.values(), other._field_types.values())
        return all(mine == theirs for mine, theirs in pairs)

    def _concat(self, other):
        # Fields of `other` win on a name clash.
        return tstruct(**{**self._field_types, **other._field_types})

    def _insert(self, path, t):
        if not path:
            return t

        head, rest = path[0], path[1:]
        child = self.get(head)
        # A missing, falsy or non-struct child is replaced by an empty struct.
        if not child or not isinstance(child, tstruct):
            child = tstruct()
        return self._insert_fields(**{head: child._insert(rest, t)})

    def _insert_field(self, field, typ):
        return self._insert_fields(**{field: typ})

    def _insert_fields(self, **new_fields):
        # New fields override existing ones of the same name.
        return tstruct(**{**self._field_types, **new_fields})

    def _drop_fields(self, fields):
        kept = {name: typ for name, typ in self.items() if name not in fields}
        return tstruct(**kept)

    def _select_fields(self, fields):
        selected = {name: self[name] for name in fields}
        return tstruct(**selected)

    def _index_path(self, path):
        current = self
        for step in path:
            current = current[step]
        return current

    def _rename(self, map):
        # Maps each new name to the original field it came from, to detect clashes.
        origin = {}
        renamed = {}
        for old_name, typ in self.items():
            new_name = map.get(old_name, old_name)
            if new_name in origin:
                raise ValueError(
                    "Cannot rename two fields to the same name: attempted to rename {} and {} both to {}".format(
                        repr(origin[new_name]), repr(old_name), repr(new_name)))
            origin[new_name] = old_name
            renamed[new_name] = typ
        return tstruct(**renamed)

    def unify(self, t):
        if not isinstance(t, tstruct) or len(self) != len(t):
            return False
        # Stops at the first pair whose names differ or whose types fail to unify.
        return all(name == other_name and typ.unify(other_typ)
                   for (name, typ), (other_name, other_typ) in zip(self.items(), t.items()))

    def subst(self):
        substituted = {name: typ.subst() for name, typ in self.items()}
        return tstruct(**substituted)

    def clear(self):
        for typ in self.values():
            typ.clear()

    def _get_context(self):
        return HailTypeContext.union(*self.values())
ht = ht.drop(*fields) ht = ht.explode(ht['_col_val']) ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]}) ht = ht.drop('_col_val') ht_tmp = new_temp_file() ht.write(ht_tmp) return hl.read_table(ht_tmp) @typecheck(ht=Table, field=str, value=str, key=nullable(oneof(str, sequenceof(str)))) def spread(ht, field, value, key=None) -> Table: """Spread a key-value pair of fields across multiple fields. :func:`.spread` mimics the functionality of the `spread()` function in R's `tidyr` package. This is a way to turn "long" format data into "wide" format data. Given a ``field``, :func:`.spread` will create a new table by grouping ``ht`` by its row key and, optionally, any additional fields passed to the ``key`` argument. After collapsing ``ht`` by these keys, :func:`.spread` creates a new row field for each unique value of ``field``, where the row field values are given by the corresponding ``value`` in the original ``ht``.
>>> hl.eval(hl.nd.array(hl.range(10, 20))) array([10, 11, 12, 13, 14, 15, 16, 17, 18, 19], dtype=int32) Parameters ---------- input_array : :class:`.ArrayExpression` or numpy ndarray or nested python lists Returns ------- :class:`.NDArrayExpression` An ndarray based on the input array. """ return _ndarray(input_array) shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple()) @typecheck(a=expr_array(), shape=shape_type) def from_column_major(a, shape): assert len(shape) == 2 return array(a).reshape(tuple(reversed(shape))).T @typecheck(start=expr_int32, stop=nullable(expr_int32), step=expr_int32) def arange(start, stop=None, step=1) -> NDArrayNumericExpression: """Returns a 1-dimensions ndarray of integers from `start` to `stop` by `step`. Examples --------
raise NotImplementedError @abc.abstractmethod def clear(self): raise NotImplementedError def _get_context(self): return _empty_context def get_context(self): if self._context is None: self._context = self._get_context() return self._context hail_type = oneof(HailType, transformed((str, dtype))) class _tvoid(HailType): def __init__(self): super(_tvoid, self).__init__() def __str__(self): return "void" def _eq(self, other): return isinstance(other, _tvoid) def _parsable_string(self): return "Void"
L = k + 2 else: L = iteration_size assert ((q + 1) * L >= k) n = A.ncols # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) G = hl.nd.qr(G)[0]._persist() fact = _krylov_factorization(A, G, q, compute_U) info("_reduced_svd: Computing local SVD") return fact.reduced_svd(k) @typecheck(A=oneof(expr_float64, TallSkinnyMatrix), num_moments=int, p=nullable(int), moment_samples=int, block_size=int) def _spectral_moments(A, num_moments, p=None, moment_samples=500, block_size=128): if not isinstance(A, TallSkinnyMatrix): check_entry_indexed('_spectral_moments/entry_expr', A) A = _make_tsm_from_call(A, block_size) n = A.ncols
class tunion(HailType, Mapping):
    """Tagged union type.

    Values of type union represent one of several heterogeneous, named
    cases. The type behaves as a mapping from case name to case type.
    """

    @typecheck_method(case_types=hail_type)
    def __init__(self, **case_types):
        """Create a tagged union type.

        Parameters
        ----------
        case_types : keyword args of :class:`.HailType`
            The union cases.
        """
        super(tunion, self).__init__()
        self._case_types = case_types
        # Case names, in keyword order.
        self._cases = tuple(case_types)

    @property
    def cases(self):
        """Return union case names.

        Returns
        -------
        :obj:`tuple` of :obj:`str`
            Tuple of union case names
        """
        return self._cases

    @typecheck_method(item=oneof(int, str))
    def __getitem__(self, item):
        # An integer selects a case by position; anything else is used as the name.
        name = self._cases[item] if isinstance(item, int) else item
        return self._case_types[name]

    def __iter__(self):
        return iter(self._case_types)

    def __len__(self):
        return len(self._cases)

    def __str__(self):
        rendered = ['{}: {}'.format(escape_parsable(name), str(typ))
                    for name, typ in self.items()]
        return "union{{{}}}".format(', '.join(rendered))

    def _eq(self, other):
        if not isinstance(other, tunion):
            return False
        if self._cases != other._cases:
            return False
        return all(self[name] == other[name] for name in self._cases)

    def _pretty(self, l, indent, increment):
        # `l` collects the output pieces; the caller joins them.
        if not self._cases:
            l.append('union {}')
            return

        outer_indent = indent
        inner_indent = indent + increment
        l.append('union {')
        for position, (name, typ) in enumerate(self.items()):
            if position > 0:
                l.append(', ')
            l.append('\n')
            l.append(' ' * inner_indent)
            l.append('{}: '.format(escape_parsable(name)))
            typ._pretty(l, inner_indent, increment)
        l.append('\n')
        l.append(' ' * outer_indent)
        l.append('}')

    def _parsable_string(self):
        rendered = ['{}:{}'.format(escape_parsable(name), typ._parsable_string())
                    for name, typ in self.items()]
        return "Union{{{}}}".format(','.join(rendered))

    def unify(self, t):
        if not isinstance(t, tunion) or len(self) != len(t):
            return False
        # Stops at the first pair whose names differ or whose types fail to unify.
        return all(name == other_name and typ.unify(other_typ)
                   for (name, typ), (other_name, other_typ) in zip(self.items(), t.items()))

    def subst(self):
        substituted = {name: typ.subst() for name, typ in self.items()}
        return tunion(**substituted)

    def clear(self):
        for typ in self.values():
            typ.clear()

    def _get_context(self):
        return HailTypeContext.union(*self.values())
pct = on_diag / total_obs * 100 if total_obs > 0 else float('nan') info(f"concordance: total concordance {pct:.2f}%") per_variant = joined.annotate_rows(concordance=aggr) per_variant = per_variant.select_rows( concordance=concordance_array(per_variant.concordance), n_discordant=n_discordant(per_variant.concordance)) per_sample = joined.annotate_cols(concordance=aggr) per_sample = per_sample.select_cols( concordance=concordance_array(per_sample.concordance), n_discordant=n_discordant(per_sample.concordance)) return glob, per_sample.cols(), per_variant.rows() @typecheck(dataset=oneof(Table, MatrixTable), config=str, block_size=int, name=str, csq=bool) def vep(dataset: Union[Table, MatrixTable], config, block_size=1000, name='vep', csq=False): """Annotate variants with VEP. .. include:: ../_templates/req_tvariant.rst :func:`.vep` runs `Variant Effect Predictor <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ on the
@author: nbaya """ import hail as hl from hail.typecheck import typecheck, oneof, nullable from hail.expr.expressions import expr_float64, expr_int32, expr_array, expr_call from hail.matrixtable import MatrixTable from hail.table import Table from hail.utils.java import Env import numpy as np import pandas as pd import scipy.stats as stats @typecheck(mt=MatrixTable, genotype=oneof(expr_int32, expr_float64, expr_call), h2=(oneof(float, int, list, np.ndarray)), pi=nullable(oneof(float, int, list, np.ndarray)), rg=nullable(oneof(float, int, list, np.ndarray)), annot=nullable(oneof(expr_float64, expr_int32)), popstrat=nullable(oneof(expr_int32, expr_float64)), popstrat_var=nullable(oneof(float, int)), exact_h2=bool) def simulate_phenotypes(mt, genotype, h2, pi=None, rg=None, annot=None, popstrat=None, popstrat_var=None,
.or_error("'filter_samples': unexpected local allele: old index=" + hl.str(old_idx)) vd = vd.annotate_entries(LA=vd.LA.map(lambda la: new_la_index(la))) vd = vd.key_rows_by('locus') vd = vd.annotate_rows( alleles=vd.__kept_indices.keys().map(lambda i: vd.alleles[i])) vd = vd._key_rows_by_assert_sorted('locus', 'alleles') vd = vd.drop('__allele_counts', '__kept_indices', '__old_to_new_LA') return VariantDataset(reference_data, vd) variant_data = variant_data.filter_rows(hl.agg.count() > 0) return VariantDataset(reference_data, variant_data) @typecheck(vds=VariantDataset, calling_intervals=oneof(Table, expr_array(expr_interval(expr_locus()))), normalization_contig=str) def impute_sex_chromosome_ploidy(vds: VariantDataset, calling_intervals, normalization_contig: str) -> hl.Table: """Impute sex chromosome ploidy from depth of reference data within calling intervals. Returns a :class:`.Table` with sample ID keys, with the following fields: - ``autosomal_mean_dp`` (*float64*): Mean depth on calling intervals on normalization contig. - ``x_mean_dp`` (*float64*): Mean depth on calling intervals on X chromosome. - ``x_ploidy`` (*float64*): Estimated ploidy on X chromosome. Equal to ``2 * x_mean_dp / autosomal_mean_dp``. - ``y_mean_dp`` (*float64*): Mean depth on calling intervals on chromosome. - ``y_ploidy`` (*float64*): Estimated ploidy on Y chromosome. Equal to ``2 * y_mean_db / autosomal_mean_dp``. Parameters ----------