Ejemplo n.º 1
0
class TableFromBlockMatrixNativeReader(TableReader):
    """Reader description built from a path, a partition count and a cache limit.

    ``render`` serializes the three settings, together with the reader name,
    as an escaped JSON string.
    """

    @typecheck_method(path=str,
                      n_partitions=nullable(int),
                      maximum_cache_memory_in_bytes=nullable(int))
    def __init__(self, path, n_partitions, maximum_cache_memory_in_bytes):
        self.maximum_cache_memory_in_bytes = maximum_cache_memory_in_bytes
        self.n_partitions = n_partitions
        self.path = path

    def render(self):
        """Return the reader configuration as an escaped JSON string."""
        config = {}
        config['name'] = 'TableFromBlockMatrixNativeReader'
        config['path'] = self.path
        config['nPartitions'] = self.n_partitions
        config['maximumCacheMemoryInBytes'] = self.maximum_cache_memory_in_bytes
        serialized = json.dumps(config)
        return escape_str(serialized)

    def __eq__(self, other):
        """Two readers are equal when they have this type and the same settings."""
        if not isinstance(other, TableFromBlockMatrixNativeReader):
            return False
        return (other.path == self.path
                and other.n_partitions == self.n_partitions
                and other.maximum_cache_memory_in_bytes == self.maximum_cache_memory_in_bytes)
Ejemplo n.º 2
0
class TableNativeReader(TableReader):
    """Reader description for a table at ``path``, optionally restricted to intervals.

    ``render`` serializes the reader (and, when intervals were given, the
    interval options) as an escaped JSON string.
    """

    @typecheck_method(path=str,
                      intervals=nullable(sequenceof(anytype)),
                      filter_intervals=bool)
    def __init__(self, path, intervals, filter_intervals):
        """Store the reader settings and normalize ``intervals``.

        :param path: path rendered into the reader configuration.
        :param intervals: optional sequence of intervals; ``None`` means no
            interval options are rendered.
        :param filter_intervals: rendered as the ``filterIntervals`` option.
        :raises TypeError: if the imputed type of ``intervals`` is not an
            array of intervals.
        """
        # Always define the attribute so it exists even without intervals.
        self._interval_type = None
        t = None
        if intervals is not None:
            t = hl.expr.impute_type(intervals)
            # The input is valid only if it is an array AND its elements are
            # intervals, so it must be rejected when EITHER check fails.
            if not isinstance(t, hl.tarray) or not isinstance(
                    t.element_type, hl.tinterval):
                raise TypeError("'intervals' must be an array of tintervals")
            pt = t.element_type.point_type
            if isinstance(pt, hl.tstruct):
                self._interval_type = t
            else:
                # Non-struct points are wrapped in a single-field struct.
                self._interval_type = hl.tarray(
                    hl.tinterval(hl.tstruct(__point=pt)))

        self.path = path
        self.filter_intervals = filter_intervals
        if intervals is not None and t != self._interval_type:
            # The point type was wrapped above, so wrap the endpoints of every
            # interval the same way to keep values and type in agreement.
            self.intervals = [
                hl.Interval(hl.Struct(__point=i.start),
                            hl.Struct(__point=i.end), i.includes_start,
                            i.includes_end) for i in intervals
            ]
        else:
            self.intervals = intervals

    def render(self):
        """Return the reader configuration as an escaped JSON string."""
        reader = {'name': 'TableNativeReader', 'path': self.path}
        if self.intervals is not None:
            assert self._interval_type is not None
            reader['options'] = {
                'name':
                'NativeReaderOptions',
                'intervals':
                self._interval_type._convert_to_json(self.intervals),
                'intervalPointType':
                self._interval_type.element_type.point_type._parsable_string(),
                'filterIntervals':
                self.filter_intervals
            }
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        """Compare by type, path, (normalized) intervals and filter flag."""
        return isinstance(other, TableNativeReader) and \
            other.path == self.path and \
            other.intervals == self.intervals and \
            other.filter_intervals == self.filter_intervals
Ejemplo n.º 3
0
Archivo: typ.py Proyecto: wtsi-hgi/hail
class TLocus(Type):
    """
    Hail type corresponding to :class:`hail.representation.Locus`.

    .. include:: hailType.rst

    - `expression language documentation <types.html#locus>`__
    - in Python, values are instances of :class:`hail.representation.Locus`

    :param reference_genome: Reference genome to use. Default is :class:`~.HailContext.default_reference`.
    :type reference_genome: :class:`.GenomeReference`

    """

    @typecheck_method(reference_genome=nullable(GenomeReference),
                      required=bool)
    def __init__(self, reference_genome=None, required=False):
        self._rg = reference_genome if reference_genome else Env.hc().default_reference
        jtype = scala_object(Env.hail().expr, 'TLocus').apply(self._rg._jrep, required)
        super(TLocus, self).__init__(jtype)

    @classmethod
    def _from_java(cls, jtype):
        # Build the wrapper without running __init__; only the Java type and
        # its `required` flag are populated here.
        t = TLocus.__new__(cls)
        t._jtype = jtype
        t.required = jtype.required()
        return t

    def _convert_to_py(self, annotation):
        # A missing annotation is represented by None and passed through.
        if annotation is not None:
            return Locus._from_java(annotation)
        else:
            return annotation

    def _convert_to_j(self, annotation):
        if annotation is not None:
            return annotation._jrep
        else:
            return annotation

    def _typecheck(self, annotation):
        # Only None counts as "missing". Any other value, including falsy ones
        # such as 0 or '', must be a Locus.
        if annotation is None:
            if self.required:
                raise TypeCheckError('!TLocus cannot be missing')
        elif not isinstance(annotation, Locus):
            raise TypeCheckError('TLocus expected type hail.representation.Locus, but found %s' %
                                 type(annotation))

    def _repr(self):
        return "TLocus()"
Ejemplo n.º 4
0
class TableFromBlockMatrixNativeReader(TableReader):
    """Reader description built from a path and a partition count.

    ``render`` serializes both settings, together with the reader name, as an
    escaped JSON string.
    """

    @typecheck_method(path=str, n_partitions=nullable(int))
    def __init__(self, path, n_partitions):
        self.n_partitions = n_partitions
        self.path = path

    def render(self):
        """Return the reader configuration as an escaped JSON string."""
        config = {}
        config['name'] = 'TableFromBlockMatrixNativeReader'
        config['path'] = self.path
        config['nPartitions'] = self.n_partitions
        serialized = json.dumps(config)
        return escape_str(serialized)

    def __eq__(self, other):
        """Two readers are equal when they have this type and the same settings."""
        if not isinstance(other, TableFromBlockMatrixNativeReader):
            return False
        return (other.path == self.path
                and other.n_partitions == self.n_partitions)
Ejemplo n.º 5
0
class StringTableReader(TableReader):
    """Reader description built from one or more paths and a minimum partition count.

    ``render`` serializes the settings, together with the reader name, as an
    escaped JSON string.
    """

    @typecheck_method(paths=oneof(str, sequenceof(str)),
                      min_partitions=nullable(int))
    def __init__(self, paths, min_partitions):
        self.paths = paths
        self.min_partitions = min_partitions

    def render(self):
        """Return the reader configuration as an escaped JSON string."""
        reader = {
            'name': 'StringTableReader',
            'files': self.paths,
            'minPartitions': self.min_partitions
        }
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        """Compare by type, paths and minimum partition count."""
        # The attribute set in __init__ is `paths`; comparing a non-existent
        # `path` attribute would raise AttributeError.
        return isinstance(other, StringTableReader) and \
            other.paths == self.paths and \
            other.min_partitions == self.min_partitions
Ejemplo n.º 6
0
class Function(object):
    """Callable wrapper that keeps a function together with its metadata.

    Stores the wrapped callable, its name, its parameter and return types and
    its type arguments; calling the instance forwards to the wrapped callable.
    """

    def __init__(self, f, param_types, ret_type, name, type_args=()):
        self._f, self._name = f, name
        self._type_args = type_args
        self._param_types, self._ret_type = param_types, ret_type

    def __call__(self, *args):
        """Invoke the wrapped callable with the given positional arguments."""
        wrapped = self._f
        return wrapped(*args)


@typecheck(f=anytype,
           param_types=hail_type,
           _name=nullable(str),
           type_args=tupleof(hail_type))
def define_function(f, *param_types, _name=None, type_args=()):
    mname = _name if _name is not None else Env.get_uid()
    param_names = [Env.get_uid() for _ in param_types]
    body = f(*(construct_expr(Ref(pn), pt)
               for pn, pt in zip(param_names, param_types)))
    ret_type = body.dtype

    r = CSERenderer(stop_at_jir=True)
    code = r(body._ir)
    jbody = body._ir.parse(code,
                           ref_map=dict(zip(param_names, param_types)),
                           ir_map=r.jirs)

    Env.hail().expr.ir.functions.IRFunctionRegistry.pyRegisterIR(
Ejemplo n.º 7
0
import hail as hl
import hail.expr.aggregators as agg
from hail.expr import (expr_float64, expr_call, expr_array, analyze,
                       matrix_table_source)
from hail.expr.types import tarray
from hail import ir
from hail.linalg import BlockMatrix
from hail.table import Table
from hail.typecheck import typecheck, nullable, numeric, enumeration

from ..pca import hwe_normalized_pca


@typecheck(call_expr=expr_call,
           min_individual_maf=numeric,
           k=nullable(int),
           scores_expr=nullable(expr_array(expr_float64)),
           min_kinship=nullable(numeric),
           statistics=enumeration('kin', 'kin2', 'kin20', 'all'),
           block_size=nullable(int),
           include_self_kinship=bool)
def pc_relate(call_expr,
              min_individual_maf,
              *,
              k=None,
              scores_expr=None,
              min_kinship=None,
              statistics="all",
              block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
Ejemplo n.º 8
0
    @property
    def default_reference(self) -> ReferenceGenome:
        """Return the stored default reference genome; asserts that it is set."""
        ref = self._default_ref
        assert ref is not None, '_default_ref should have been initialized in HailContext.create'
        return ref

    def stop(self):
        """Stop the backend and reset the session state held on ``Env``."""
        backend = self._backend
        backend.stop()
        self._backend = None
        # Reset the Env attributes to None, in the same order as before.
        for attr in ('_hc', '_dummy_table', '_seed_generator'):
            setattr(Env, attr, None)
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=nullable(str),
           default_reference=enumeration(*BUILTIN_REFERENCES),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
Ejemplo n.º 9
0
class ReferenceGenome(object):
    """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__.

    Examples
    --------

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [("X", 60001, 2699521)]
    >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)

    Notes
    -----
    Hail comes with predefined reference genomes (case sensitive!):

     - GRCh37, Genome Reference Consortium Human Build 37
     - GRCh38, Genome Reference Consortium Human Build 38
     - GRCm38, Genome Reference Consortium Mouse Build 38
     - CanFam3, Canis lupus familiaris (dog)

    You can access these reference genome objects using :func:`~hail.get_reference`:

    >>> rg = hl.get_reference('GRCh37')
    >>> rg = hl.get_reference('GRCh38')
    >>> rg = hl.get_reference('GRCm38')
    >>> rg = hl.get_reference('CanFam3')

    Note that constructing a new reference genome, either by using the class
    constructor or by using `read` will add the reference genome to the list of
    known references; it is possible to access the reference genome using
    :func:`~hail.get_reference` anytime afterwards.

    Note
    ----
    Reference genome names must be unique. It is not possible to overwrite the
    built-in reference genomes.

    Parameters
    ----------
    name : :class:`str`
        Name of reference. Must be unique and NOT one of Hail's
        predefined references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``,
        ``'CanFam3'`` and ``'default'``.
    contigs : :obj:`list` of :class:`str`
        Contig names.
    lengths : :obj:`dict` of :class:`str` to :obj:`int`
        Dict of contig names to contig lengths.
    x_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)
    """

    # Registry of every constructed reference genome, keyed by name.
    _references = {}

    @classmethod
    def _from_config(cls, config, _builtin=False):
        """Build a reference genome from a config dict in the JSON layout described in :meth:`read`."""
        def par_tuple(p):
            # A PAR interval must start and end on the same contig.
            assert p['start']['contig'] == p['end']['contig']
            return (p['start']['contig'], p['start']['position'],
                    p['end']['position'])

        contigs = config['contigs']
        return ReferenceGenome(config['name'], [c['name'] for c in contigs],
                               {c['name']: c['length']
                                for c in contigs}, config['xContigs'],
                               config['yContigs'], config['mtContigs'],
                               [par_tuple(p) for p in config['par']], _builtin)

    @typecheck_method(name=str,
                      contigs=sequenceof(str),
                      lengths=dictof(str, int),
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)),
                      _builtin=bool)
    def __init__(self,
                 name,
                 contigs,
                 lengths,
                 x_contigs=[],
                 y_contigs=[],
                 mt_contigs=[],
                 par=[],
                 _builtin=False):
        super(ReferenceGenome, self).__init__()

        contigs = wrap_to_list(contigs)
        # These lists are stored in the config and handed out by the
        # x_contigs / y_contigs / mt_contigs properties. Copy them so that
        # instances never share the mutable default argument (or a caller's
        # list) with each other. Assumes wrap_to_list returns a list.
        x_contigs = list(wrap_to_list(x_contigs))
        y_contigs = list(wrap_to_list(y_contigs))
        mt_contigs = list(wrap_to_list(mt_contigs))

        self._config = {
            'name':
            name,
            # Entries follow the iteration order of `lengths`.
            'contigs': [{
                'name': c,
                'length': l
            } for c, l in lengths.items()],
            'xContigs':
            x_contigs,
            'yContigs':
            y_contigs,
            'mtContigs':
            mt_contigs,
            'par': [{
                'start': {
                    'contig': c,
                    'position': s
                },
                'end': {
                    'contig': c,
                    'position': e
                }
            } for (c, s, e) in par]
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [
            hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
            for (c, s, e) in par
        ]
        # Computed lazily by _contig_global_position.
        self._global_positions = None

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        self._sequence_files = None
        self._liftovers = dict()

    def __str__(self):
        return self._config['name']

    def __repr__(self):
        return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple)

    def __eq__(self, other):
        return isinstance(other,
                          ReferenceGenome) and self._config == other._config

    def __hash__(self):
        return hash(self.name)

    @property
    def name(self):
        """Name of reference genome.

        Returns
        -------
        :class:`str`
        """
        return self._config['name']

    @property
    def contigs(self):
        """Contig names.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        Returns
        -------
        :obj:`dict` of :class:`str` to :obj:`int`
        """
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['xContigs']

    @property
    def y_contigs(self):
        """Y contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['yContigs']

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['mtContigs']

    @property
    def par(self):
        """Pseudoautosomal regions.

        Returns
        -------
        :obj:`list` of :class:`.Interval`
        """

        return self._par

    @typecheck_method(contig=str)
    def contig_length(self, contig):
        """Contig length.

        Parameters
        ----------
        contig : :class:`str`
            Contig name.

        Returns
        -------
        :obj:`int`
            Length of contig.

        Raises
        ------
        KeyError
            If `contig` is not a key of :attr:`lengths`.
        """
        if contig in self.lengths:
            return self.lengths[contig]
        else:
            raise KeyError(
                "Contig `{}' is not in reference genome.".format(contig))

    @typecheck_method(contig=str)
    def _contig_global_position(self, contig):
        """Return the sum of the lengths of all contigs listed before `contig`."""
        if self._global_positions is None:
            # Build the running-offset table once and reuse it afterwards.
            gp = {}
            lengths = self._lengths
            x = 0
            for c in self.contigs:
                gp[c] = x
                x += lengths[c]
            self._global_positions = gp
        return self._global_positions[contig]

    @classmethod
    @typecheck_method(path=str)
    def read(cls, path):
        """Load reference genome from a JSON file.

        Notes
        -----

        The JSON file must have the following format:

        .. code-block:: text

            {"name": "my_reference_genome",
             "contigs": [{"name": "1", "length": 10000000},
                         {"name": "2", "length": 20000000},
                         {"name": "X", "length": 19856300},
                         {"name": "Y", "length": 78140000},
                         {"name": "MT", "length": 532}],
             "xContigs": ["X"],
             "yContigs": ["Y"],
             "mtContigs": ["MT"],
             "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                     {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
            }


        `name` must be unique and not overlap with Hail's pre-instantiated
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'``, and
        ``'default'``.
        The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
        present in `contigs`. The intervals listed in `par` must have contigs in
        either `xContigs` or `yContigs` and must have positions between 0 and
        the contig length given in `contigs`.

        Parameters
        ----------
        path : :class:`str`
            Path to JSON file.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        with hl.hadoop_open(path) as f:
            return ReferenceGenome._from_config(json.load(f))

    @typecheck_method(output=str)
    def write(self, output):
        """Write this reference genome to a file in JSON format.

        Examples
        --------

        >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200})
        >>> my_rg.write(f"output/new_reference.json")

        Notes
        -----

        Use :meth:`~hail.genetics.ReferenceGenome.read` to reimport the exported
        reference genome in a new HailContext session.

        Parameters
        ----------
        output : :class:`str`
            Path of JSON file to write.
        """
        with hl.utils.hadoop_open(output, 'w') as f:
            json.dump(self._config, f)

    @typecheck_method(fasta_file=str, index_file=nullable(str))
    def add_sequence(self, fasta_file, index_file=None):
        """Load the reference sequence from a FASTA file.

        Examples
        --------
        Access the GRCh37 reference genome using :func:`~hail.get_reference`:

        >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

        Add a sequence file:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
        ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

        Add a sequence file with the default index location:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP


        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :class:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :class:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, replace
            the fasta_file's extension with `fai`.
        """
        if index_file is None:
            # Swap the final extension of the FASTA path for '.fai'.
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        self._sequence_files = (fasta_file, index_file)

    def has_sequence(self):
        """True if the reference sequence has been loaded.

        Returns
        -------
        :obj:`bool`
        """
        return self._sequence_files is not None

    def remove_sequence(self):
        """Remove the reference sequence."""
        self._sequence_files = None
        Env.backend().remove_sequence(self.name)

    @classmethod
    @typecheck_method(name=str,
                      fasta_file=str,
                      index_file=str,
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)))
    def from_fasta_file(cls,
                        name,
                        fasta_file,
                        index_file,
                        x_contigs=[],
                        y_contigs=[],
                        mt_contigs=[],
                        par=[]):
        """Create reference genome from a FASTA file.

        Parameters
        ----------
        name: :class:`str`
            Name for new reference genome.
        fasta_file : :class:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :class:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        # The backend receives each PAR region as a "contig:start-end" string.
        par_strings = [
            "{}:{}-{}".format(contig, start, end)
            for (contig, start, end) in par
        ]
        Env.backend().from_fasta_file(name, fasta_file, index_file, x_contigs,
                                      y_contigs, mt_contigs, par_strings)

        # _builtin=True skips the add_reference call in __init__; the backend
        # call above was already given this reference's name.
        rg = ReferenceGenome._from_config(Env.backend().get_reference(name),
                                          _builtin=True)
        rg._sequence_files = (fasta_file, index_file)
        return rg

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def has_liftover(self, dest_reference_genome):
        """``True`` if a liftover chain file is available from this reference
        genome to the destination reference.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`

        Returns
        -------
        :obj:`bool`
        """
        return dest_reference_genome.name in self._liftovers

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
        """
        if dest_reference_genome.name in self._liftovers:
            del self._liftovers[dest_reference_genome.name]
            Env.backend().remove_liftover(self.name,
                                          dest_reference_genome.name)

    @typecheck_method(chain_file=str,
                      dest_reference_genome=reference_genome_type)
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`~hail.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        This method can only be run once per destination reference genome. Use
        :meth:`~has_liftover` to test whether a chain file has been registered.

        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :class:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.

        Raises
        ------
        KeyError
            If a liftover to `dest_reference_genome` is already registered on
            this object.
        """

        # Validate before touching the backend, so that a rejected duplicate
        # leaves the backend state unchanged.
        if dest_reference_genome.name in self._liftovers:
            raise KeyError(
                f"Liftover already exists from {self.name} to {dest_reference_genome.name}."
            )
        Env.backend().add_liftover(self.name, chain_file,
                                   dest_reference_genome.name)
        self._liftovers[dest_reference_genome.name] = chain_file
Ejemplo n.º 10
0
import numpy as np
import hail as hl
from hail.table import Table
from hail.linalg import BlockMatrix
from hail.typecheck import typecheck, nullable, sequenceof, oneof
from hail.expr.expressions import expr_float64, expr_numeric, expr_locus
from hail.utils import new_temp_file, wrap_to_list


@typecheck(entry_expr=expr_float64,
           locus_expr=expr_locus(),
           radius=oneof(int, float),
           coord_expr=nullable(expr_float64),
           annotation_exprs=nullable(oneof(expr_numeric,
                                           sequenceof(expr_numeric))),
           block_size=nullable(int))
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
Ejemplo n.º 11
0
class Pedigree(object):
    """Class containing a list of trios, with extra functionality.

    :param trios: list of trio objects to include in pedigree
    :type trios: list of :class:`.Trio`
    """
    @typecheck_method(trios=sequenceof(Trio))
    def __init__(self, trios):
        # Stored as a tuple so the pedigree can be hashed (see __hash__).
        self._trios = tuple(trios)

    def __eq__(self, other):
        return isinstance(other, Pedigree) and self._trios == other._trios

    def __hash__(self):
        return hash(self._trios)

    def __iter__(self):
        return iter(self._trios)

    @classmethod
    @typecheck_method(fam_path=str, delimiter=str)
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        **Notes**

        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter (a regular expression).

        :rtype: :class:`.Pedigree`

        :raises FatalError: if a line does not have exactly 6 fields, or if a
            proband ID appears more than once.
        """

        trios = []
        missing_sex_count = 0
        missing_sex_values = set()
        # Compile once instead of resolving the pattern for every line.
        splitter = re.compile(delimiter)
        with Env.fs().open(fam_path) as file:
            for line in file:
                split_line = splitter.split(line.strip())
                num_fields = len(split_line)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}"
                        .format(num_fields, line))
                # The sixth column is read but not kept.
                fam, kid, dad, mom, sex, _ = split_line
                # 1 is male, 2 is female, 0 is unknown.
                is_female = (sex == "2") if sex in ("1", "2") else None

                if is_female is None:
                    missing_sex_count += 1
                    missing_sex_values.add(kid)

                # "0" marks an absent family / father / mother ID.
                trio = Trio(kid, fam if fam != "0" else None,
                            dad if dad != "0" else None,
                            mom if mom != "0" else None, is_female)
                trios.append(trio)

        proband_counts = Counter(trio.s for trio in trios)
        duplicate_ids = [
            sample_id for sample_id, count in proband_counts.items()
            if count > 1
        ]
        if duplicate_ids:
            raise FatalError(
                "Invalid pedigree: found duplicate proband IDs\n{}".format(
                    duplicate_ids))

        if missing_sex_count > 0:
            # Render the IDs as a sorted, comma-separated list so the message
            # is deterministic and does not show a raw set inside the brackets.
            warning(
                "Found {} samples with missing sex information (not 1 or 2).\n Missing samples: [{}]"
                .format(missing_sex_count,
                        ', '.join(sorted(missing_sex_values))))

        return Pedigree(trios)

    @property
    def trios(self):
        """Tuple of trio objects in this pedigree.

        :rtype: tuple of :class:`.Trio`
        """
        return self._trios

    def complete_trios(self):
        """List of trio objects that have a defined father and mother.

        :rtype: list of :class:`.Trio`
        """
        return [t for t in self.trios if t.is_complete()]

    @typecheck_method(samples=sequenceof(nullable(str)))
    def filter_to(self, samples):
        """Filter the pedigree to a given list of sample IDs.

        **Notes**

        For any trio, the following steps will be applied:

         - If the proband is not in the list of samples provided, the trio is removed.
         - If the father is not in the list of samples provided, `pat_id` is set to ``None``.
         - If the mother is not in the list of samples provided, `mat_id` is set to ``None``.

        :param samples: list of sample IDs to keep
        :type samples: list of str

        :rtype: :class:`.Pedigree`
        """
        sample_set = set(samples)

        filtered_trios = []
        for trio in self._trios:
            # A None result means the trio is dropped.
            restricted_trio = trio._restrict_to(sample_set)
            if restricted_trio is not None:
                filtered_trios.append(restricted_trio)

        return Pedigree(filtered_trios)

    @typecheck_method(path=str)
    def write(self, path):
        """Write a .fam file to the given path.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')
        >>> ped.write('output/out.fam')

        **Notes**

        This method writes a `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_.

        .. caution::

            Phenotype information is not preserved in the Pedigree data structure in Hail.
            Reading and writing a PLINK .fam file will result in loss of this information.
            Use the key table method :meth:`~hail.KeyTable.import_fam` to manipulate this
            information.

        :param path: output path
        :type path: str
        """

        # Render every line before opening the file, so a failure while
        # formatting a trio happens before the output is opened for writing.
        lines = [t._to_fam_file_line() for t in self._trios]

        with Env.fs().open(path, mode="w") as file:
            for line in lines:
                file.write(line + "\n")
Ejemplo n.º 12
0
    def stop(self):
        """Clear the JVM-side context, stop Spark and reset the state held on ``Env``."""
        Env.hail().HailContext.clear()
        spark_context = self.sc
        spark_context.stop()
        self.sc = None
        # Reset the Env attributes to None, in the same order as before.
        for attr in ('_jvm', '_gateway', '_hc'):
            setattr(Env, attr, None)
        uninstall_exception_handler()
        for attr in ('_dummy_table', '_seed_generator'):
            setattr(Env, attr, None)

    def upload_log(self):
        """Forward to ``uploadLog`` on the wrapped ``_jhc`` object."""
        jhc = self._jhc
        jhc.uploadLog()


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           idempotent=bool,
           global_seed=nullable(int),
           _backend=nullable(Backend))
def init(sc=None,
         app_name='Hail',
Ejemplo n.º 13
0
from hail.expr.expressions import construct_expr, anytype, expr_any, unify_all
from hail.typecheck import typecheck, nullable


class Function:
    """Callable that bundles a Python callable with its declared signature.

    The wrapped callable, its name, its parameter types and its return type
    are stored on the private attributes ``_f``, ``_name``, ``_param_types``
    and ``_ret_type``. Calling the instance forwards all positional arguments
    to the wrapped callable and returns its result.
    """

    def __init__(self, f, param_types, ret_type, name):
        # Signature metadata first, then the callable itself.
        self._name = name
        self._param_types = param_types
        self._ret_type = ret_type
        self._f = f

    def __call__(self, *args):
        wrapped = self._f
        return wrapped(*args)


@typecheck(f=anytype, param_types=hail_type, _name=nullable(str))
def define_function(f, *param_types, _name=None):
    mname = _name if _name is not None else Env.get_uid()
    param_names = [Env.get_uid() for _ in param_types]
    body = f(*(construct_expr(Ref(pn), pt)
               for pn, pt in zip(param_names, param_types)))
    ret_type = body.dtype

    r = CSERenderer(stop_at_jir=True)
    code = r(body._ir)
    jbody = body._ir.parse(code,
                           ref_map=dict(zip(param_names, param_types)),
                           ir_map=r.jirs)

    Env.hail().expr.ir.functions.IRFunctionRegistry.pyRegisterIR(
        mname, param_names, [pt._parsable_string() for pt in param_types],
Ejemplo n.º 14
0
    def stop(self):
        """Shut this context down and reset the state cached on ``Env``.

        Clears the ``HailContext`` reached through ``Env.hail()``, stops the
        Spark context held in ``self.sc``, removes the exception handler,
        sets the ``Env`` attributes assigned below to ``None``, clears the
        session functions via ``hail.ir``, and empties the
        ``ReferenceGenome._references`` mapping.
        """
        # The statement order is kept deliberately: the HailContext is cleared
        # while the Spark context is still alive.
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        # Drop the handles cached on Env (presumably the JVM/gateway bridge and
        # the active context -- confirm against Env's definition).
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        # Reset remaining per-session caches.
        Env._dummy_table = None
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        # Rebind to a fresh dict rather than clearing the old one in place.
        ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38'),
           idempotent=bool,
           global_seed=nullable(int),
           _optimizer_iterations=nullable(int),
           _backend=nullable(Backend))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
Ejemplo n.º 15
0
import hail as hl
from hail.expr.expressions import expr_float64, expr_numeric, analyze
from hail.typecheck import typecheck, oneof, sequenceof, nullable
from hail.table import Table
from hail.matrixtable import MatrixTable
from hail.utils import wrap_to_list, new_temp_file
import numpy as np


@typecheck(weight_expr=expr_float64,
           ld_score_expr=expr_numeric,
           chi_sq_exprs=oneof(expr_float64, sequenceof(expr_float64)),
           n_samples_exprs=oneof(expr_numeric, sequenceof(expr_numeric)),
           n_blocks=int,
           two_step_threshold=int,
           n_reference_panel_variants=nullable(int))
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:
Ejemplo n.º 16
0
class LinearMixedModel(object):
    r"""Class representing a linear mixed model.

    .. include:: ../_templates/experimental.rst

    :class:`LinearMixedModel` represents a linear model of the form

    .. math::

        y \sim \mathrm{N}(X \beta, \, \sigma^2 K + \tau^2 I)

    where

    - :math:`\mathrm{N}` is a :math:`n`-dimensional normal distribution.
    - :math:`y` is a known vector of :math:`n` observations.
    - :math:`X` is a known :math:`n \times p` design matrix for :math:`p` fixed effects.
    - :math:`K` is a known :math:`n \times n` positive semi-definite kernel.
    - :math:`I` is the :math:`n \times n` identity matrix.
    - :math:`\beta` is a :math:`p`-parameter vector of fixed effects.
    - :math:`\sigma^2` is the variance parameter on :math:`K`.
    - :math:`\tau^2` is the variance parameter on :math:`I`.

    In particular, the residuals for the :math:`i^\mathit{th}` and :math:`j^\mathit{th}`
    observations have covariance :math:`\sigma^2 K_{ij}` for :math:`i \neq j`.

    This model is equivalent to a
    `mixed model <https://en.wikipedia.org/wiki/Mixed_model>`__
    of the form

    .. math::

        y = X \beta + Z u + \epsilon

    by setting :math:`K = ZZ^T` where

    - :math:`Z` is a known :math:`n \times r` design matrix for :math:`r` random effects.
    - :math:`u` is a :math:`r`-vector of random effects drawn from :math:`\mathrm{N}(0, \sigma^2 I)`.
    - :math:`\epsilon` is a :math:`n`-vector of random errors drawn from :math:`\mathrm{N}(0, \tau^2 I)`.

    However, :class:`LinearMixedModel` does not itself realize :math:`K` as a linear kernel
    with respect to random effects, nor does it take :math:`K` explicitly as input. Rather,
    via the eigendecomposition :math:`K = U S U^T`, the class leverages a third, decorrelated
    form of the model

    .. math::

        Py \sim \mathrm{N}(PX \beta, \, \tau^2 (\gamma S + I))

    where

    - :math:`P = U^T: \mathbb{R}^n \rightarrow \mathbb{R}^n` is an orthonormal transformation
      that decorrelates the observations. The rows of :math:`P` are an eigenbasis for :math:`K`.
    - :math:`S` is the :math:`n \times n` diagonal matrix of corresponding eigenvalues.
    - :math:`\gamma = \frac{\sigma^2}{\tau^2}` is the ratio of variance parameters.

    Hence, the triple :math:`(Py, PX, S)` determines the probability
    of the observations for any choice of model parameters, and is
    therefore sufficient for inference.
    This triple, with S encoded as a vector, is the default
    ("full-rank") initialization of the class.

    :class:`LinearMixedModel` also provides an efficient strategy to fit the
    model above with :math:`K` replaced by its rank-:math:`r` approximation
    :math:`K_r = P_r^T S_r P_r` where

    - :math:`P_r: \mathbb{R}^n \rightarrow \mathbb{R}^r` has orthonormal rows
      consisting of the top :math:`r`  eigenvectors of :math:`K`.
    - :math:`S_r` is the :math:`r \times r` diagonal matrix of corresponding
      non-zero eigenvalues.

    For this low-rank model, the quintuple :math:`(P_r y, P_r X, S_r, y, X)`
    is similarly sufficient for inference and corresponds to the "low-rank"
    initialization of the class. Morally, :math:`y` and :math:`X` are
    required for low-rank inference because the diagonal :math:`\gamma S + I`
    is always full-rank.

    If :math:`K` actually has rank :math:`r`, then :math:`K = K_r`
    and the low-rank and full-rank models are equivalent.
    Hence low-rank inference provides a more efficient, equally-exact
    algorithm for fitting the full-rank model.
    This situation arises, for example, when :math:`K` is the linear kernel
    of a mixed model with fewer random effects than observations.

    Even when :math:`K` has full rank, using a lower-rank approximation may
    be an effective form of regularization, in addition to boosting
    computational efficiency.

    **Initialization**

    The class may be initialized directly or with one of two methods:

    - :meth:`from_kinship` takes :math:`y`, :math:`X`, and :math:`K` as ndarrays.
      The model is always full-rank.

    - :meth:`from_random_effects` takes :math:`y` and :math:`X` as ndarrays and
      :math:`Z` as an ndarray or block matrix. The model is full-rank if and
      only if :math:`n \leq m`.

    Direct full-rank initialization takes :math:`Py`, :math:`PX`, and :math:`S`
    as ndarrays. The following class attributes are set:

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `low_rank`
        - bool
        - ``False``
      * - `n`
        - int
        - Number of observations :math:`n`
      * - `f`
        - int
        - Number of fixed effects :math:`p`
      * - `r`
        - int
        - Effective number of random effects, must equal :math:`n`
      * - `py`
        - ndarray
        - Rotated response vector :math:`P y` with shape :math:`(n)`
      * - `px`
        - ndarray
        - Rotated design matrix :math:`P X` with shape :math:`(n, p)`
      * - `s`
        - ndarray
        - Eigenvalues vector :math:`S` of :math:`K` with shape :math:`(n)`
      * - `p_path`
        - str
        - Path at which :math:`P` is stored as a block matrix

    Direct low-rank initialization takes :math:`P_r y`, :math:`P_r X`, :math:`S_r`,
    :math:`y`, and :math:`X` as ndarrays. The following class attributes are set:

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `low_rank`
        - bool
        - ``True``
      * - `n`
        - int
        - Number of observations :math:`n`
      * - `f`
        - int
        - Number of fixed effects :math:`p`
      * - `r`
        - int
        - Effective number of random effects, must be less than :math:`n`
      * - `py`
        - ndarray
        - Projected response vector :math:`P_r y` with shape :math:`(r)`
      * - `px`
        - ndarray
        - Projected design matrix :math:`P_r X` with shape :math:`(r, p)`
      * - `s`
        - ndarray
        - Eigenvalues vector :math:`S_r` of :math:`K_r` with shape :math:`(r)`
      * - `y`
        - ndarray
        - Response vector with shape :math:`(n)`
      * - `x`
        - ndarray
        - Design matrix with shape :math:`(n, p)`
      * - `p_path`
        - str
        - Path at which :math:`P` is stored as a block matrix

    **Fitting the model**

    :meth:`fit` uses `restricted maximum likelihood
    <https://en.wikipedia.org/wiki/Restricted_maximum_likelihood>`__ (REML)
    to estimate :math:`(\beta, \sigma^2, \tau^2)`.

    This is done by numerical optimization of the univariate function
    :meth:`compute_neg_log_reml`, which itself optimizes REML constrained to a
    fixed ratio of variance parameters. Each evaluation of
    :meth:`compute_neg_log_reml` has computational complexity

    .. math::

      \mathit{O}(rp^2 + p^3).

    :meth:`fit` adds the following attributes at this estimate.

    .. list-table::
      :header-rows: 1

      * - Attribute
        - Type
        - Value
      * - `beta`
        - ndarray
        - :math:`\beta`
      * - `sigma_sq`
        - float
        - :math:`\sigma^2`
      * - `tau_sq`
        - float
        - :math:`\tau^2`
      * - `gamma`
        - float
        - :math:`\gamma = \frac{\sigma^2}{\tau^2}`
      * - `log_gamma`
        - float
        - :math:`\log{\gamma}`
      * - `h_sq`
        - float
        - :math:`\mathit{h}^2 = \frac{\sigma^2}{\sigma^2 + \tau^2}`
      * - `h_sq_standard_error`
        - float
        - asymptotic estimate of :math:`\mathit{h}^2` standard error

    **Testing alternative models**

    The model is also equivalent to its augmentation

    .. math::

        y \sim \mathrm{N}\left(x_\star\beta_\star + X \beta, \, \sigma^2 K + \tau^2 I\right)

    by an additional covariate of interest :math:`x_\star` under the
    null hypothesis that the corresponding fixed effect parameter
    :math:`\beta_\star` is zero. Similarly to initialization, full-rank testing
    of the alternative hypothesis :math:`\beta_\star \neq 0` requires
    :math:`P x_\star`, whereas the low-rank testing requires :math:`P_r x_\star`
    and :math:`x_\star`.

    After running :meth:`fit` to fit the null model, one can test each of a
    collection of alternatives using either of two implementations of the
    likelihood ratio test:

    - :meth:`fit_alternatives_numpy` takes one or two ndarrays. It is a pure Python
      method that evaluates alternatives serially on master.

    - :meth:`fit_alternatives` takes one or two paths to block matrices. It
      evaluates alternatives in parallel on the workers.

    Per alternative, both have computational complexity

    .. math::

      \mathit{O}(rp + p^3).

    Parameters
    ----------
    py: :class:`ndarray`
        Projected response vector :math:`P_r y` with shape :math:`(r)`.
    px: :class:`ndarray`
        Projected design matrix :math:`P_r X` with shape :math:`(r, p)`.
    s: :class:`ndarray`
        Eigenvalues vector :math:`S` with shape :math:`(r)`.
    y: :class:`ndarray`, optional
        Response vector with shape :math:`(n)`.
        Include for low-rank inference.
    x: :class:`ndarray`, optional
        Design matrix with shape :math:`(n, p)`.
        Include for low-rank inference.
    p_path: :obj:`str`, optional
        Path at which :math:`P` has been stored as a block matrix.
    """
    @typecheck_method(py=np.ndarray,
                      px=np.ndarray,
                      s=np.ndarray,
                      y=nullable(np.ndarray),
                      x=nullable(np.ndarray),
                      p_path=nullable(str))
    def __init__(self, py, px, s, y=None, x=None, p_path=None):
        # y and x travel together: both given selects the low-rank model, both
        # omitted selects the full-rank model, and a lone y or x is rejected.
        if (y is None) != (x is None):
            raise ValueError(
                'for low-rank, set both y and x; for full-rank, do not set y or x.'
            )
        low_rank = y is not None

        for array, label, ndim in ((py, 'py', 1), (px, 'px', 2), (s, 's', 1)):
            _check_dims(array, label, ndim)

        r = s.size
        f = px.shape[1]

        if py.size != r:
            raise ValueError("py and s must have the same size")
        if px.shape[0] != r:
            raise ValueError(
                "px must have the same number of rows as the size of s")

        # Full-rank models have as many observations as eigenvalues; low-rank
        # models take the observation count from y instead.
        n = r
        if low_rank:
            _check_dims(y, 'y', 1)
            _check_dims(x, 'x', 2)
            n = y.size
            if n <= r:
                raise ValueError("size of y must be larger than the size of s")
            if x.shape[0] != n:
                raise ValueError(
                    "x must have the same number of rows as the size of y")
            if x.shape[1] != f:
                raise ValueError("px and x must have the same number columns")

        if p_path is not None:
            # The block matrix stored at p_path must have shape (r, n).
            n_rows, n_cols = BlockMatrix.read(p_path).shape
            if n_cols != n:
                raise ValueError(
                    "LinearMixedModel: Number of columns in the block "
                    f"matrix at 'p_path' ({n_cols}) must equal "
                    f"the size of 'y' ({n})")
            if n_rows != r:
                raise ValueError(
                    "LinearMixedModel: Number of rows in the block "
                    f"matrix at 'p_path' ({n_rows}) must equal "
                    f"the size of 'py' ({r})")

        # Model description; these are set before the degrees-of-freedom check.
        self.low_rank = low_rank
        self.n = n
        self.f = f
        self.r = r
        self.py = py
        self.px = px
        self.s = s
        self.y = y
        self.x = x
        self.p_path = p_path

        self._check_dof()

        # Results populated by fit().
        self._fitted = False
        self.beta = None
        self.sigma_sq = None
        self.tau_sq = None
        self.gamma = None
        self.log_gamma = None
        self.h_sq = None
        self.h_sq_standard_error = None
        self.optimize_result = None

        # Products of the unprojected data, needed only for low-rank inference.
        if low_rank:
            self._yty = y @ y
            self._xty = x.T @ y
            self._xtx = x.T @ x

        # Intermediate quantities cached by compute_neg_log_reml for the null model.
        self._dof = n - f
        self._d = None
        self._ydy = None
        self._xdy = None
        self._xdx = None

        # Buffers for the alternative model, which has one extra fixed effect.
        self._dof_alt = n - (f + 1)
        self._d_alt = None
        self._ydy_alt = None
        self._xdy_alt = np.zeros(f + 1)
        self._xdx_alt = np.zeros((f + 1, f + 1))

        self._residual_sq = None

        # Built lazily by fit_alternatives.
        self._scala_model = None

    def _reset(self):
        self._fitted = False

        self.beta = None
        self.sigma_sq = None
        self.tau_sq = None
        self.gamma = None
        self.log_gamma = None
        self.h_sq = None
        self.h_sq_standard_error = None
        self.optimize_result = None

    def compute_neg_log_reml(self, log_gamma, return_parameters=False):
        r"""Compute negative log REML constrained to a fixed value
        of :math:`\log{\gamma}`.

        This function computes the triple :math:`(\beta, \sigma^2, \tau^2)` with
        :math:`\gamma = \frac{\sigma^2}{\tau^2}` at which the restricted
        likelihood is maximized and returns the negative of the restricted log
        likelihood at these parameters (shifted by the constant defined below).

        The implementation has complexity :math:`\mathit{O}(rp^2 + p^3)` and is
        inspired by `FaST linear mixed models for genome-wide association studies (2011)
        <https://www.nature.com/articles/nmeth.1681>`__.

        The formulae follow from `Bayesian Inference for Variance Components Using Only Error Contrasts (1974)
        <http://faculty.dbmi.pitt.edu/day/Bioinf2132-advanced-Bayes-and-R/previousDocuments/Bioinf2132-documents-2016/2016-11-22/Harville-1974.pdf>`__.
        Harville derives that for fixed covariance :math:`V`, the restricted
        likelihood of the variance parameter :math:`V` in the model

        .. math::

          y \sim \mathrm{N}(X \beta, \, V)

        is given by

        .. math::

          (2\pi)^{-\frac{1}{2}(n - p)}
          \det(X^T X)^\frac{1}{2}
          \det(V)^{-\frac{1}{2}}
          \det(X^T V^{-1} X)^{-\frac{1}{2}}
          e^{-\frac{1}{2}(y - X\hat\beta)^T V^{-1}(y - X\hat\beta)}.

        with

        .. math::

          \hat\beta = (X^T V^{-1} X)^{-1} X^T V^{-1} y.

        In our case, the variance is

        .. math::

          V = \sigma^2 K + \tau^2 I = \sigma^2 (K + \gamma^{-1} I)

        which is determined up to scale by any fixed value of the ratio
        :math:`\gamma`. So for input :math:`\log \gamma`, the
        negative restricted log likelihood is minimized at
        :math:`(\hat\beta, \hat\sigma^2)` with :math:`\hat\beta` as above and

        .. math::

           \hat\sigma^2 = \frac{1}{n - p}(y - X\hat\beta)^T (K + \gamma^{-1} I)^{-1}(y - X\hat\beta).

        For :math:`\hat V` at this :math:`(\hat\beta, \hat\sigma^2, \gamma)`,
        the exponent in the likelihood reduces to :math:`-\frac{1}{2}(n-p)`, so
        the negative restricted log likelihood may be expressed as

        .. math::

          \frac{1}{2}\left(\log \det(\hat V) + \log\det(X^T \hat V^{-1} X)\right) + C

        where

        .. math::

          C = \frac{1}{2}\left(n - p + (n - p)\log(2\pi) - \log\det(X^T X)\right)

        only depends on :math:`X`. :meth:`compute_neg_log_reml` returns the value of
        the first term, omitting the constant term.

        Parameters
        ----------
        log_gamma: :obj:`float`
            Value of :math:`\log{\gamma}`.
        return_parameters:
            If ``True``, also return :math:`\beta`, :math:`\sigma^2`,
            and :math:`\tau^2`.

        Returns
        -------
        :obj:`float` or (:obj:`float`, :class:`ndarray`, :obj:`float`, :obj:`float`)
            If `return_parameters` is ``False``, returns (shifted) negative log REML.
            Otherwise, returns (shifted) negative log REML, :math:`\beta`, :math:`\sigma^2`,
            and :math:`\tau^2`.
        """
        from scipy.linalg import solve, LinAlgError

        gamma = np.exp(log_gamma)
        d = 1 / (self.s + 1 / gamma)
        logdet_d = np.sum(np.log(d)) + (self.n - self.r) * log_gamma

        if self.low_rank:
            d -= gamma
            dpy = d * self.py
            ydy = self.py @ dpy + gamma * self._yty
            xdy = self.px.T @ dpy + gamma * self._xty
            xdx = (self.px.T * d) @ self.px + gamma * self._xtx
        else:
            dpy = d * self.py
            ydy = self.py @ dpy
            xdy = self.px.T @ dpy
            xdx = (self.px.T * d) @ self.px

        try:
            beta = solve(xdx, xdy, assume_a='pos')
            residual_sq = ydy - xdy.T @ beta
            sigma_sq = residual_sq / self._dof
            tau_sq = sigma_sq / gamma
            neg_log_reml = (np.linalg.slogdet(xdx)[1] - logdet_d +
                            self._dof * np.log(sigma_sq)) / 2

            self._d, self._ydy, self._xdy, self._xdx = d, ydy, xdy, xdx  # used in fit

            if return_parameters:
                return neg_log_reml, beta, sigma_sq, tau_sq
            else:
                return neg_log_reml
        except LinAlgError as e:
            raise Exception(
                'linear algebra error while solving for REML estimate') from e

    @typecheck_method(log_gamma=nullable(numeric),
                      bounds=tupleof(numeric),
                      tol=float,
                      maxiter=int)
    def fit(self, log_gamma=None, bounds=(-8.0, 8.0), tol=1e-8, maxiter=500):
        r"""Find the triple :math:`(\beta, \sigma^2, \tau^2)` maximizing REML.

        This method sets the attributes `beta`, `sigma_sq`, `tau_sq`, `gamma`,
        `log_gamma`, `h_sq`, and `h_sq_standard_error` as described in the
        top-level class documentation.

        If `log_gamma` is provided, :meth:`fit` finds the REML solution
        with :math:`\log{\gamma}` constrained to this value. In this case,
        `h_sq_standard_error` is ``None`` since `h_sq` is not estimated.

        Otherwise, :meth:`fit` searches for the value of :math:`\log{\gamma}`
        that minimizes :meth:`compute_neg_log_reml`, and also sets the attribute
        `optimize_result` of type `scipy.optimize.OptimizeResult
        <https://docs.scipy.org/doc/scipy/reference/generated/scipy.optimize.OptimizeResult.html>`__.

        Parameters
        ----------
        log_gamma: :obj:`float`, optional
            If provided, the solution is constrained to have this value of
            :math:`\log{\gamma}`.
        bounds: :obj:`float`, :obj:`float`
            Lower and upper bounds for :math:`\log{\gamma}`.
        tol: :obj:`float`
            Absolute tolerance for optimizing :math:`\log{\gamma}`.
        maxiter: :obj:`float`
            Maximum number of iterations for optimizing :math:`\log{\gamma}`.
        """
        if self._fitted:
            self._reset()

        fit_log_gamma = True if log_gamma is None else False

        if fit_log_gamma:
            from scipy.optimize import minimize_scalar

            self.optimize_result = minimize_scalar(self.compute_neg_log_reml,
                                                   method='bounded',
                                                   bounds=bounds,
                                                   options={
                                                       'xatol': tol,
                                                       'maxiter': maxiter
                                                   })

            if self.optimize_result.success:
                if self.optimize_result.x - bounds[0] < 0.001:
                    raise Exception(
                        "failed to fit log_gamma: optimum within 0.001 of lower bound."
                    )
                elif bounds[1] - self.optimize_result.x < 0.001:
                    raise Exception(
                        "failed to fit log_gamma: optimum within 0.001 of upper bound."
                    )
                else:
                    self.log_gamma = self.optimize_result.x
            else:
                raise Exception(
                    f'failed to fit log_gamma:\n  {self.optimize_result}')
        else:
            self.log_gamma = log_gamma

        _, self.beta, self.sigma_sq, self.tau_sq = self.compute_neg_log_reml(
            self.log_gamma, return_parameters=True)

        self.gamma = np.exp(self.log_gamma)
        self.h_sq = self.sigma_sq / (self.sigma_sq + self.tau_sq)

        self._residual_sq = self.sigma_sq * self._dof
        self._d_alt = self._d
        self._ydy_alt = self._ydy
        self._xdy_alt[1:] = self._xdy
        self._xdx_alt[1:, 1:] = self._xdx

        if fit_log_gamma:
            self.h_sq_standard_error = self._estimate_h_sq_standard_error()

        self._fitted = True

    def _estimate_h_sq_standard_error(self):
        epsilon = 1e-4  # parabolic interpolation radius in log_gamma space
        lg = self.log_gamma + np.array([-epsilon, 0.0, epsilon])
        h2 = 1 / (1 + np.exp(-lg))
        nll = [self.compute_neg_log_reml(lgi) for lgi in lg]

        if nll[1] > nll[0] or nll[1] > nll[2]:
            i = 0 if nll[1] > nll[0] else 2
            raise Exception(
                f'Minimum of negative log likelihood fit as {nll[1]} at log_gamma={lg[1]},'
                f'\n    but found smaller value of {nll[i]} at log_gamma={lg[i]}.'
                f'\n    Investigate by plotting the negative log likelihood function.'
            )

        # Asymptotically near MLE, nLL = a * h2^2 + b * h2 + c with a = 1 / (2 * se^2)
        # By Lagrange interpolation:
        a = ((h2[2] * (nll[1] - nll[0]) + h2[1] * (nll[0] - nll[2]) + h2[0] *
              (nll[2] - nll[1])) / ((h2[1] - h2[0]) * (h2[0] - h2[2]) *
                                    (h2[2] - h2[1])))

        return 1 / np.sqrt(2 * a)

    def h_sq_normalized_lkhd(self):
        r"""Estimate the normalized likelihood of :math:`\mathit{h}^2` over the
        discrete grid of percentiles.

        Examples
        --------
        Plot the estimated normalized likelihood function:

        >>> import matplotlib.pyplot as plt                     # doctest: +SKIP
        >>> plt.plot(range(101), model.h_sq_normalized_lkhd())  # doctest: +SKIP

        Notes
        -----
        This method may be used to visualize the approximate posterior on
        :math:`\mathit{h}^2` under a flat prior.

        The resulting ndarray ``a`` has length 101 with ``a[i]`` equal to the
        maximum likelihood over all :math:`\beta` and :math:`\sigma^2` with
        :math:`\mathit{h}^2` constrained to ``i / 100``. The values for
        ``1 <= i <= 99`` are normalized to sum to 1, and ``a[0]`` and ``a[100]``
        are set to ``nan``.

        Returns
        -------
        :class:`ndarray` of :obj:`float64`
            Normalized likelihood values for :math:`\mathit{h}^2`.
        """
        log_lkhd = np.zeros(101, dtype=np.float64)
        log_lkhd[0], log_lkhd[100] = np.nan, np.nan

        for h2 in range(1, 100):
            gamma = h2 / (100.0 - h2)
            log_lkhd[h2] = -self.compute_neg_log_reml(np.log(gamma))

        log_lkhd -= np.max(log_lkhd[1:-1])
        lkhd = np.exp(log_lkhd)
        lkhd /= np.sum(lkhd[1:-1])
        return lkhd

    @typecheck_method(pa_t_path=str,
                      a_t_path=nullable(str),
                      partition_size=nullable(int))
    def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
        r"""Fit and test alternative model for each augmented design matrix in parallel.

        Notes
        -----
        The alternative model is fit using REML constrained to the value of
        :math:`\gamma` set by :meth:`fit`.

        The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
        uses (non-restricted) maximum likelihood:

        .. math::

          \chi^2 = 2 \log\left(\frac{
          \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
          (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I)}
          {\max_{\beta, \sigma^2} \mathrm{N}
          (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I)}
          \right)

        The p-value is given by the tail probability under a chi-squared
        distribution with one degree of freedom.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number
        of rows (augmentations). These rows are grouped into partitions for
        parallel processing. The number of partitions equals the ceiling of
        ``n_rows / partition_size``, and should be at least the number of cores
        to make use of all cores. By default, there is one partition per row of
        blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
        (rather than approximate) divisor or multiple of the block size reduces
        superfluous shuffling of data.

        The number of columns in each block matrix must be less than :math:`2^{31}`.

        Warning
        -------
        The block matrices must be stored in row-major format, as results
        from :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
        :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method
        will produce an error message.

        Parameters
        ----------
        pa_t_path: :obj:`str`
            Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
            Each row is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a_t_path: :obj:`str`, optional
            Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
            Each row is an augmentation :math:`x_\star` of :math:`X`.
            Include for low-rank inference.
        partition_size: :obj:`int`, optional
            Number of rows to process per partition.
            Default given by block size of :math:`(P_r A)^T`.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.
        """
        from hail.table import Table

        # The alternative model has one more fixed effect than the null model.
        self._check_dof(self.f + 1)

        # a_t must be supplied exactly when the model is low-rank.
        has_a_t = a_t_path is not None
        if self.low_rank:
            if not has_a_t:
                raise ValueError('model is low-rank so a_t is required.')
        elif has_a_t:
            raise ValueError('model is full-rank so a_t must not be set.')

        # The JVM-side model is built on first use and reused afterwards.
        if self._scala_model is None:
            self._set_scala_model()

        backend = Env.spark_backend('LinearMixedModel.fit_alternatives')
        jfs = backend.fs._jfs
        linalg = Env.hail().linalg

        if partition_size is None:
            # Default to the block size recorded in the stored block matrix.
            partition_size = linalg.BlockMatrix.readMetadata(
                jfs, pa_t_path).blockSize()
        elif partition_size <= 0:
            raise ValueError(
                f'partition_size must be positive, found {partition_size}')

        jpa_t = linalg.RowMatrix.readBlockMatrix(jfs, pa_t_path, partition_size)

        maybe_ja_t = None
        if has_a_t:
            maybe_ja_t = Env.hail().linalg.RowMatrix.readBlockMatrix(
                jfs, a_t_path, partition_size)

        jtable = backend._jbackend.pyFitLinearMixedModel(self._scala_model,
                                                         jpa_t, maybe_ja_t)
        return Table._from_java(jtable)

    @typecheck_method(pa=np.ndarray,
                      a=nullable(np.ndarray),
                      return_pandas=bool)
    def fit_alternatives_numpy(self, pa, a=None, return_pandas=False):
        r"""Fit and test alternative model for each augmented design matrix.

        Notes
        -----
        This Python-only implementation runs serially on master. See
        the scalable implementation :meth:`fit_alternatives` for documentation
        of the returned table.

        Parameters
        ----------
        pa: :class:`ndarray`
            Projected matrix :math:`P_r A` of alternatives with shape :math:`(r, m)`.
            Each column is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a: :class:`ndarray`, optional
            Matrix :math:`A` of alternatives with shape :math:`(n, m)`.
            Each column is an augmentation :math:`x_\star` of :math:`X`.
            Required for low-rank inference; ignored for a full-rank model.
        return_pandas: :obj:`bool`
            If true, return pandas dataframe. If false, return Hail table.

        Returns
        -------
        :class:`.Table` or :class:`.pandas.DataFrame`
            Table of results for each augmented design matrix, with one row
            per column of `pa` and fields ``idx``, ``beta``, ``sigma_sq``,
            ``chi_sq`` and ``p_value``.

        Raises
        ------
        ValueError
            If the model is low-rank and `a` is not given.
        Exception
            If the null model has not been fit.
        """
        self._check_dof(self.f + 1)

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        n_cols = pa.shape[1]
        assert pa.shape[0] == self.r

        if self.low_rank:
            # `a` defaults to None; fail with a clear message (as
            # fit_alternatives does) instead of an AttributeError on a.shape.
            if a is None:
                raise ValueError('model is low-rank so a is required.')
            assert a.shape[0] == self.n and a.shape[1] == n_cols
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], a[:, i])
                    for i in range(n_cols)]
        else:
            data = [(i, ) + self._fit_alternative_numpy(pa[:, i], None)
                    for i in range(n_cols)]

        df = pd.DataFrame.from_records(
            data, columns=['idx', 'beta', 'sigma_sq', 'chi_sq', 'p_value'])

        if return_pandas:
            return df
        else:
            return Table.from_pandas(df, key='idx')

    def _fit_alternative_numpy(self, pa, a):
        from scipy.linalg import solve, LinAlgError
        from scipy.stats.distributions import chi2

        gamma = self.gamma
        dpa = self._d_alt * pa

        # single thread => no need to copy
        ydy = self._ydy_alt
        xdy = self._xdy_alt
        xdx = self._xdx_alt

        if self.low_rank:
            xdy[0] = self.py @ dpa + gamma * (self.y @ a)
            xdx[0, 0] = pa @ dpa + gamma * (a @ a)
            xdx[0, 1:] = self.px.T @ dpa + gamma * (self.x.T @ a)
        else:
            xdy[0] = self.py @ dpa
            xdx[0, 0] = pa @ dpa
            xdx[0, 1:] = self.px.T @ dpa

        try:
            beta = solve(xdx, xdy, assume_a='pos')  # only uses upper triangle
            residual_sq = ydy - xdy.T @ beta
            sigma_sq = residual_sq / self._dof_alt
            chi_sq = self.n * np.log(
                self._residual_sq / residual_sq)  # division => precision
            p_value = chi2.sf(chi_sq, 1)

            return beta[0], sigma_sq, chi_sq, p_value
        except LinAlgError:
            return tuple(4 * [float('nan')])

    def _set_scala_model(self):
        """Create the JVM-side model from the fitted state and cache it.

        The result of ``LinearMixedModel.pyApply`` on the Hail JVM package is
        stored on ``self._scala_model``. ``y`` and ``x`` are only converted
        and passed for a low-rank model; otherwise ``None`` is passed.

        Raises
        ------
        Exception
            If the null model has not been fit.
        """
        from hail.utils.java import Env
        from hail.linalg import _jarray_from_ndarray, _breeze_from_ndarray

        if not self._fitted:
            raise Exception("null model is not fit. Run 'fit' first.")

        py_apply = Env.hail().stats.LinearMixedModel.pyApply

        # Convert in the same order the values are handed to the JVM.
        j_py = _jarray_from_ndarray(self.py)
        j_px = _breeze_from_ndarray(self.px)
        j_d_alt = _jarray_from_ndarray(self._d_alt)
        j_xdy_alt = _jarray_from_ndarray(self._xdy_alt)
        j_xdx_alt = _breeze_from_ndarray(self._xdx_alt)

        if self.low_rank:
            j_y = _jarray_from_ndarray(self.y)
        else:
            j_y = None

        if self.low_rank:
            j_x = _breeze_from_ndarray(self.x)
        else:
            j_x = None

        self._scala_model = py_apply(self.gamma, self._residual_sq, j_py,
                                     j_px, j_d_alt, self._ydy_alt, j_xdy_alt,
                                     j_xdx_alt, j_y, j_x)

    def _check_dof(self, f=None):
        if f is None:
            f = self.f
        dof = self.n - f
        if dof <= 0:
            raise ValueError(
                f"{self.n} {plural('observation', self.n)} with {f} fixed {plural('effect', f)} "
                f"implies {dof} {plural('degree', dof)} of freedom. Must be positive."
            )

    @classmethod
    @typecheck_method(y=np.ndarray,
                      x=np.ndarray,
                      k=np.ndarray,
                      p_path=nullable(str),
                      overwrite=bool)
    def from_kinship(cls, y, x, k, p_path=None, overwrite=False):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`K`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> k = np.array([[ 1.        , -0.8727875 ,  0.96397335,  0.94512946],
        ...               [-0.8727875 ,  1.        , -0.93036112, -0.97320323],
        ...               [ 0.96397335, -0.93036112,  1.        ,  0.98294169],
        ...               [ 0.94512946, -0.97320323,  0.98294169,  1.        ]])
        >>> model, p = LinearMixedModel.from_kinship(y, x, k)
        >>> model.fit()
        >>> model.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.2525148830695317

        >>> model.s  # doctest: +SKIP_OUTPUT_CHECK
        array([3.83501295, 0.13540343, 0.02454114, 0.00504248])

        Truncate to a rank :math:`r=2` model:

        >>> r = 2
        >>> s_r = model.s[:r]
        >>> p_r = p[:r, :]
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)
        >>> model_r.fit()
        >>> model_r.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.25193197591429695

        Notes
        -----
        This method eigendecomposes :math:`K = P^T S P` on the master and
        returns ``LinearMixedModel(p @ y, p @ x, s)`` and ``p``.

        The performance of eigendecomposition depends critically on the
        number of master cores and the NumPy / SciPy configuration, viewable
        with ``np.show_config()``. For Intel machines, we recommend installing
        the `MKL <https://anaconda.org/anaconda/mkl>`__ package for Anaconda.

        `k` must be positive semi-definite; symmetry is not checked as only the
        lower triangle is used. The smallest eigenvalue may be slightly
        negative due to round-off: it is accepted as long as it is no less
        than ``-1e-12`` times the largest eigenvalue.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects.
        k: :class:`ndarray`
            :math:`n \times n` positive semi-definite kernel :math:`K`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`K`.
        p: :class:`ndarray`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.

        Raises
        ------
        ValueError
            If `k` is not square, or `y` or `x` does not have as many rows
            as `k`.
        Exception
            If the smallest eigenvalue of `k` is negative beyond the
            round-off tolerance described in the notes.
        """
        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(k, "k", 2)

        n = k.shape[0]
        if k.shape[1] != n:
            raise ValueError("from_kinship: 'k' must be a square matrix")
        if y.shape[0] != n:
            raise ValueError("from_kinship: 'y' and 'k' must have the same "
                             "number of rows")
        if x.shape[0] != n:
            raise ValueError("from_kinship: 'x' and 'k' must have the same "
                             "number of rows")

        # s[0] is treated as the smallest eigenvalue and s[-1] as the largest.
        s, u = hl.linalg._eigh(k)
        # Relative tolerance: the previous factor of -1e12 made this check
        # unreachable for any realistic input, so non-PSD kernels slipped by.
        if s[0] < -1e-12 * s[-1]:
            raise Exception("from_kinship: smallest eigenvalue of 'k' is "
                            f"negative: {s[0]}")

        # flip eigenvalues (and matching eigenvectors) to descending order
        s = np.flip(s, axis=0)
        u = np.fliplr(u)
        p = u.T
        if p_path:
            BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)

        model = LinearMixedModel(p @ y, p @ x, s, p_path=p_path)
        return model, p

    @classmethod
    @typecheck_method(y=np.ndarray,
                      x=np.ndarray,
                      z=oneof(np.ndarray, hl.linalg.BlockMatrix),
                      p_path=nullable(str),
                      overwrite=bool,
                      max_condition_number=float,
                      complexity_bound=int)
    def from_random_effects(cls,
                            y,
                            x,
                            z,
                            p_path=None,
                            overwrite=False,
                            max_condition_number=1e-10,
                            complexity_bound=8192):
        r"""Initializes a model from :math:`y`, :math:`X`, and :math:`Z`.

        Examples
        --------
        >>> from hail.stats import LinearMixedModel
        >>> y = np.array([0.0, 1.0, 8.0, 9.0])
        >>> x = np.array([[1.0, 0.0],
        ...               [1.0, 2.0],
        ...               [1.0, 1.0],
        ...               [1.0, 4.0]])
        >>> z = np.array([[0.0, 0.0, 1.0],
        ...               [0.0, 1.0, 2.0],
        ...               [1.0, 2.0, 4.0],
        ...               [2.0, 4.0, 8.0]])
        >>> model, p = LinearMixedModel.from_random_effects(y, x, z)
        >>> model.fit()
        >>> model.h_sq  # doctest: +SKIP_OUTPUT_CHECK
        0.38205307244271675

        Notes
        -----
        If :math:`n \leq m`, the returned model is full rank.

        If :math:`n > m`, the returned model is low rank. In this case only,
        eigenvalues less than or equal to `max_condition_number` times the top
        eigenvalue are dropped from :math:`S`, with the corresponding
        eigenvectors dropped from :math:`P`. This guards against precision
        loss on left eigenvectors computed via the right gramian :math:`Z^T Z`
        in :meth:`BlockMatrix.svd`.

        In either case, one can truncate to a rank :math:`r` model as follows.
        If `p` is an ndarray:

        >>> p_r = p[:r, :]     # doctest: +SKIP
        >>> s_r = model.s[:r]  # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x)  # doctest: +SKIP

        If `p` is a block matrix:

        >>> p[:r, :].write(p_r_path)          # doctest: +SKIP
        >>> p_r = BlockMatrix.read(p_r_path)  # doctest: +SKIP
        >>> s_r = model.s[:r]                 # doctest: +SKIP
        >>> model_r = LinearMixedModel(p_r @ y, p_r @ x, s_r, y, x, p_r_path)  # doctest: +SKIP

        This method applies no standardization to `z`.

        Warning
        -------
        If `z` is a block matrix, then ideally `z` should be the result of
        directly reading from disk (and possibly a transpose). This is most
        critical if :math:`n > m`, because in this case multiplication by `z`
        will result in all preceding transformations being repeated
        ``n / block_size`` times, as explained in :class:`.BlockMatrix`.

        At least one dimension must be less than or equal to 46300.
        See the warning in :meth:`.BlockMatrix.svd` for performance
        considerations.

        Parameters
        ----------
        y: :class:`ndarray`
            :math:`n` vector of observations :math:`y`.
        x: :class:`ndarray`
            :math:`n \times p` matrix of fixed effects :math:`X`.
        z: :class:`ndarray` or :class:`BlockMatrix`
            :math:`n \times m` matrix of random effects :math:`Z`.
        p_path: :obj:`str`, optional
            Path at which to write :math:`P` as a block matrix.
            Required if `z` is a block matrix.
        overwrite: :obj:`bool`
            If ``True``, overwrite an existing file at `p_path`.
        max_condition_number: :obj:`float`
            Maximum condition number. Must be at least 1e-16.
        complexity_bound: :obj:`int`
            Complexity bound for :meth:`.BlockMatrix.svd` when `z` is a block
            matrix.

        Returns
        -------
        model: :class:`LinearMixedModel`
            Model constructed from :math:`y`, :math:`X`, and :math:`Z`.
        p: :class:`ndarray` or :class:`.BlockMatrix`
            Matrix :math:`P` whose rows are the eigenvectors of :math:`K`.
            The type is block matrix if `z` is a block matrix and
            :meth:`.BlockMatrix.svd` of `z` returns :math:`U` as a block matrix.

        Raises
        ------
        ValueError
            If `z` is a block matrix and `p_path` is not given, if
            `max_condition_number` is below 1e-16, or if `y` or `x` does not
            have as many rows as `z`.
        """
        z_is_bm = isinstance(z, BlockMatrix)

        if z_is_bm and p_path is None:
            raise ValueError("from_random_effects: 'p_path' required when 'z' "
                             "is a block matrix.")

        if max_condition_number < 1e-16:
            raise ValueError(
                "from_random_effects: 'max_condition_number' must "
                f"be at least 1e-16, found {max_condition_number}")

        _check_dims(y, "y", 1)
        _check_dims(x, "x", 2)
        _check_dims(z, "z", 2)

        n, m = z.shape

        if y.shape[0] != n:
            raise ValueError("from_random_effects: 'y' and 'z' must have the "
                             "same number of rows")
        if x.shape[0] != n:
            raise ValueError("from_random_effects: 'x' and 'z' must have the "
                             "same number of rows")

        if z_is_bm:
            # BlockMatrix.svd may hand back U as either an ndarray or a
            # block matrix, so the type of p is checked after the fact.
            u, s0, _ = z.svd(complexity_bound=complexity_bound)
            p = u.T
            p_is_bm = isinstance(p, BlockMatrix)
        else:
            u, s0, _ = hl.linalg._svd(z, full_matrices=False)
            p = u.T
            p_is_bm = False

        # eigenvalues of K = Z Z^T are the squared singular values of Z
        s = s0**2

        low_rank = n > m

        if low_rank:
            assert np.all(np.isfinite(s))
            # s is descending, so -s is ascending: r is the number of
            # eigenvalues strictly above max_condition_number * s[0].
            r = int(np.searchsorted(-s, -max_condition_number * s[0]))
            if r < m:
                info(
                    f'from_random_effects: model rank reduced from {m} to {r} '
                    f'due to ill-condition.'
                    f'\n    Largest dropped eigenvalue was {s[r]}.')
            s = s[:r]
            p = p[:r, :]

        if p_path is not None:
            if p_is_bm:
                # write then re-read so later products use the stored matrix
                p.write(p_path, overwrite=overwrite)
                p = BlockMatrix.read(p_path)
            else:
                BlockMatrix.from_numpy(p).write(p_path, overwrite=overwrite)
        if p_is_bm:
            py, px = (p @ y.reshape(n, 1)).to_numpy().flatten(), (
                p @ x).to_numpy()
        else:
            py, px = p @ y, p @ x

        if low_rank:
            model = LinearMixedModel(py, px, s, y, x, p_path)
        else:
            model = LinearMixedModel(py, px, s, p_path=p_path)

        return model, p

    # checks agreement of model initialization
    def _same(self, other, tol=1e-6, up_to_sign=True):
        def same_rows_up_to_sign(a, b, atol):
            assert a.shape[0] == b.shape[0]
            return all(
                np.allclose(a[i], b[i], atol=atol)
                or np.allclose(-a[i], b[i], atol=atol)
                for i in range(a.shape[0]))

        close = same_rows_up_to_sign if up_to_sign else np.allclose

        if self.low_rank != other.low_rank:
            print(f'different low_rank: {self.low_rank}, {other.low_rank}')
            return False

        same = True
        if not close(self.py, other.py, atol=tol):
            print(f'different py:\n{self.py}\n{other.py}')
            same = False
        if not close(self.px, other.px, atol=tol):
            print(f'different px:\n{self.px}\n{other.px}')
            same = False
        if not np.allclose(self.s, other.s, atol=tol):
            print(f'different s:\n{self.s}\n{other.s}')
            same = False
        if self.low_rank and not close(self.y, other.y, atol=tol):
            print(f'different y:\n{self.y}\n{other.y}')
            same = False
        if self.low_rank and not close(self.x, other.x, atol=tol):
            print(f'different x\n{self.x}\n{other.x}')
            same = False
        if self.p_path != other.p_path:
            print(f'different p_path:\n{self.p_path}\n{other.p_path}')
            same = False
        return same
Ejemplo n.º 17
0
    @property
    def default_reference(self):
        """Return the value stored in ``self._default_ref``."""
        return self._default_ref

    def stop(self):
        """Stop the backend and reset the session state this context owns.

        After stopping ``self._backend`` the reference to it is dropped, the
        cached context, dummy table and seed generator on ``Env`` are
        cleared, session functions are cleared via
        ``hail.ir.clear_session_functions()``, and the registry of reference
        genomes on ``ReferenceGenome`` is replaced with an empty dict.
        """
        self._backend.stop()
        self._backend = None
        # Clear the process-wide handles on Env.
        Env._hc = None
        Env._dummy_table = None
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
Ejemplo n.º 18
0
Archivo: nd.py Proyecto: saponas/hail
    Returns
    -------
    :class:`.NDArrayExpression`
        An ndarray based on the input array.
    """
    return _ndarray(input_array, dtype=dtype)


@typecheck(a=expr_array(), shape=shape_type)
def from_column_major(a, shape):
    """Build a 2-dimensional ndarray of the given `shape` from flat data `a`.

    The data is reshaped to the reversed `shape` and then transposed, so
    consecutive elements of `a` run down the columns of the result.

    `shape` must have exactly two entries.
    """
    assert len(shape) == 2
    flipped_shape = tuple(reversed(shape))
    row_major = array(a).reshape(flipped_shape)
    return row_major.T


@typecheck(start=expr_int32, stop=nullable(expr_int32), step=expr_int32)
def arange(start, stop=None, step=1) -> NDArrayNumericExpression:
    """Returns a 1-dimensions ndarray of integers from `start` to `stop` by `step`.

    Examples
    --------

    >>> hl.eval(hl.nd.arange(10))
    array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=int32)

    >>> hl.eval(hl.nd.arange(3, 10))
    array([3, 4, 5, 6, 7, 8, 9], dtype=int32)

    >>> hl.eval(hl.nd.arange(0, 10, step=3))
    array([0, 3, 6, 9], dtype=int32)
Ejemplo n.º 19
0
import hail as hl
from hail.typecheck import typecheck, oneof, nullable
from hail.expr.expressions import expr_float64, expr_int32, expr_array, expr_call
from hail.matrixtable import MatrixTable
from hail.table import Table
from hail.utils.java import Env
import numpy as np
import pandas as pd
import scipy.stats as stats


@typecheck(mt=MatrixTable,
           genotype=oneof(expr_int32, expr_float64, expr_call),
           h2=(oneof(float, int, list, np.ndarray)),
           pi=nullable(oneof(float, int, list, np.ndarray)),
           rg=nullable(oneof(float, int, list, np.ndarray)),
           annot=nullable(oneof(expr_float64, expr_int32)),
           popstrat=nullable(oneof(expr_int32, expr_float64)),
           popstrat_var=nullable(oneof(float, int)),
           exact_h2=bool)
def simulate_phenotypes(mt,
                        genotype,
                        h2,
                        pi=None,
                        rg=None,
                        annot=None,
                        popstrat=None,
                        popstrat_var=None,
                        exact_h2=False):
    r"""Simulate phenotypes for testing LD score regression.
Ejemplo n.º 20
0

@typecheck(bms=sequenceof(BlockMatrix), prefix=str, overwrite=bool)
def block_matrices_tofiles(bms: List[BlockMatrix],
                           prefix: str,
                           overwrite: bool = False):
    """Write several block matrices in one backend execution.

    A ``BlockMatrixBinaryMultiWriter`` is built from `prefix` and
    `overwrite`, and a single ``BlockMatrixMultiWrite`` over the IR of every
    matrix in `bms` is executed on the current backend.
    """
    writer = BlockMatrixBinaryMultiWriter(prefix, overwrite)
    run = Env.backend().execute
    matrix_irs = [matrix._bmir for matrix in bms]
    multi_write = BlockMatrixMultiWrite(matrix_irs, writer)
    run(multi_write)


@typecheck(bms=sequenceof(BlockMatrix),
           prefix=str,
           overwrite=bool,
           delimiter=str,
           header=nullable(str),
           add_index=bool,
           compression=nullable(enumeration('gz', 'bgz')),
           custom_filenames=nullable(sequenceof(str)))
def export_block_matrices(bms: List[BlockMatrix],
                          prefix: str,
                          overwrite: bool = False,
                          delimiter: str = '\t',
                          header: Optional[str] = None,
                          add_index: bool = False,
                          compression: Optional[str] = None,
                          custom_filenames=None):

    if custom_filenames:
        assert len(custom_filenames) == len(
            bms
Ejemplo n.º 21
0
    def stop(self):
        """Shut the session down and clear the global state on ``Env``.

        Clears the JVM ``HailContext``, stops and drops the Spark context,
        resets the JVM, gateway, context, dummy-table and seed-generator
        handles on ``Env``, and uninstalls the exception handler.
        """
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        # Reset the process-wide handles on Env.
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None
        Env._seed_generator = None

    def upload_log(self):
        """Delegate to ``uploadLog()`` on the JVM ``HailContext``."""
        self._jhc.uploadLog()

@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           idempotent=bool,
           global_seed=nullable(int),
           _backend=nullable(Backend))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
Ejemplo n.º 22
0
class HailContext(object):
    """Python-side handle on a Hail session.

    Constructing an instance makes sure a Spark JVM gateway exists, creates
    (or fetches) the JVM ``HailContext``, wraps its Spark context for Python
    use, and registers itself as ``Env._hc``.
    """
    @typecheck_method(sc=nullable(SparkContext),
                      app_name=str,
                      master=nullable(str),
                      local=str,
                      log=nullable(str),
                      quiet=bool,
                      append=bool,
                      min_block_size=int,
                      branching_factor=int,
                      tmp_dir=nullable(str),
                      default_reference=str,
                      idempotent=bool,
                      global_seed=nullable(int),
                      _backend=nullable(Backend))
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log=None,
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 idempotent=False,
                 global_seed=6348563392232659379,
                 _backend=None):
        """Initialize the session.

        :param sc: Existing Spark context to reuse. If not given, a Python
            ``SparkContext`` is wrapped around the one owned by the JVM
            ``HailContext``.
        :param app_name: Forwarded to the JVM ``HailContext``.
        :param master: Forwarded to the JVM ``HailContext`` (as an option).
        :param local: Forwarded to the JVM ``HailContext``.
        :param log: Log path. If ``None``, a timestamped path under the
            current working directory is generated, ending in
            ``-<version>.log``.
        :param quiet: If false, print the Spark version, the Spark UI URL
            (when defined), the welcome banner and the log location to
            stderr, connect the logger and start the progress bar.
        :param append: Forwarded to the JVM ``HailContext``.
        :param min_block_size: Forwarded to the JVM ``HailContext``.
        :param branching_factor: Forwarded to the JVM ``HailContext``.
        :param tmp_dir: Temporary directory, resolved through
            ``get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')``.
        :param default_reference: Name passed to
            ``ReferenceGenome.setDefaultReference`` on the JVM side.
        :param idempotent: If true and Hail is already initialized, return
            without doing anything; the JVM context is then obtained with
            ``getOrCreate`` rather than ``apply``.
        :param global_seed: Passed to ``Env.set_seed``.
        :param _backend: Backend to use; a ``SparkBackend`` is created when
            not given.

        :raises FatalError: If Hail is already initialized and `idempotent`
            is false.
        :raises RuntimeError: If the JAR and Python library versions differ.
        """

        if Env._hc:
            if idempotent:
                # NOTE(review): returns before any attribute is set on this
                # instance -- callers presumably use Env._hc instead; confirm.
                return
            else:
                raise FatalError(
                    'Hail has already been initialized, restart session '
                    'or stop Hail to change configuration.')

        # When the JAR ships inside the package, put it on the driver and
        # executor class paths before the Spark gateway is launched.
        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
            conf = SparkConf()
            conf.set('spark.driver.extraClassPath', hail_jar_path)
            conf.set('spark.executor.extraClassPath', hail_jar_path)
            SparkContext._ensure_initialized(conf=conf)
        else:
            SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package ('is' is a Python keyword, hence the getattr)
        self._hail = getattr(self._jvm, 'is').hail

        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        # Unwrap the JVM-side SparkContext from the Python one, if given.
        jsc = sc._jsc.sc() if sc else None

        if _backend is None:
            _backend = SparkBackend()
        self._backend = _backend

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

        version = read_version_info()
        hail.__version__ = version

        if log is None:
            log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                            suffix=f'-{version}.log')
        self._log = log

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        if idempotent:
            self._jhc = self._hail.HailContext.getOrCreate(
                jsc, app_name, joption(master), local, log, True, append,
                min_block_size, branching_factor, tmp_dir)
        else:
            self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                     joption(master), local,
                                                     log, True, append,
                                                     min_block_size,
                                                     branching_factor, tmp_dir)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        # The Python ReferenceGenome wrapper is created lazily by the
        # default_reference property.
        self._default_ref = None
        Env.hail().variant.ReferenceGenome.setDefaultReference(
            self._jhc, default_reference)

        jar_version = self._jhc.version()

        if jar_version != version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {version}")

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        install_exception_handler()
        Env.set_seed(global_seed)

    @property
    def default_reference(self):
        """Default reference genome, fetched from the JVM on first access.

        The wrapper built by ``ReferenceGenome._from_java`` is cached in
        ``self._default_ref`` and reused afterwards.
        """
        if not self._default_ref:
            self._default_ref = ReferenceGenome._from_java(
                Env.hail().variant.ReferenceGenome.defaultReference())
        return self._default_ref

    def stop(self):
        """Shut the session down and clear the global state on ``Env``.

        Clears the JVM ``HailContext``, stops and drops the Spark context,
        resets the JVM, gateway, context, dummy-table and seed-generator
        handles on ``Env``, and uninstalls the exception handler.
        """
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None
        Env._seed_generator = None

    def upload_log(self):
        """Delegate to ``uploadLog()`` on the JVM ``HailContext``."""
        self._jhc.uploadLog()
Ejemplo n.º 23
0
            self._default_ref = ReferenceGenome._from_java(
                Env.hail().variant.ReferenceGenome.defaultReference())
        return self._default_ref

    def stop(self):
        """Shut the session down and clear the global state on ``Env``.

        Clears the JVM ``HailContext``, stops and drops the Spark context,
        resets the JVM, gateway, context and dummy-table handles on ``Env``,
        and uninstalls the exception handler.
        """
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        # Reset the process-wide handles on Env.
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=str,
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           force_ir=bool)
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
Ejemplo n.º 24
0
class Trio(object):
    """Nuclear-family record for one proband: relatives' sample IDs and sex.

    :param str s: Sample ID of proband.

    :param fam_id: Family ID.
    :type fam_id: str or None

    :param pat_id: Sample ID of father.
    :type pat_id: str or None

    :param mat_id: Sample ID of mother.
    :type mat_id: str or None

    :param is_female: Sex of proband.
    :type is_female: bool or None
    """
    @typecheck_method(s=str,
                      fam_id=nullable(str),
                      pat_id=nullable(str),
                      mat_id=nullable(str),
                      is_female=nullable(bool))
    def __init__(self,
                 s,
                 fam_id=None,
                 pat_id=None,
                 mat_id=None,
                 is_female=None):

        self._s = s
        self._fam_id = fam_id
        self._pat_id = pat_id
        self._mat_id = mat_id
        self._is_female = is_female

    def __repr__(self):
        shown = tuple(
            repr(value)
            for value in (self.s, self.fam_id, self.pat_id, self.mat_id,
                          self.is_female))
        return 'Trio(s=%s, fam_id=%s, pat_id=%s, mat_id=%s, is_female=%s)' % shown

    def __str__(self):
        shown = tuple(
            str(value)
            for value in (self.s, self.fam_id, self.pat_id, self.mat_id,
                          self.is_female))
        return 'Trio(s=%s, fam_id=%s, pat_id=%s, mat_id=%s, is_female=%s)' % shown

    def __eq__(self, other):
        if not isinstance(other, Trio):
            return False
        return (self._s == other._s
                and self._mat_id == other._mat_id
                and self._pat_id == other._pat_id
                and self._fam_id == other._fam_id
                and self._is_female == other._is_female)

    def __hash__(self):
        identity = (self._s, self._pat_id, self._mat_id, self._fam_id,
                    self._is_female)
        return hash(identity)

    @property
    def s(self):
        """Sample ID of the proband; always present.

        :rtype: str
        """
        return self._s

    @property
    def pat_id(self):
        """Sample ID of the father, or ``None`` if missing.

        :rtype: str or None
        """
        return self._pat_id

    @property
    def mat_id(self):
        """Sample ID of the mother, or ``None`` if missing.

        :rtype: str or None
        """
        return self._mat_id

    @property
    def fam_id(self):
        """Family ID, or ``None`` if missing.

        :rtype: str or None
        """
        return self._fam_id

    @property
    def is_male(self):
        """``True`` for a reported male proband, ``False`` for a reported
        female, ``None`` when no sex is recorded.

        :rtype: bool or None
        """
        return None if self._is_female is None else self._is_female is False

    @property
    def is_female(self):
        """``True`` for a reported female proband, ``False`` for a reported
        male, ``None`` when no sex is recorded.

        :rtype: bool or None
        """
        return None if self._is_female is None else self._is_female is True

    def is_complete(self):
        """Tell whether both parents are recorded.

        Only :meth:`mat_id` and :meth:`pat_id` are considered; the family
        ID and the sex may be missing in a complete trio.

        :rtype: bool
        """
        has_father = self._pat_id is not None
        return has_father and self._mat_id is not None

    def _restrict_to(self, ids):
        """Return a copy limited to the samples in `ids`.

        Gives ``None`` when the proband is not in `ids`; a parent that is
        not in `ids` is replaced by ``None``.
        """
        def keep_if_listed(sample_id):
            return sample_id if sample_id in ids else None

        if self._s not in ids:
            return None

        father = keep_if_listed(self._pat_id)
        mother = keep_if_listed(self._mat_id)
        return Trio(self._s, self._fam_id, father, mother, self._is_female)

    def _sex_as_numeric_string(self):
        """Encode the sex as "0" (missing), "1" (male) or "2" (female)."""
        if self._is_female is None:
            return "0"
        if self.is_female:
            return "2"
        return "1"

    def _to_fam_file_line(self):
        """Render the trio as one tab-separated line of six fields.

        Missing family and parent IDs are written as "0", and the last
        field is always "0".
        """
        def zero_if_missing(sample_id):
            return "0" if sample_id is None else sample_id

        columns = (
            zero_if_missing(self._fam_id),
            self._s,
            zero_if_missing(self._pat_id),
            zero_if_missing(self._mat_id),
            self._sex_as_numeric_string(),
            "0",
        )
        return "\t".join(columns)
Ejemplo n.º 25
0
class HailContext(object):
    """Python-side handle on a JVM ``HailContext`` reached through Spark.

    Constructing an instance creates the JVM context via the Spark/py4j
    gateway, wraps its Spark and SQL contexts, and registers the instance
    on ``Env`` (``Env._hc``). :meth:`stop` undoes that registration.
    """

    @typecheck_method(sc=nullable(SparkContext),
                      app_name=str,
                      master=nullable(str),
                      local=str,
                      log=str,
                      quiet=bool,
                      append=bool,
                      min_block_size=int,
                      branching_factor=int,
                      tmp_dir=nullable(str),
                      default_reference=str,
                      force_ir=bool)
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log='hail.log',
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 force_ir=False):
        """Create the JVM context and register this object on ``Env``.

        Parameters
        ----------
        sc : SparkContext, optional
            Existing Spark context to reuse. When omitted, a Python
            ``SparkContext`` is built around the one the JVM context
            reports.
        app_name, local, log, append, min_block_size, branching_factor, force_ir
            Forwarded unchanged to the JVM ``HailContext.apply`` call.
        master : str, optional
            Forwarded to the JVM wrapped with ``joption``.
        quiet : bool
            When ``True``, skip the Python-side startup messages, the
            logger connection and the progress bar. The JVM is always
            started with its own ``quiet`` flag set.
        tmp_dir : str, optional
            Resolved through ``get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')``
            before being forwarded to the JVM.
        default_reference : str
            Name passed to the JVM ``ReferenceGenome.setDefaultReference``.

        Raises
        ------
        FatalError
            If ``Env._hc`` is already set.
        """

        if Env._hc:
            raise FatalError(
                'Hail has already been initialized, restart session '
                'or stop Hail to change configuration.')

        SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package ('is' is a Python keyword, hence getattr)
        self._hail = getattr(self._jvm, 'is').hail

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                 joption(master), local, log,
                                                 True, append, min_block_size,
                                                 branching_factor, tmp_dir,
                                                 force_ir)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)
        self._counter = 1

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        # Filled in on first access by the ``default_reference`` property.
        self._default_ref = None
        Env.hail().variant.ReferenceGenome.setDefaultReference(
            self._jhc, default_reference)

        version = self._jhc.version()
        hail.__version__ = version

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            # The backslash in the last banner row is escaped explicitly:
            # a bare "\_" is an invalid escape sequence and triggers a
            # warning on current Python versions. The emitted text is
            # unchanged.
            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')

        install_exception_handler()

    @property
    def default_reference(self):
        """The default reference genome, fetched from the JVM on first use.

        The result of ``ReferenceGenome._from_java`` is cached on the
        instance, so later accesses return the same object.
        """
        if not self._default_ref:
            self._default_ref = ReferenceGenome._from_java(
                Env.hail().variant.ReferenceGenome.defaultReference())
        return self._default_ref

    def stop(self):
        """Shut the context down and clear the state registered on ``Env``.

        Clears the JVM ``HailContext``, stops the Spark context, resets the
        ``Env`` JVM/gateway/context/dummy-table slots and uninstalls the
        exception handler.
        """
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None
Ejemplo n.º 26
0
import hail
from hail.utils.java import Env, joption, error
from hail.typecheck import enumeration, typecheck, nullable
import difflib
from collections import defaultdict, Counter, OrderedDict
import atexit
import shutil
import tempfile

@typecheck(n_rows=int, n_cols=int, n_partitions=nullable(int))
def range_matrix_table(n_rows, n_cols, n_partitions=None) -> 'hail.MatrixTable':
    """Construct a matrix table with row and column indices and no entry fields.

    Examples
    --------

    >>> range_ds = hl.utils.range_matrix_table(n_rows=100, n_cols=10)

    >>> range_ds.count_rows()
    100

    >>> range_ds.count_cols()
    10

    Notes
    -----
    The resulting matrix table contains the following fields:

     - `row_idx` (:py:data:`.tint32`) - Row index (row key).
     - `col_idx` (:py:data:`.tint32`) - Column index (column key).
Ejemplo n.º 27
0
class HailContext(object):
    """Session-wide Hail state bound to a ``Backend``.

    Construction registers the instance on ``Env`` (``Env._hc``), loads
    four built-in reference genomes from the backend, resolves the default
    reference and seeds ``Env``. :meth:`stop` tears that state down.
    """

    @typecheck_method(log=str,
                      quiet=bool,
                      append=bool,
                      tmpdir=str,
                      local_tmpdir=str,
                      default_reference=str,
                      global_seed=nullable(int),
                      backend=Backend)
    def __init__(self, log, quiet, append, tmpdir, local_tmpdir,
                 default_reference, global_seed, backend):
        """Initialise the context; asserts that no context is registered.

        ``default_reference`` is looked up among the loaded references
        first and otherwise handed to ``ReferenceGenome.read``. Unless
        ``quiet`` is set, a banner and the log location are written to
        stderr. A ``global_seed`` of ``None`` selects a fixed built-in
        seed. ``append`` is accepted but not used in this constructor.
        """
        assert not Env._hc

        super(HailContext, self).__init__()

        self._log = log
        self._tmpdir = tmpdir
        self._local_tmpdir = local_tmpdir
        self._backend = backend
        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._hc = self

        # Load the built-in references, in this fixed order.
        for builtin_name in ('GRCh37', 'GRCh38', 'GRCm38', 'CanFam3'):
            ReferenceGenome._from_config(
                self._backend.get_reference(builtin_name), True)

        if default_reference not in ReferenceGenome._references:
            self._default_ref = ReferenceGenome.read(default_reference)
        else:
            self._default_ref = ReferenceGenome._references[default_reference]

        if not quiet:
            py_version = version()
            banner = (
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(py_version))
            sys.stderr.write(banner)

            if py_version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        Env.set_seed(
            6348563392232659379 if global_seed is None else global_seed)

    @property
    def default_reference(self):
        """The reference genome resolved at construction time."""
        return self._default_ref

    def stop(self):
        """Stop the backend and reset the session state held elsewhere.

        Clears the ``Env`` context, dummy-table and seed-generator slots,
        the session functions registered in ``hail.ir``, and the
        ``ReferenceGenome`` registry.
        """
        self._backend.stop()
        self._backend = None

        Env._hc = None
        Env._dummy_table = None
        Env._seed_generator = None

        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}
Ejemplo n.º 28
0
class Expression(object):
    """Base class for Hail expressions."""

    __array_ufunc__ = None  # disable NumPy coercions, so Hail coercions take priority

    @typecheck_method(x=ir.IR,
                      type=nullable(HailType),
                      indices=Indices,
                      aggregations=linked_list(Aggregation))
    def __init__(self,
                 x: ir.IR,
                 type: HailType,
                 indices: Indices = Indices(),
                 aggregations: LinkedList = LinkedList(Aggregation)):
        """Store the IR node and its metadata on the new expression.

        Parameters
        ----------
        x : ir.IR
            IR node this expression wraps.
        type : HailType
            Type reported by :attr:`dtype` (the type check also admits
            ``None``).
        indices : Indices
            Source and axes the expression is indexed by.
        aggregations : LinkedList
            Aggregations the expression contains.
        """
        # Starts unset.
        self._summary = None

        self._aggregations = aggregations
        self._indices = indices
        self._type = type
        self._ir: ir.IR = x

    def describe(self, handler=print):
        """Report the expression's type, source, index and aggregations.

        The report is built as a single string and handed to ``handler``
        (``print`` by default). An extra section listing the aggregation
        axes is included only when the expression has aggregations.
        """
        agg_tag = ''
        agg_str = ''
        if self._aggregations:
            agg_indices = set()
            for aggregation in self._aggregations:
                agg_indices = agg_indices.union(aggregation.indices.axes)
            agg_tag = ' (aggregated)'
            agg_str = f'Includes aggregation with index {list(agg_indices)}\n' \
                      f'    (Aggregation index may be promoted based on context)'

        bar = '--------------------------------------------------------'
        # A separator bar precedes the aggregation section when present.
        separator = '\n' + bar + '\n' if agg_str else ''
        pretty_type = self.dtype.pretty(indent=4)
        source = self._indices.source
        axes = list(self._indices.axes)

        template = '{bar}\n' \
                   'Type:\n' \
                   '    {t}\n' \
                   '{bar}\n' \
                   'Source:\n' \
                   '    {src}\n' \
                   'Index:\n' \
                   '    {inds}{agg_tag}{maybe_bar}{agg}\n' \
                   '{bar}'
        report = template.format(bar=bar,
                                 t=pretty_type,
                                 src=source,
                                 inds=axes,
                                 maybe_bar=separator,
                                 agg_tag=agg_tag,
                                 agg=agg_str)
        handler(report)

    def __lt__(self, other):
        return self._compare_op("<", other)

    def __le__(self, other):
        return self._compare_op("<=", other)

    def __gt__(self, other):
        return self._compare_op(">", other)

    def __ge__(self, other):
        return self._compare_op(">=", other)

    def __nonzero__(self):
        """Always raise: an expression has no Python truth value.

        The error message lists the Hail constructs to use in place of
        ``if``, ``and``/``or`` and ``not``.
        """
        message = (
            "The truth value of an expression is undefined\n"
            "    Hint: instead of 'if x', use 'hl.if_else(x, ...)'\n"
            "    Hint: instead of 'x and y' or 'x or y', use 'x & y' or 'x | y'\n"
            "    Hint: instead of 'not x', use '~x'")
        raise ExpressionException(message)

    def __iter__(self):
        """Always raise: expressions cannot be iterated from Python."""
        description = repr(self)
        raise ExpressionException(f"{description} object is not iterable")

    def _compare_op(self, op, other):
        """Build a boolean expression comparing ``self`` with ``other``.

        ``other`` is converted with ``to_expr`` and both sides are unified
        with ``unify_exprs`` before the comparison is constructed.

        Raises
        ------
        TypeError
            If the two expressions cannot be unified.
        """
        other = to_expr(other)
        left, right, success = unify_exprs(self, other)
        if success:
            return left._bin_op(op, right, hl.tbool)
        raise TypeError(
            f"Invalid '{op}' comparison, cannot compare expressions "
            f"of type '{self.dtype}' and '{other.dtype}'")

    def _is_scalar(self):
        return self._indices.source is None

    def _promote_scalar(self, typ):
        """Convert this expression to the numeric type ``typ``.

        ``typ`` must be one of ``tint32``, ``tint64``, ``tfloat32`` or
        ``tfloat64``; the last case is enforced with an assertion.
        """
        if typ == tint32:
            return hail.int32(self)
        if typ == tint64:
            return hail.int64(self)
        if typ == tfloat32:
            return hail.float32(self)

        assert typ == tfloat64
        return hail.float64(self)

    def _promote_numeric(self, typ):
        """Coerce this expression to ``typ`` using the coercer for ``typ``.

        When ``typ`` is an array or ndarray type but this expression is
        not of that kind, the coercer's ``ec`` member is used instead of
        the coercer itself.
        """
        coercer = expressions.coercer_from_dtype(typ)
        # True when the target is a container kind that ``self`` is not.
        container_mismatch = (
            (isinstance(typ, tarray) and not isinstance(self.dtype, tarray))
            or (isinstance(typ, tndarray)
                and not isinstance(self.dtype, tndarray)))
        if container_mismatch:
            return coercer.ec.coerce(self)
        return coercer.coerce(self)

    @staticmethod
    def _div_ret_type_f(t):
        """Return the result type of dividing values of numeric type ``t``.

        ``tint32`` and ``tint64`` map to ``tfloat64``; any other numeric
        type (Float64 or Float32) is returned unchanged.
        """
        assert is_numeric(t)
        is_integral = t == tint32 or t == tint64
        return tfloat64 if is_integral else t

    def _bin_op_numeric_unify_types(self, name, other):
        """Compute the common type for a numeric binary operation.

        The element types of both operands (with ``tbool`` treated as
        ``tint32``) are unified with ``unify_types``. The result is wrapped
        in ``tarray`` if either operand is an array, otherwise in
        ``tndarray`` (taking ``ndim`` from the ndarray operand) if either
        is an ndarray, otherwise returned bare.

        Raises
        ------
        NotImplementedError
            If the element types cannot be unified.
        """

        def numeric_element_type(t):
            # Look through array/ndarray wrappers, then map bool to int32.
            inner = t.element_type if isinstance(t, (tarray, tndarray)) else t
            return tint32 if inner == tbool else inner

        unified = unify_types(numeric_element_type(self.dtype),
                              numeric_element_type(other.dtype))
        if unified is None:
            raise NotImplementedError("'{}' {} '{}'".format(
                self.dtype, name, other.dtype))

        if isinstance(self.dtype, tarray) or isinstance(other.dtype, tarray):
            return tarray(unified)
        if isinstance(self.dtype, tndarray):
            return tndarray(unified, self.ndim)
        if isinstance(other.dtype, tndarray):
            return tndarray(unified, other.ndim)
        return unified

    def _bin_op_numeric(self, name, other, ret_type_f=None):
        """Apply numeric binary operator ``name`` to ``self`` and ``other``.

        Both operands are promoted to their unified type first. When
        ``ret_type_f`` is given it maps the unified element type to the
        result element type, keeping any array/ndarray wrapper; otherwise
        the result has the unified type.
        """
        rhs = to_expr(other)
        unified = self._bin_op_numeric_unify_types(name, rhs)
        lhs = self._promote_numeric(unified)
        rhs = rhs._promote_numeric(unified)

        if not ret_type_f:
            result_type = unified
        elif isinstance(unified, tarray):
            result_type = tarray(ret_type_f(unified.element_type))
        elif isinstance(unified, tndarray):
            result_type = tndarray(ret_type_f(unified.element_type),
                                   unified.ndim)
        else:
            result_type = ret_type_f(unified)

        return lhs._bin_op(name, rhs, result_type)

    def _bin_op_numeric_reverse(self, name, other, ret_type_f=None):
        """Like ``_bin_op_numeric`` with ``other`` as the left operand."""
        left = to_expr(other)
        return left._bin_op_numeric(name, self, ret_type_f)

    def _unary_op(self, name):
        """Build the expression applying unary primitive ``name`` to self.

        The result keeps this expression's type, indices and aggregations.
        """
        node = ir.ApplyUnaryPrimOp(name, self._ir)
        return expressions.construct_expr(node, self._type, self._indices,
                                          self._aggregations)

    def _bin_op(self, name, other, ret_type):
        """Build the expression ``self <name> other`` with type ``ret_type``.

        Three IR forms are used: a binary primitive op for ``+ - * / //``
        when ``ret_type`` is one of the four basic numeric types, a
        comparison op for the six comparison operators, and otherwise a
        function application (operator symbols are mapped to function
        names; any other ``name`` is used as the function name as is).
        """
        rhs = to_expr(other)
        indices, aggregations = unify_all(self, rhs)

        is_primitive_arithmetic = (
            name in {'+', '-', '*', '/', '//'}
            and ret_type in {tint32, tint64, tfloat32, tfloat64})

        if is_primitive_arithmetic:
            node = ir.ApplyBinaryPrimOp(name, self._ir, rhs._ir)
        elif name in {"==", "!=", "<", "<=", ">", ">="}:
            node = ir.ApplyComparisonOp(name, self._ir, rhs._ir)
        else:
            function_names = {
                '+': 'add',
                '-': 'sub',
                '*': 'mul',
                '/': 'div',
                '//': 'floordiv',
                '%': 'mod',
                '**': 'pow'
            }
            node = ir.Apply(function_names.get(name, name), ret_type,
                            self._ir, rhs._ir)
        return expressions.construct_expr(node, ret_type, indices,
                                          aggregations)

    def _bin_op_reverse(self, name, other, ret_type):
        """Like ``_bin_op`` with ``other`` as the left operand."""
        left = to_expr(other)
        return left._bin_op(name, self, ret_type)

    def _method(self, name, ret_type, *args):
        """Apply function ``name`` to ``self`` followed by ``args``.

        Each argument is converted with ``to_expr``; indices and
        aggregations of the result are unified across all operands.
        """
        arg_exprs = tuple(to_expr(arg) for arg in args)
        indices, aggregations = unify_all(self, *arg_exprs)
        node = ir.Apply(name, ret_type, self._ir,
                        *(expr._ir for expr in arg_exprs))
        return expressions.construct_expr(node, ret_type, indices,
                                          aggregations)

    def _index(self, ret_type, key):
        """Apply the ``"index"`` function to ``self`` with ``key``."""
        return self._method("index", ret_type, to_expr(key))

    def _ir_lambda_method(self, irf, f, input_type, ret_type_f, *args):
        """Build an expression from an IR constructor taking a lambda body.

        ``f`` is called on a fresh variable of type ``input_type`` to
        obtain the lambda body. ``irf`` then receives this expression's
        IR, the variable's id, the body IR and the IR of every extra
        argument. The result type is ``ret_type_f`` applied to the body's
        type.
        """
        # Kept as a generator: the extra arguments are converted only when
        # they are unpacked into ``irf`` below.
        extra_irs = (to_expr(arg)._ir for arg in args)
        var_id = Env.get_uid()
        variable = expressions.construct_variable(var_id, input_type,
                                                  self._indices,
                                                  self._aggregations)
        body = to_expr(f(variable))

        indices, aggregations = unify_all(self, body)
        node = irf(self._ir, var_id, body._ir, *extra_irs)
        return expressions.construct_expr(node, ret_type_f(body._type),
                                          indices, aggregations)

    def _ir_lambda_method2(self, other, irf, f, input_type1, input_type2,
                           ret_type_f, *args):
        """Two-variable counterpart of ``_ir_lambda_method``.

        ``f`` is called on two fresh variables, the first typed
        ``input_type1`` and carrying this expression's indices and
        aggregations, the second typed ``input_type2`` and carrying
        ``other``'s. ``irf`` receives both operand IRs, both variable ids,
        the body IR and the IR of every extra argument.
        """
        # Kept as a generator: the extra arguments are converted only when
        # they are unpacked into ``irf`` below.
        extra_irs = (to_expr(arg)._ir for arg in args)
        first_id = Env.get_uid()
        second_id = Env.get_uid()
        first_var = expressions.construct_variable(first_id, input_type1,
                                                   self._indices,
                                                   self._aggregations)
        second_var = expressions.construct_variable(second_id, input_type2,
                                                    other._indices,
                                                    other._aggregations)
        body = to_expr(f(first_var, second_var))

        indices, aggregations = unify_all(self, other, body)
        node = irf(self._ir, other._ir, first_id, second_id, body._ir,
                   *extra_irs)
        return expressions.construct_expr(node, ret_type_f(body._type),
                                          indices, aggregations)

    @property
    def dtype(self) -> HailType:
        """The Hail type of this expression.

        Returns
        -------
        :class:`.HailType`
            The type stored on the expression at construction.

        """
        expression_type = self._type
        return expression_type

    def __bool__(self):
        raise TypeError(
            "'Expression' objects cannot be converted to a 'bool'. Use 'hl.if_else' instead of Python if statements."
        )

    def __len__(self):
        raise TypeError(
            "'Expression' objects have no static length: use 'hl.len' for the length of collections"
        )

    def __contains__(self, item):
        class_name = type(self).__name__
        raise TypeError(
            f"`{class_name}` objects don't support the `in` operator.")

    def __hash__(self):
        """Return the hash provided by the parent class.

        Defined explicitly because the class also defines ``__eq__``,
        which would otherwise leave instances unhashable.
        """
        inherited_hash = super(Expression, self).__hash__()
        return inherited_hash

    def __repr__(self):
        return f'<{self.__class__.__name__} of type {self.dtype}>'

    def __eq__(self, other):
        """Returns ``True`` if the two expressions are equal.

        Examples
        --------

        >>> x = hl.literal(5)
        >>> y = hl.literal(5)
        >>> z = hl.literal(1)

        >>> hl.eval(x == y)
        True

        >>> hl.eval(x == z)
        False

        Notes
        -----
        This method will fail with an error if the two expressions are not
        of comparable types.

        Parameters
        ----------
        other : :class:`.Expression`
            Expression for equality comparison.

        Returns
        -------
        :class:`.BooleanExpression`
            ``True`` if the two expressions are equal.
        """
        return self._compare_op("==", other)

    def __ne__(self, other):
        """Returns ``True`` if the two expressions are not equal.

        Examples
        --------

        >>> x = hl.literal(5)
        >>> y = hl.literal(5)
        >>> z = hl.literal(1)

        >>> hl.eval(x != y)
        False

        >>> hl.eval(x != z)
        True

        Notes
        -----
        This method will fail with an error if the two expressions are not
        of comparable types.

        Parameters
        ----------
        other : :class:`.Expression`
            Expression for inequality comparison.

        Returns
        -------
        :class:`.BooleanExpression`
            ``True`` if the two expressions are not equal.
        """
        return self._compare_op("!=", other)

    def _to_table(self, name):
        """Convert the expression to an ordered table holding one field.

        Returns ``(field_name, table)``. A matrix-table result is flattened
        to its entries table ordered by row key; a table result is ordered
        by its key when it has one. In both cases only the named field is
        selected.
        """
        name, ds = self._to_relational(name)
        if not isinstance(ds, hail.MatrixTable):
            if len(ds.key) != 0:
                ds = ds.order_by(*ds.key)
            return name, ds.select(name)

        entries = ds.key_cols_by().entries().order_by(*ds.row_key)
        return name, entries.select(name)

    def _to_relational(self, fallback_name):
        """Wrap the expression in a table or matrix table.

        Returns ``(name, dataset)``. ``name`` is the expression's field
        name on its source when it is a known field there, otherwise
        ``fallback_name``. The dataset carries the expression under that
        name, except that a top-level key field is not re-selected.

        Raises
        ------
        NotImplementedError
            If the expression contains aggregations.
        """
        source = self._indices.source
        axes = self._indices.axes
        if not self._aggregations.empty():
            raise NotImplementedError(
                'cannot convert aggregated expression to table')

        # No source: build a one-row table around the expression.
        if source is None:
            single_row = [hl.struct(**{fallback_name: self})]
            return fallback_name, hl.Table.parallelize(single_row,
                                                       n_partitions=1)

        name = source._fields_inverse.get(self)
        is_top_level = name is not None
        if not is_top_level:
            name = fallback_name
        selection = {name: self}

        if len(axes) == 0:
            # Global value: move it into a one-row table.
            with_global = source.select_globals(**selection)
            ds = hl.Table.parallelize([with_global.index_globals()],
                                      n_partitions=1)
        elif isinstance(source, hail.Table):
            if is_top_level and name in source.key:
                selection = {}
            ds = source.select(**selection).select_globals()
        elif isinstance(source, hail.MatrixTable):
            if self._indices == source._row_indices:
                if is_top_level and name in source.row_key:
                    selection = {}
                ds = source.select_rows(**selection).select_globals().rows()
            elif self._indices == source._col_indices:
                if is_top_level and name in source.col_key:
                    selection = {}
                ds = source.select_cols(
                    **selection).select_globals().key_cols_by().cols()
            else:
                assert self._indices == source._entry_indices
                ds = source.select_entries(
                    **selection).select_globals().select_cols().select_rows()
        return name, ds

    @typecheck_method(n=nullable(int),
                      width=nullable(int),
                      truncate=nullable(int),
                      types=bool,
                      handler=nullable(anyfunc),
                      n_rows=nullable(int),
                      n_cols=nullable(int))
    def show(self,
             n=None,
             width=None,
             truncate=None,
             types=True,
             handler=None,
             n_rows=None,
             n_cols=None):
        """Print the first few records of the expression to the console.

        If the expression refers to a value on a keyed axis of a table or matrix
        table, then the accompanying keys will be shown along with the records.

        Examples
        --------

        >>> table1.SEX.show()
        +-------+-----+
        |    ID | SEX |
        +-------+-----+
        | int32 | str |
        +-------+-----+
        |     1 | "M" |
        |     2 | "M" |
        |     3 | "F" |
        |     4 | "F" |
        +-------+-----+

        >>> hl.literal(123).show()
        +--------+
        | <expr> |
        +--------+
        |  int32 |
        +--------+
        |    123 |
        +--------+

        Notes
        -----
        The output can be piped to another output source using the `handler` argument:

        >>> ht.foo.show(handler=lambda x: logging.info(x))  # doctest: +SKIP


        Parameters
        ----------
        n : :obj:`int`
            Maximum number of rows to show. Used as `n_rows` when `n_rows`
            is not given.
        width : :obj:`int`
            Horizontal width at which to break columns.
        truncate : :obj:`int`, optional
            Truncate each field to the given number of characters. If
            ``None``, truncate fields to the given `width`.
        types : :obj:`bool`
            Print an extra header line with the type of each field.
        handler : callable, optional
            Forwarded to the underlying dataset's ``show``.
        n_rows : :obj:`int`, optional
            Forwarded to the underlying dataset's ``show``; defaults to `n`.
        n_cols : :obj:`int`, optional
            Forwarded to the underlying dataset's ``show``.
        """
        if n_rows is None:
            n_rows = n

        options = {
            'width': width,
            'truncate': truncate,
            'types': types,
            'handler': handler,
            'n_rows': n_rows,
            'n_cols': n_cols
        }
        # Options left as None are omitted from the forwarded call.
        forwarded = {
            key: value
            for key, value in options.items() if value is not None
        }
        _, ds = self._to_relational_preserving_rows_and_cols('<expr>')
        return ds.show(**forwarded)

    def _to_relational_preserving_rows_and_cols(self, fallback_name):
        """Like ``_to_relational``, with shortcuts for whole-axis structs.

        When the expression *is* (by identity) the row, key, column, column
        key or entry struct of its source, the matching view of the source
        is returned directly with ``None`` as the name. Every other
        expression is delegated to ``_to_relational``.
        """
        src = self._indices.source

        if isinstance(src, hl.Table):
            if self is src.row:
                return None, src
            if self is src.key:
                return None, src.select()

        if isinstance(src, hl.MatrixTable):
            if self is src.row:
                return None, src.rows()
            if self is src.row_key:
                return None, src.rows().select()
            if self is src.col:
                return None, src.key_cols_by().cols()
            if self is src.col_key:
                return None, src.key_cols_by().cols().select()
            if self is src.entry:
                return None, src.select_rows().select_cols()

        # Not a whole-axis struct: take the general path.
        return self._to_relational(fallback_name)

    @typecheck_method(path=str, delimiter=str, missing=str, header=bool)
    def export(self, path, delimiter='\t', missing='NA', header=True):
        """Export a field to a text file.

        Examples
        --------

        >>> small_mt.GT.export('output/gt.tsv')
        >>> with open('output/gt.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	0	1	2	3
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.GT.export('output/gt-no-header.tsv', header=False)
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>

        >>> small_mt.pop.export('output/pops.tsv')
        >>> with open('output/pops.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        sample_idx	pop
        0	2
        1	2
        2	0
        3	2
        <BLANKLINE>

        >>> small_mt.ancestral_af.export('output/ancestral_af.tsv')
        >>> with open('output/ancestral_af.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	ancestral_af
        1:1	["A","C"]	5.3905e-01
        1:2	["A","C"]	8.6768e-01
        1:3	["A","C"]	4.3765e-01
        1:4	["A","C"]	7.6300e-01
        <BLANKLINE>

        >>> mt = small_mt
        >>> small_mt.bn.export('output/bn.tsv')
        >>> with open('output/bn.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        bn
        {"n_populations":3,"n_samples":4,"n_variants":4,"n_partitions":8,"pop_dist":[1,1,1],"fst":[0.1,0.1,0.1],"mixture":false}
        <BLANKLINE>


        Notes
        -----

        For entry-indexed expressions, if there is one column key field, the
        result of calling :func:`~hail.expr.functions.str` on that field is used as
        the column header. Otherwise, each compound column key is converted to
        JSON and used as a column header. For example:

        >>> small_mt = small_mt.key_cols_by(s=small_mt.sample_idx, family='fam1')
        >>> small_mt.GT.export('output/gt-no-header.tsv')
        >>> with open('output/gt-no-header.tsv', 'r') as f:
        ...     for line in f:
        ...         print(line, end='')
        locus	alleles	{"s":0,"family":"fam1"}	{"s":1,"family":"fam1"}	{"s":2,"family":"fam1"}	{"s":3,"family":"fam1"}
        1:1	["A","C"]	0/1	0/1	0/0	0/0
        1:2	["A","C"]	1/1	0/1	1/1	1/1
        1:3	["A","C"]	1/1	0/1	0/1	0/0
        1:4	["A","C"]	1/1	0/1	1/1	1/1
        <BLANKLINE>


        Parameters
        ----------
        path : :class:`str`
            The path to which to export.
        delimiter : :class:`str`
            The string for delimiting columns.
        missing : :class:`str`
            The string to output for missing values.
        header : :obj:`bool`
            When ``True`` include a header line.
        """
        uid = Env.get_uid()
        self_name, ds = self._to_relational_preserving_rows_and_cols(uid)

        # Table-shaped results are written by the table's own exporter.
        if isinstance(ds, hl.Table):
            ds.export(output=path, delimiter=delimiter, header=header)
            return

        # Matrix-shaped result: one output line per row, with the entries
        # of that row joined into a single delimited column.
        assert len(self._indices.axes) == 2
        entries_field, cols_field = Env.get_uid(), Env.get_uid()
        localized = ds.select_cols().localize_entries(entries_field,
                                                      cols_field)
        localized = localized.order_by(*localized.key)
        joined_field = Env.get_uid()

        entry_values = localized[entries_field]
        if self_name:
            entry_values = hl.map(lambda entry: entry[self_name],
                                  entry_values)
        entry_values = hl.map(
            lambda value: hl.if_else(hl.is_missing(value), missing,
                                     hl.str(value)),
            entry_values)

        file_contents = localized.select(
            **{key: hl.str(localized[key])
               for key in ds.row_key},
            **{joined_field: hl.delimit(entry_values, delimiter)})

        if header:
            col_keys = localized[cols_field]
            if len(ds.col_key) == 1:
                # A single column key field is shown bare, not as a struct.
                col_keys = hl.map(lambda key: key[0], col_keys)
            column_names = hl.map(hl.str,
                                  col_keys).collect(_localize=False)[0]
            header_row = hl.utils.range_table(1).key_by().select(
                **{key: key
                   for key in ds.row_key},
                **{joined_field: hl.delimit(column_names, delimiter)})
            file_contents = header_row.union(file_contents)

        file_contents.export(path, delimiter=delimiter, header=False)

    @typecheck_method(n=int, _localize=bool)
    def take(self, n, _localize=True):
        """Collect the first `n` records of an expression.

        Examples
        --------

        Take the first three rows:

        >>> table1.X.take(3)
        [5, 6, 7]

        Warning
        -------
        Extremely experimental.

        Parameters
        ----------
        n : int
            Number of records to take.
        _localize : bool
            When ``False``, the un-evaluated expression is returned
            instead of a local list.

        Returns
        -------
        :obj:`list`
        """
        field, table = self._to_table(Env.get_uid())
        values = table.take(n, _localize=False).map(lambda row: row[field])
        return hl.eval(values) if _localize else values

    @typecheck_method(_localize=bool)
    def collect(self, _localize=True):
        """Collect all records of an expression into a local list.

        Examples
        --------

        Collect all the values from `C1`:

        >>> table1.C1.collect()
        [2, 2, 10, 11]

        Warning
        -------
        Extremely experimental.

        Warning
        -------
        The list of records may be very large.

        Parameters
        ----------
        _localize : bool
            When ``False``, the un-evaluated expression is returned
            instead of a local list.

        Returns
        -------
        :obj:`list`
        """
        field, table = self._to_table(Env.get_uid())
        values = table.collect(_localize=False).map(lambda row: row[field])
        return hl.eval(values) if _localize else values

    def _extra_summary_fields(self, agg_result):
        return {}

    def _summary_fields(self, agg_result, top):
        if top:
            return {}, self._nested_summary(agg_result[2], top)
        n_missing = agg_result[0]
        n_defined = agg_result[1]
        tot = n_missing + n_defined
        missing_value_str = str(
            n_missing
        ) if n_missing == 0 else f'{n_missing} ({(n_missing / tot) * 100:.2f}%)'
        defined_value_str = str(
            n_defined
        ) if n_defined == 0 else f'{n_defined} ({(n_defined / tot) * 100:.2f}%)'
        if n_defined == 0:
            return {
                'Non-missing': defined_value_str,
                'Missing': missing_value_str
            }, {}
        return {
            'Non-missing': defined_value_str,
            'Missing': missing_value_str,
            **self._extra_summary_fields(agg_result[2])
        }, self._nested_summary(agg_result[2], top)

    def _nested_summary(self, agg_result, top):
        return {}

    def _summary_aggs(self):
        """Return the type-specific summary aggregation.

        The base class returns a missing ``tint32`` value.
        """
        placeholder = hl.missing(hl.tint32)
        return placeholder

    def _all_summary_aggs(self):
        """Return the tuple of aggregations behind a summary.

        The tuple holds, in order: the count of missing values, the count
        of defined values, and the result of ``_summary_aggs``.
        """
        n_missing = hl.agg.filter(hl.is_missing(self), hl.agg.count())
        n_defined = hl.agg.filter(hl.is_defined(self), hl.agg.count())
        return hl.tuple((n_missing, n_defined, self._summary_aggs()))

    def _summarize(self, agg_res=None, *, name=None, header=None, top=False):
        """Build the summary object for this expression.

        When ``agg_res`` is not supplied, the aggregations are run on the
        expression's source and the record count becomes the summary
        header. The result is a ``Summary``, wrapped in a ``NamedSummary``
        when ``name`` or ``header`` is given.

        Raises
        ------
        ValueError
            If the expression has no source or no axes.
        """
        indices = self._indices
        if indices.source is None or len(indices.axes) == 0:
            raise ValueError("Cannot summarize a scalar expression")

        summary_header = None
        if agg_res is None:
            aggregate = self._aggregation_method()
            count, agg_res = aggregate(
                hl.tuple((hl.agg.count(), self._all_summary_aggs())))
            summary_header = f'{count} records.'

        sum_fields, nested = self._summary_fields(agg_res, top)
        summary = Summary(self._type,
                          agg_res[0],
                          sum_fields,
                          nested,
                          header=summary_header)

        if name is not None or header is not None:
            return NamedSummary(summary, name, header)
        return summary

    def summarize(self, handler=None):
        """Compute and print summary information about the expression.

        .. include:: _templates/experimental.rst

        Parameters
        ----------
        handler : callable, optional
            Receives the summary object. Defaults to
            ``hl.utils.default_handler()``.

        Raises
        ------
        ValueError
            If the expression has no source (a scalar expression).
        """

        src = self._indices.source
        if src is None:
            # Same error ``_summarize`` raises for scalars, instead of an
            # AttributeError on ``None`` in the lookup below.
            raise ValueError("Cannot summarize a scalar expression")

        # Look the field name up in ``_fields_inverse``, the mapping keyed
        # by expression. Testing membership on ``_fields`` and then
        # indexing ``_fields_inverse`` checked a different mapping from
        # the one being read.
        field_name = src._fields_inverse.get(self)
        if field_name is not None:
            prefix = field_name
        elif self._ir.is_nested_field:
            prefix = self._ir.name
        else:
            prefix = '<expr>'

        if handler is None:
            handler = hl.utils.default_handler()
        handler(self._summarize(name=prefix))

    def _selector_and_agg_method(self):
        """Return the select method and aggregate getter for this axis.

        The result is ``(select, get_aggregate)``: ``select`` is the bound
        select method of the source matching the expression's indices
        (rows, columns or entries of a matrix table, or the plain table
        select), and ``get_aggregate`` maps a dataset to the matching
        aggregate method. The expression must have a source and at least
        one axis.
        """
        src = self._indices.source
        assert src is not None
        assert len(self._indices.axes) > 0

        if not isinstance(src, hl.MatrixTable):
            return src.select, lambda t: t.aggregate

        if self._indices == src._row_indices:
            return src.select_rows, lambda t: t.aggregate_rows
        if self._indices == src._col_indices:
            return src.select_cols, lambda t: t.aggregate_cols
        return src.select_entries, lambda t: t.aggregate_entries

    def _aggregation_method(self):
        return self._selector_and_agg_method()[1](self._indices.source)

    def _persist(self):
        """Persist a scalar expression through the backend.

        Returns the expression produced by the backend's
        ``persist_expression``, which is asserted to have the same type as
        this one.

        Raises
        ------
        ValueError
            If the expression has a Table/MatrixTable source.
        """
        if self._indices.source is not None:
            raise ValueError(
                "Can only persist a scalar (no Table/MatrixTable source)")

        persisted = Env.backend().persist_expression(self)
        assert persisted.dtype == self.dtype
        return persisted
Ejemplo n.º 29
0
import atexit
import datetime
import difflib
import shutil
import tempfile
from collections import defaultdict, Counter, OrderedDict
from random import Random

import hail
from hail.typecheck import enumeration, typecheck, nullable
from hail.utils.java import Env, joption, error


@typecheck(n_rows=int, n_cols=int, n_partitions=nullable(int))
def range_matrix_table(n_rows, n_cols, n_partitions=None) -> 'hail.MatrixTable':
    """Construct a matrix table with row and column indices and no entry fields.

    Examples
    --------

    >>> range_ds = hl.utils.range_matrix_table(n_rows=100, n_cols=10)

    >>> range_ds.count_rows()
    100

    >>> range_ds.count_cols()
    10

    Notes
    -----
    The resulting matrix table contains the following fields:
Ejemplo n.º 30
0
class HailContext(object):
    """Holds the state of the active Hail session.

    Constructing an instance registers it as ``Env._hc`` and seeds ``Env``;
    only one instance may be registered at a time (enforced with an
    assertion). Use :meth:`create` or :meth:`async_create` to build a context
    whose reference genomes are already initialized.
    """

    @staticmethod
    async def async_create(log: str, quiet: bool, append: bool, tmpdir: str,
                           local_tmpdir: str, default_reference: str,
                           global_seed: Optional[int],
                           backend: Backend) -> 'HailContext':
        """Asynchronously build a context and load its reference genomes.

        Same as :meth:`create`, except that the built-in references are
        fetched with ``backend._async_get_references``.

        Parameters
        ----------
        log, quiet, append, tmpdir, local_tmpdir, global_seed, backend
            Forwarded unchanged to the constructor.
        default_reference : str
            Name (or path, see :meth:`initialize_references`) of the
            reference genome to use as the default.
        """
        hc = HailContext(log=log,
                         quiet=quiet,
                         append=append,
                         tmpdir=tmpdir,
                         local_tmpdir=local_tmpdir,
                         global_seed=global_seed,
                         backend=backend)
        references = await backend._async_get_references(BUILTIN_REFERENCES)
        hc.initialize_references(references, default_reference)
        return hc

    @staticmethod
    def create(log: str, quiet: bool, append: bool, tmpdir: str,
               local_tmpdir: str, default_reference: str,
               global_seed: Optional[int],
               backend: Backend) -> 'HailContext':
        """Build a context and load its reference genomes.

        Parameters
        ----------
        log, quiet, append, tmpdir, local_tmpdir, global_seed, backend
            Forwarded unchanged to the constructor.
        default_reference : str
            Name (or path, see :meth:`initialize_references`) of the
            reference genome to use as the default.
        """
        hc = HailContext(log=log,
                         quiet=quiet,
                         append=append,
                         tmpdir=tmpdir,
                         local_tmpdir=local_tmpdir,
                         global_seed=global_seed,
                         backend=backend)
        references = backend.get_references(BUILTIN_REFERENCES)
        hc.initialize_references(references, default_reference)
        return hc

    @typecheck_method(log=str,
                      quiet=bool,
                      append=bool,
                      tmpdir=str,
                      local_tmpdir=str,
                      global_seed=nullable(int),
                      backend=Backend)
    def __init__(self, log, quiet, append, tmpdir, local_tmpdir, global_seed,
                 backend):
        """Record session settings, print the banner and register the context.

        Parameters
        ----------
        log : str
            Log location; stored and echoed in the banner.
        quiet : bool
            When true, the welcome banner is not written to stderr.
        append : bool
            Accepted and type-checked, but not used by this constructor.
        tmpdir, local_tmpdir : str
            Stored on the instance.
        global_seed : int, optional
            Seed passed to ``Env.set_seed``; a fixed default is used when
            ``None``.
        backend : Backend
            Backend stored on the instance and stopped by :meth:`stop`.
        """
        # Only one context may be registered in Env at a time.
        assert not Env._hc

        self._log = log

        self._tmpdir = tmpdir
        self._local_tmpdir = local_tmpdir

        self._backend = backend

        self._warn_cols_order = True
        self._warn_entries_order = True

        # Populated later by initialize_references().
        self._default_ref: Optional[ReferenceGenome] = None

        if not quiet:
            py_version = version()
            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(py_version))

            if py_version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        # Fall back to a fixed seed so runs are seeded even when the caller
        # did not supply one.
        if global_seed is None:
            global_seed = 6348563392232659379
        Env.set_seed(global_seed)
        Env._hc = self

    def initialize_references(self, references, default_reference):
        """Register reference genomes and select the default one.

        Parameters
        ----------
        references : iterable
            Reference configurations, each passed to
            ``ReferenceGenome._from_config``.
        default_reference : str
            Looked up by name in ``ReferenceGenome._references``; if absent
            it is handed to ``ReferenceGenome.read`` instead (presumably as a
            path -- confirm against ``ReferenceGenome.read``).
        """
        for ref in references:
            # NOTE(review): the meaning of the second positional argument is
            # not visible here -- confirm against ReferenceGenome._from_config.
            ReferenceGenome._from_config(ref, True)

        if default_reference in ReferenceGenome._references:
            self._default_ref = ReferenceGenome._references[default_reference]
        else:
            self._default_ref = ReferenceGenome.read(default_reference)

    @property
    def default_reference(self) -> ReferenceGenome:
        """The default reference genome set by :meth:`initialize_references`."""
        assert self._default_ref is not None, '_default_ref should have been initialized in HailContext.create'
        return self._default_ref

    def stop(self):
        """Stop the backend and clear the session state held in ``Env``.

        Also clears session functions via ``hail.ir.clear_session_functions``
        and empties the ``ReferenceGenome._references`` registry.
        """
        self._backend.stop()
        self._backend = None
        Env._hc = None
        Env._dummy_table = None
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}
Ejemplo n.º 31
0
import operator
import functools
import hail as hl
from hail.genetics.reference_genome import reference_genome_type
from hail.typecheck import typecheck, nullable, sequenceof
from hail.utils.java import info
from hail.utils import new_temp_file


@typecheck(path=str,
           reference_genome=nullable(reference_genome_type),
           skip_invalid_contigs=bool,
           min_partitions=nullable(int),
           force_bgz=bool,
           force=bool)
def import_gtf(path,
               reference_genome=None,
               skip_invalid_contigs=False,
               min_partitions=None,
               force_bgz=False,
               force=False) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
Ejemplo n.º 32
0
        _col_val=hl.array([hl.array([field, ht[field]]) for field in fields]))
    ht = ht.drop(*fields)
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)


@typecheck(ht=Table,
           field=str,
           value=str,
           key=nullable(oneof(str, sequenceof(str))))
def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in R's
    `tidyr` package. This is a way to turn "long" format data into "wide"
    format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to the
    ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row field
    for each unique value of ``field``, where the row field values are given by the
    corresponding ``value`` in the original ``ht``.
Ejemplo n.º 33
0
@author: nbaya
"""

import hail as hl
from hail.expr.expressions import expr_int32, expr_int64, expr_float32, expr_float64
from hail.typecheck import typecheck, oneof, nullable
from hail.matrixtable import MatrixTable
import re
from datetime import datetime, timedelta

@typecheck(mt=MatrixTable, 
           genotype=oneof(expr_int32,
                          expr_int64, 
                          expr_float32, 
                          expr_float64),
           h2=oneof(nullable(float),
                    nullable(int)),
           pi=oneof(float,int),
           is_annot_inf=bool,
           annot_coef_dict=nullable(dict),
           annot_regex=nullable(str),
           h2_normalize=bool,
           is_popstrat=bool,
           cov_coef_dict=nullable(dict),
           cov_regex=nullable(str),
           path_to_save=nullable(str))
def simulate_phenotypes(mt, genotype, h2=None, pi=1, is_annot_inf=False, annot_coef_dict=None,
                        annot_regex=None,h2_normalize=True, is_popstrat=False, cov_coef_dict=None,
                        cov_regex=None, path_to_save=None):
    r'''Simulate phenotypes for testing LD score regression.