Ejemplo n.º 1
0
class BlockMatrixFilter(BlockMatrixIR):
    """Block matrix IR node parameterised by per-dimension index lists.

    ``indices_to_keep`` holds one sequence of integer indices per
    dimension; ``_compute_type`` requires exactly two of them.
    """

    @typecheck_method(child=BlockMatrixIR,
                      indices_to_keep=sequenceof(sequenceof(int)))
    def __init__(self, child, indices_to_keep):
        super().__init__(child)
        self.child = child
        self.indices_to_keep = indices_to_keep

    def head_str(self):
        """Serialize the nested index lists for this node's head string."""
        serialized_dims = []
        for kept in self.indices_to_keep:
            serialized_dims.append(_serialize_list(kept))
        return _serialize_list(serialized_dims)

    def _eq(self, other):
        """Compare this node's own state (the index lists) with ``other``."""
        return self.indices_to_keep == other.indices_to_keep

    def _compute_type(self):
        """Set ``self._type`` from the child's type and the kept indices.

        A dimension whose index list is empty takes its extent from the
        child's ``shape`` at the same position; otherwise the extent is the
        number of indices listed for that dimension.
        """
        assert len(self.indices_to_keep) == 2

        extents = []
        for dim, kept in enumerate(self.indices_to_keep):
            if len(kept) == 0:
                extents.append(self.child.typ.shape[dim])
            else:
                extents.append(len(kept))

        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            extents[0], extents[1])
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.child.typ.block_size)
Ejemplo n.º 2
0
class ValueToBlockMatrix(BlockMatrixIR):
    """Block matrix IR node built from a value IR ``child``.

    Stores the requested two-dimensional ``shape``, the ``block_size`` and
    the ``dims_partitioned`` flags alongside the child.
    """

    @typecheck_method(child=IR,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        """Render this node as an s-expression, using ``r`` for the child."""
        shape_str = _serialize_ints(self.shape)
        partitioned_str = _serialize_ints(self.dims_partitioned)
        template = '(ValueToBlockMatrix {} {} {} {})'
        return template.format(shape_str, self.block_size, partitioned_str,
                               r(self.child))

    def _compute_type(self):
        """Set ``self._type``; an array child contributes its element type."""
        child_type = self.child.typ
        element_type = (child_type._element_type
                        if isinstance(child_type, tarray)
                        else child_type)

        assert len(self.shape) == 2
        n_rows = self.shape[0]
        n_cols = self.shape[1]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            n_rows, n_cols)
        self._type = tblockmatrix(element_type, tensor_shape, is_row_vector,
                                  self.block_size, self.dims_partitioned)
Ejemplo n.º 3
0
class BlockMatrixBroadcast(BlockMatrixIR):
    """Block matrix IR node carrying an index expression, a target shape
    and a block size for its block matrix ``child``.
    """

    @typecheck_method(child=BlockMatrixIR,
                      in_index_expr=sequenceof(int),
                      shape=sequenceof(int),
                      block_size=int)
    def __init__(self, child, in_index_expr, shape, block_size):
        super().__init__(child)
        self.child = child
        self.in_index_expr = in_index_expr
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        """Return the index expression, shape and block size, space-separated."""
        index_str = _serialize_list(self.in_index_expr)
        shape_str = _serialize_list(self.shape)
        return '{} {} {}'.format(index_str, shape_str, self.block_size)

    def _eq(self, other):
        """Compare index expression, shape and block size with ``other``."""
        return (self.in_index_expr == other.in_index_expr
                and self.shape == other.shape
                and self.block_size == other.block_size)

    def _compute_type(self):
        """Set ``self._type`` from the child's element type and ``self.shape``."""
        assert len(self.shape) == 2
        n_rows = self.shape[0]
        n_cols = self.shape[1]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            n_rows, n_cols)
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.block_size)
Ejemplo n.º 4
0
class BlockMatrixBroadcast(BlockMatrixIR):
    """Block matrix IR node identified by a ``broadcast_kind`` string, with
    an explicit shape, block size and per-dimension partitioning flags.
    """

    @typecheck_method(child=BlockMatrixIR,
                      broadcast_kind=str,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, broadcast_kind, shape, block_size,
                 dims_partitioned):
        super().__init__()
        self.child = child
        self.broadcast_kind = broadcast_kind
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        """Render this node as an s-expression, using ``r`` for the child."""
        kind_str = escape_str(self.broadcast_kind)
        shape_str = ' '.join(map(str, self.shape))
        partitioned_str = ' '.join(map(str, self.dims_partitioned))
        template = '(BlockMatrixBroadcast {} ({}) {} ({}) {})'
        return template.format(kind_str, shape_str, self.block_size,
                               partitioned_str, r(self.child))

    def _compute_type(self):
        """Set ``self._type`` from the child's element type and own fields."""
        element_type = self.child.typ.element_type
        self._type = tblockmatrix(element_type, self.shape,
                                  self.block_size, self.dims_partitioned)
Ejemplo n.º 5
0
class ValueToBlockMatrix(BlockMatrixIR):
    """Block matrix IR node built from a value IR ``child`` with an explicit
    shape, block size and per-dimension partitioning flags.
    """

    @typecheck_method(child=IR,
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, shape, block_size, dims_partitioned):
        super().__init__()
        self.child = child
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        """Render this node as an s-expression, using ``r`` for the child."""
        shape_str = ' '.join(map(str, self.shape))
        partitioned_str = ' '.join(map(str, self.dims_partitioned))
        template = '(ValueToBlockMatrix ({}) {} ({}) {})'
        return template.format(shape_str, self.block_size, partitioned_str,
                               r(self.child))

    def _compute_type(self):
        """Set ``self._type``; an array child contributes its element type."""
        child_type = self.child.typ
        element_type = (child_type._element_type
                        if isinstance(child_type, tarray)
                        else child_type)

        self._type = tblockmatrix(element_type, self.shape, self.block_size,
                                  self.dims_partitioned)
Ejemplo n.º 6
0
class BlockMatrixBroadcast(BlockMatrixIR):
    """Block matrix IR node carrying an index expression, a two-dimensional
    shape, a block size and per-dimension partitioning flags.
    """

    @typecheck_method(child=BlockMatrixIR,
                      in_index_expr=sequenceof(int),
                      shape=sequenceof(int),
                      block_size=int,
                      dims_partitioned=sequenceof(bool))
    def __init__(self, child, in_index_expr, shape, block_size,
                 dims_partitioned):
        super().__init__()
        self.child = child
        self.in_index_expr = in_index_expr
        self.shape = shape
        self.block_size = block_size
        self.dims_partitioned = dims_partitioned

    def render(self, r):
        """Render this node as an s-expression, using ``r`` for the child."""
        index_str = _serialize_ints(self.in_index_expr)
        shape_str = _serialize_ints(self.shape)
        partitioned_str = _serialize_ints(self.dims_partitioned)
        template = '(BlockMatrixBroadcast {} {} {} {} {})'
        return template.format(index_str, shape_str, self.block_size,
                               partitioned_str, r(self.child))

    def _compute_type(self):
        """Set ``self._type`` from the child's element type and own fields."""
        assert len(self.shape) == 2
        n_rows = self.shape[0]
        n_cols = self.shape[1]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            n_rows, n_cols)
        self._type = tblockmatrix(self.child.typ.element_type, tensor_shape,
                                  is_row_vector, self.block_size,
                                  self.dims_partitioned)
Ejemplo n.º 7
0
class BlockMatrixAgg(BlockMatrixIR):
    """Block matrix IR node parameterised by ``out_index_expr``.

    ``_compute_type`` accepts only ``[0, 1]``, ``[0]`` or ``[1]`` as the
    index expression.
    """

    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__(child)
        self.child = child
        self.out_index_expr = out_index_expr

    def head_str(self):
        """Return the serialized index expression."""
        return _serialize_list(self.out_index_expr)

    def _eq(self, other):
        """Compare the index expression with that of ``other``."""
        return self.out_index_expr == other.out_index_expr

    def _compute_type(self):
        """Set ``self._type`` according to ``out_index_expr``.

        * ``[0, 1]`` -> empty shape, not a row vector
        * ``[0]``    -> ``[matrix_shape[1]]``, row vector
        * ``[1]``    -> ``[matrix_shape[0]]``, not a row vector

        :raises ValueError: for any other index expression.
        """
        matrix_shape = tensor_shape_to_matrix_shape(self.child)
        axes = self.out_index_expr

        if axes == [0, 1]:
            shape, is_row_vector = [], False
        elif axes == [0]:
            shape, is_row_vector = [matrix_shape[1]], True
        elif axes == [1]:
            shape, is_row_vector = [matrix_shape[0]], False
        else:
            raise ValueError("Invalid out_index_expr")

        child_typ = self.child.typ
        self._type = tblockmatrix(child_typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
Ejemplo n.º 8
0
class ValueToBlockMatrix(BlockMatrixIR):
    """Block matrix IR node built from a value IR ``child`` with a
    two-dimensional ``shape`` and a ``block_size``.
    """

    @typecheck_method(child=IR, shape=sequenceof(int), block_size=int)
    def __init__(self, child, shape, block_size):
        super().__init__(child)
        self.child = child
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        """Return the serialized shape followed by the block size."""
        shape_str = _serialize_list(self.shape)
        return '{} {}'.format(shape_str, self.block_size)

    def _eq(self, other):
        """Compare shape and block size with ``other``."""
        return (self.shape == other.shape
                and self.block_size == other.block_size)

    def _compute_type(self):
        """Set ``self._type``; an array child contributes its element type."""
        child_type = self.child.typ
        element_type = (child_type._element_type
                        if isinstance(child_type, tarray)
                        else child_type)

        assert len(self.shape) == 2
        n_rows = self.shape[0]
        n_cols = self.shape[1]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            n_rows, n_cols)
        self._type = tblockmatrix(element_type, tensor_shape, is_row_vector,
                                  self.block_size)
Ejemplo n.º 9
0
class BlockMatrixRandom(BlockMatrixIR):
    """Childless block matrix IR node described by a seed, a ``gaussian``
    flag, a two-dimensional shape and a block size.
    """

    @typecheck_method(seed=int,
                      gaussian=bool,
                      shape=sequenceof(int),
                      block_size=int)
    def __init__(self, seed, gaussian, shape, block_size):
        super().__init__()
        self.seed = seed
        self.gaussian = gaussian
        self.shape = shape
        self.block_size = block_size

    def head_str(self):
        """Return seed, gaussian flag, serialized shape and block size."""
        shape_str = _serialize_list(self.shape)
        return '{} {} {} {}'.format(self.seed, self.gaussian, shape_str,
                                    self.block_size)

    def _eq(self, other):
        """Compare all four parameters with those of ``other``."""
        return (self.seed == other.seed
                and self.gaussian == other.gaussian
                and self.shape == other.shape
                and self.block_size == other.block_size)

    def _compute_type(self):
        """Set ``self._type`` to a float64 block matrix type of ``self.shape``."""
        assert len(self.shape) == 2
        n_rows = self.shape[0]
        n_cols = self.shape[1]
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            n_rows, n_cols)

        self._type = tblockmatrix(hl.tfloat64, tensor_shape, is_row_vector,
                                  self.block_size)
Ejemplo n.º 10
0
class TableNativeReader(TableReader):
    """Reader description for a native table at ``path``.

    Optionally carries a list of intervals and a ``filter_intervals`` flag,
    which are emitted as ``NativeReaderOptions`` when rendering.
    """

    @typecheck_method(path=str,
                      intervals=nullable(sequenceof(anytype)),
                      filter_intervals=bool)
    def __init__(self, path, intervals, filter_intervals):
        """Validate ``intervals`` and normalise them to struct-keyed points.

        :param path: path of the table to read.
        :param intervals: optional sequence of intervals; its imputed type
            must be an array of intervals.
        :param filter_intervals: flag forwarded to the reader options.
        :raises TypeError: if ``intervals`` is not an array of intervals.
        """
        # Always define the attribute so it can be inspected even when no
        # intervals were supplied.
        self._interval_type = None

        if intervals is not None:
            t = hl.expr.impute_type(intervals)
            # Both conditions must hold, so reject when EITHER fails.  (The
            # previous ``and`` let arrays of non-intervals through and hit
            # ``t.element_type`` on non-array types.)
            if not isinstance(t, hl.tarray) or not isinstance(
                    t.element_type, hl.tinterval):
                raise TypeError("'intervals' must be an array of tintervals")
            pt = t.element_type.point_type
            if isinstance(pt, hl.tstruct):
                self._interval_type = t
            else:
                # NOTE: ``__point`` appears inside a class body, so Python's
                # private-name mangling applies to this keyword; the same
                # spelling is used for the values below, keeping the type and
                # the data consistent.  Do not replace with a string key.
                self._interval_type = hl.tarray(
                    hl.tinterval(hl.tstruct(__point=pt)))

        self.path = path
        self.filter_intervals = filter_intervals
        if intervals is not None and t != self._interval_type:
            # Points were not structs: wrap each endpoint to match the
            # struct-based interval type chosen above.
            self.intervals = [
                hl.Interval(hl.Struct(__point=i.start),
                            hl.Struct(__point=i.end), i.includes_start,
                            i.includes_end) for i in intervals
            ]
        else:
            self.intervals = intervals

    def render(self):
        """Return the escaped JSON description of this reader."""
        reader = {'name': 'TableNativeReader', 'path': self.path}
        if self.intervals is not None:
            assert self._interval_type is not None
            reader['options'] = {
                'name':
                'NativeReaderOptions',
                'intervals':
                self._interval_type._convert_to_json(self.intervals),
                'intervalPointType':
                self._interval_type.element_type.point_type._parsable_string(),
                'filterIntervals':
                self.filter_intervals
            }
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        """Readers are equal when path, intervals and filter flag all match."""
        return isinstance(other, TableNativeReader) and \
            other.path == self.path and \
            other.intervals == self.intervals and \
            other.filter_intervals == self.filter_intervals
Ejemplo n.º 11
0
class BlockMatrixAgg(BlockMatrixIR):
    """Block matrix IR node parameterised by ``out_index_expr``, a sequence
    of indices into the child's ``shape``.
    """

    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__()
        self.child = child
        self.out_index_expr = out_index_expr

    def render(self, r):
        """Render this node as an s-expression, using ``r`` for the child."""
        index_str = _serialize_list(self.out_index_expr)
        template = '(BlockMatrixAgg {} {})'
        return template.format(index_str, r(self.child))

    def _compute_type(self):
        """Set ``self._type``: shape picks the child's extents named by
        ``out_index_expr``; the result is a row vector only for ``[1]``.
        """
        shape = []
        for axis in self.out_index_expr:
            shape.append(self.child.typ.shape[axis])
        is_row_vector = self.out_index_expr == [1]

        child_typ = self.child.typ
        self._type = tblockmatrix(child_typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
Ejemplo n.º 12
0
class StringTableReader(TableReader):
    """Reader description for one or more text files.

    ``paths`` is a single path or a sequence of paths; ``min_partitions``
    is an optional integer forwarded in the rendered description.
    """

    @typecheck_method(paths=oneof(str, sequenceof(str)),
                      min_partitions=nullable(int))
    def __init__(self, paths, min_partitions):
        self.paths = paths
        self.min_partitions = min_partitions

    def render(self):
        """Return the escaped JSON description of this reader."""
        reader = {
            'name': 'StringTableReader',
            'files': self.paths,
            'minPartitions': self.min_partitions
        }
        return escape_str(json.dumps(reader))

    def __eq__(self, other):
        """Readers are equal when their paths and minimum partitions match."""
        # The attribute is ``paths`` (plural); the former ``path`` lookup
        # raised AttributeError whenever two readers were compared.
        return isinstance(other, StringTableReader) and \
            other.paths == self.paths and \
            other.min_partitions == self.min_partitions
Ejemplo n.º 13
0
class BlockMatrixSlice(BlockMatrixIR):
    """Block matrix IR node parameterised by one ``slice`` per dimension;
    ``_compute_type`` requires exactly two slices.
    """

    @typecheck_method(child=BlockMatrixIR, slices=sequenceof(slice))
    def __init__(self, child, slices):
        super().__init__(child)
        self.child = child
        self.slices = slices

    def head_str(self):
        """Return the slices serialized as ``(start stop step)`` triples."""
        triples = []
        for s in self.slices:
            triples.append(f'({s.start} {s.stop} {s.step})')
        return '{}'.format(_serialize_list(triples))

    def _eq(self, other):
        """Compare the slices with those of ``other``."""
        return self.slices == other.slices

    def _compute_type(self):
        """Set ``self._type``; each extent is the number of indices the
        corresponding slice's start/stop/step select.
        """
        assert len(self.slices) == 2
        extents = []
        for s in self.slices:
            extents.append(1 + (s.stop - s.start - 1) // s.step)
        tensor_shape, is_row_vector = _matrix_shape_to_tensor_shape(
            extents[0], extents[1])
        self._type = tblockmatrix(self.child.typ.element_type,
                                  tensor_shape,
                                  is_row_vector,
                                  self.child.typ.block_size)
Ejemplo n.º 14
0
class BlockMatrixAgg(BlockMatrixIR):
    """Block matrix IR node parameterised by ``out_index_expr``, a sequence
    of indices into the child's matrix shape.
    """

    @typecheck_method(child=BlockMatrixIR, out_index_expr=sequenceof(int))
    def __init__(self, child, out_index_expr):
        super().__init__(child)
        self.child = child
        self.out_index_expr = out_index_expr

    def head_str(self):
        """Return the serialized index expression."""
        return _serialize_list(self.out_index_expr)

    def _eq(self, other):
        """Compare the index expression with that of ``other``."""
        return self.out_index_expr == other.out_index_expr

    def _compute_type(self):
        """Set ``self._type``: shape picks the child's matrix extents named
        by ``out_index_expr``; the result is a row vector only for ``[1]``.
        """
        shape = []
        for axis in self.out_index_expr:
            shape.append(tensor_shape_to_matrix_shape(self.child)[axis])
        is_row_vector = self.out_index_expr == [1]

        child_typ = self.child.typ
        self._type = tblockmatrix(child_typ.element_type, shape,
                                  is_row_vector, self.child.typ.block_size)
Ejemplo n.º 15
0
import hail as hl
from hail.expr.expressions import expr_float64, expr_numeric, analyze
from hail.typecheck import typecheck, oneof, sequenceof, nullable
from hail.table import Table
from hail.matrixtable import MatrixTable
from hail.utils import wrap_to_list, new_temp_file
import numpy as np


@typecheck(weight_expr=expr_float64,
           ld_score_expr=expr_numeric,
           chi_sq_exprs=oneof(expr_float64, sequenceof(expr_float64)),
           n_samples_exprs=oneof(expr_numeric, sequenceof(expr_numeric)),
           n_blocks=int,
           two_step_threshold=int,
           n_reference_panel_variants=nullable(int))
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:
Ejemplo n.º 16
0
from functools import reduce

import hail as hl
from hail.expr.functions import _ndarray
from hail.expr.functions import array as aarray
from hail.expr.types import HailType, tfloat64, ttuple, tndarray
from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof
from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple,
                                   expr_any, expr_array, expr_ndarray,
                                   expr_numeric, Int64Expression, cast_expr,
                                   construct_expr)
from hail.expr.expressions.typed_expressions import NDArrayNumericExpression
from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat

# Typecheck spec accepting any of: a sequence of ndarray expressions, a tuple
# of ndarray expressions, or an array expression whose elements are ndarrays.
tsequenceof_nd = oneof(sequenceof(expr_ndarray()), tupleof(expr_ndarray()),
                       expr_array(expr_ndarray()))
# Typecheck spec accepting any of: an int64 expression, a tuple of int64
# expressions, or a tuple expression.
shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple())


def array(input_array, dtype=None):
    """Construct an :class:`.NDArrayExpression`

    Examples
    --------

    >>> hl.eval(hl.nd.array([1, 2, 3, 4]))
    array([1, 2, 3, 4], dtype=int32)

    >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]]))
    array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)
Ejemplo n.º 17
0
from typing import List

from hail import MatrixTable
from hail.ir import MatrixMultiWrite, MatrixNativeMultiWriter
from hail.typecheck import sequenceof, typecheck
from hail.utils.java import Env


@typecheck(mts=sequenceof(MatrixTable),
           prefix=str,
           overwrite=bool,
           stage_locally=bool)
def write_matrix_tables(mts: List[MatrixTable],
                        prefix: str,
                        overwrite: bool = False,
                        stage_locally: bool = False):
    """Execute a single multi-write IR over all of ``mts``.

    A ``MatrixNativeMultiWriter`` is built from ``prefix``, ``overwrite``
    and ``stage_locally`` and executed on the current backend together
    with the IR of every matrix table.
    """
    multi_writer = MatrixNativeMultiWriter(prefix, overwrite, stage_locally)
    matrix_irs = [matrix_table._mir for matrix_table in mts]
    Env.backend().execute(MatrixMultiWrite(matrix_irs, multi_writer))
Ejemplo n.º 18
0
import numpy as np
import hail as hl
from hail.table import Table
from hail.linalg import BlockMatrix
from hail.typecheck import typecheck, nullable, sequenceof, oneof
from hail.expr.expressions import expr_float64, expr_numeric, expr_locus
from hail.utils import new_temp_file, wrap_to_list


@typecheck(entry_expr=expr_float64,
           locus_expr=expr_locus(),
           radius=oneof(int, float),
           coord_expr=nullable(expr_float64),
           annotation_exprs=nullable(oneof(expr_numeric,
                                           sequenceof(expr_numeric))),
           block_size=nullable(int))
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
Ejemplo n.º 19
0
class Pedigree(object):
    """Class containing a list of trios, with extra functionality.

    :param trios: list of trio objects to include in pedigree
    :type trios: list of :class:`.Trio`
    """
    @typecheck_method(trios=sequenceof(Trio))
    def __init__(self, trios):
        # Stored as a tuple so the pedigree is hashable (see ``__hash__``).
        self._trios = tuple(trios)

    def __eq__(self, other):
        return isinstance(other, Pedigree) and self._trios == other._trios

    def __hash__(self):
        return hash(self._trios)

    def __iter__(self):
        return self._trios.__iter__()

    @classmethod
    @typecheck_method(fam_path=str, delimiter=str)
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        Notes
        -----

        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        Every line must split into exactly six fields. A value of ``"0"``
        in the family, father or mother field is stored as ``None``. A sex
        field other than ``"1"`` or ``"2"`` is stored as ``None`` and
        reported in a single warning after the whole file has been read.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter (a regular expression).

        :rtype: :class:`.Pedigree`

        :raises FatalError: if a line does not have six fields, or if a
            proband ID occurs more than once.
        """

        trios = []
        missing_sex_count = 0
        missing_sex_values = set()
        with Env.fs().open(fam_path) as file:
            for line in file:
                split_line = re.split(delimiter, line.strip())
                num_fields = len(split_line)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}"
                        .format(num_fields, line))
                # The sixth field (phenotype) is read but not kept.
                (fam, kid, dad, mom, sex, _) = tuple(split_line)
                # 1 is male, 2 is female, 0 is unknown.
                is_female = (sex == "2") if sex in ("1", "2") else None

                if is_female is None:
                    missing_sex_count += 1
                    missing_sex_values.add(kid)

                trio = Trio(kid, fam if fam != "0" else None,
                            dad if dad != "0" else None,
                            mom if mom != "0" else None, is_female)
                trios.append(trio)

        only_ids = [trio.s for trio in trios]
        duplicate_ids = [
            sample_id for sample_id, count in Counter(only_ids).items()
            if count > 1
        ]
        if duplicate_ids:
            raise FatalError(
                "Invalid pedigree: found duplicate proband IDs\n{}".format(
                    duplicate_ids))

        if missing_sex_count > 0:
            # Render the IDs as a sorted, comma-separated list instead of the
            # raw ``set`` repr, so the message is stable and readable.
            warning(
                "Found {} samples with missing sex information (not 1 or 2).\n Missing samples: [{}]"
                .format(missing_sex_count,
                        ', '.join(sorted(missing_sex_values))))

        # ``cls`` rather than a hard-coded class, as usual for an alternate
        # constructor.
        return cls(trios)

    @property
    def trios(self):
        """List of trio objects in this pedigree.

        :rtype: list of :class:`.Trio`
        """
        return self._trios

    def complete_trios(self):
        """List of trio objects that have a defined father and mother.

        :rtype: list of :class:`.Trio`
        """
        return [trio for trio in self.trios if trio.is_complete()]

    @typecheck_method(samples=sequenceof(nullable(str)))
    def filter_to(self, samples):
        """Filter the pedigree to a given list of sample IDs.

        **Notes**

        For any trio, the following steps will be applied:

         - If the proband is not in the list of samples provided, the trio is removed.
         - If the father is not in the list of samples provided, `pat_id` is set to ``None``.
         - If the mother is not in the list of samples provided, `mat_id` is set to ``None``.

        :param samples: list of sample IDs to keep
        :type samples: list of str

        :rtype: :class:`.Pedigree`
        """
        sample_set = set(samples)

        filtered_trios = []
        for trio in self._trios:
            # ``_restrict_to`` returns ``None`` for a trio that is dropped.
            restricted_trio = trio._restrict_to(sample_set)
            if restricted_trio is not None:
                filtered_trios.append(restricted_trio)

        return Pedigree(filtered_trios)

    @typecheck_method(path=str)
    def write(self, path):
        """Write a .fam file to the given path.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')
        >>> ped.write('output/out.fam')

        **Notes**

        This method writes a `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_.

        .. caution::

            Phenotype information is not preserved in the Pedigree data structure in Hail.
            Reading and writing a PLINK .fam file will result in loss of this information.
            Use :func:`.import_fam` to manipulate this information.

        :param path: output path
        :type path: str
        """

        lines = [t._to_fam_file_line() for t in self._trios]

        with Env.fs().open(path, mode="w") as file:
            for line in lines:
                file.write(line + "\n")
Ejemplo n.º 20
0
from typing import List, Optional

from hail import MatrixTable
from hail.linalg import BlockMatrix
from hail.ir import MatrixMultiWrite, MatrixNativeMultiWriter, BlockMatrixMultiWrite, BlockMatrixBinaryMultiWriter, BlockMatrixTextMultiWriter, BlockMatrixNativeMultiWriter
from hail.typecheck import nullable, sequenceof, typecheck, enumeration
from hail.utils.java import Env


@typecheck(mts=sequenceof(MatrixTable),
           prefix=str,
           overwrite=bool,
           stage_locally=bool)
def write_matrix_tables(mts: List[MatrixTable],
                        prefix: str,
                        overwrite: bool = False,
                        stage_locally: bool = False):
    """Execute a single multi-write IR over all of ``mts``.

    One output path is generated per matrix table as ``<prefix><index>.mt``,
    where the index is left-padded with zeros to the width of the largest
    index so that all paths have equal length.
    """
    n_tables = len(mts)
    # Number of digits in the largest index.
    width = len(str(n_tables - 1))
    paths = []
    for idx in range(n_tables):
        paths.append(f"{prefix}{str(idx).rjust(width, '0')}.mt")

    multi_writer = MatrixNativeMultiWriter(paths, overwrite, stage_locally)
    matrix_irs = [matrix_table._mir for matrix_table in mts]
    Env.backend().execute(MatrixMultiWrite(matrix_irs, multi_writer))


@typecheck(bms=sequenceof(BlockMatrix), prefix=str, overwrite=bool)
def block_matrices_tofiles(bms: List[BlockMatrix],
                           prefix: str,
                           overwrite: bool = False):
    writer = BlockMatrixBinaryMultiWriter(prefix, overwrite)
    Env.backend().execute(
Ejemplo n.º 21
0
class ttable(object):
    """Type of a table: a global struct type, a row struct type and the
    list of row field names that form the key.
    """

    @staticmethod
    def _from_java(jtt):
        """Build a ``ttable`` from a Java table-type object."""
        global_type = dtype(jtt.globalType().toString())
        row_type = dtype(jtt.rowType().toString())
        row_key = jiterable_to_list(jtt.key())
        return ttable(global_type, row_type, row_key)

    @staticmethod
    def _from_json(json):
        """Build a ``ttable`` from a dict with ``global``, ``row`` and
        ``row_key`` entries.
        """
        global_type = dtype(json['global'])
        row_type = dtype(json['row'])
        return ttable(global_type, row_type, json['row_key'])

    @typecheck_method(global_type=tstruct,
                      row_type=tstruct,
                      row_key=sequenceof(str))
    def __init__(self, global_type, row_type, row_key):
        self.global_type = global_type
        self.row_type = row_type
        self.row_key = row_key

    def __eq__(self, other):
        if not isinstance(other, ttable):
            return False
        return (self.global_type == other.global_type
                and self.row_type == other.row_type
                and self.row_key == other.row_key)

    def __hash__(self):
        # Derived from the string form, which includes all three fields.
        return 43 + hash(str(self))

    def __repr__(self):
        return f'ttable(global_type={self.global_type!r}, row_type={self.row_type!r}, row_key={self.row_key!r})'

    def _key_str(self):
        """Return the escaped key field names joined by ``', '``."""
        return ', '.join(map(escape_parsable, self.row_key))

    def __str__(self):
        return f'table {{global: {self.global_type}, row: {self.row_type}, row_key: [{self._key_str()}]}}'

    def pretty(self, indent=0, increment=4):
        """Return a multi-line rendering of this type.

        :param indent: number of spaces before the opening and closing lines.
        :param increment: additional spaces used for the nested entries.
        """
        inner = indent + increment
        pieces = [' ' * indent, 'table {\n', ' ' * inner, 'global: ']
        # The struct types append their own rendering to ``pieces``.
        self.global_type._pretty(pieces, inner, increment)
        pieces.extend([',\n', ' ' * inner, 'row: '])
        self.row_type._pretty(pieces, inner, increment)
        pieces.extend([',\n', ' ' * inner])
        pieces.append(f'row_key: [{self._key_str()}]\n')
        pieces.extend([' ' * indent, '}'])
        return ''.join(pieces)

    @property
    def key_type(self):
        """Row type restricted to the key fields."""
        return self.row_type._select_fields(self.row_key)

    @property
    def value_type(self):
        """Row type with the key fields dropped."""
        key_fields = set(self.row_key)
        return self.row_type._drop_fields(key_fields)

    def _rename(self, global_map, row_map):
        """Return a new ``ttable`` with fields renamed per the two maps;
        key names absent from ``row_map`` are kept unchanged.
        """
        renamed_global = self.global_type._rename(global_map)
        renamed_row = self.row_type._rename(row_map)
        renamed_key = [row_map.get(name, name) for name in self.row_key]
        return ttable(renamed_global, renamed_row, renamed_key)

    def row_env(self, default_value=None):
        """Map ``'global'`` and ``'row'`` to their types, or to
        ``default_value`` when one is given.
        """
        if default_value is not None:
            return {'global': default_value, 'row': default_value}
        return {'global': self.global_type, 'row': self.row_type}

    def global_env(self, default_value=None):
        """Map ``'global'`` to its type, or to ``default_value`` when one
        is given.
        """
        if default_value is not None:
            return {'global': default_value}
        return {'global': self.global_type}
Ejemplo n.º 22
0
                                       includes_start=True,
                                       includes_end=True,
                                       reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht


@typecheck(gene_symbols=nullable(sequenceof(str)),
           gene_ids=nullable(sequenceof(str)),
           transcript_ids=nullable(sequenceof(str)),
           verbose=bool,
           reference_genome=nullable(reference_genome_type),
           gtf_file=nullable(str))
def get_gene_intervals(gene_symbols=None,
                       gene_ids=None,
                       transcript_ids=None,
                       verbose=True,
                       reference_genome=None,
                       gtf_file=None):
    """Get intervals of genes or transcripts.

    Get the boundaries of genes or transcripts from a GTF file, for quick filtering of a Table or MatrixTable.
Ejemplo n.º 23
0
class tblockmatrix(object):
    """Type of a block matrix: element type, shape, row-vector flag and
    block size.
    """

    @staticmethod
    def _from_java(jtbm):
        """Build a ``tblockmatrix`` from a Java block-matrix-type object."""
        return tblockmatrix(dtype(jtbm.elementType().toString()),
                            jiterable_to_list(jtbm.shape()),
                            jtbm.isRowVector(), jtbm.blockSize())

    @staticmethod
    def _from_json(json):
        """Build a ``tblockmatrix`` from a dict with ``element_type``,
        ``shape``, ``is_row_vector`` and ``block_size`` entries.
        """
        return tblockmatrix(dtype(json['element_type']), json['shape'],
                            json['is_row_vector'], json['block_size'])

    @typecheck_method(element_type=hail_type,
                      shape=sequenceof(int),
                      is_row_vector=bool,
                      block_size=int)
    def __init__(self, element_type, shape, is_row_vector, block_size):
        self.element_type = element_type
        self.shape = shape
        self.is_row_vector = is_row_vector
        self.block_size = block_size

    def __eq__(self, other):
        return isinstance(other, tblockmatrix) and \
               self.element_type == other.element_type and \
               self.shape == other.shape and \
               self.is_row_vector == other.is_row_vector and \
               self.block_size == other.block_size

    def __hash__(self):
        # Derived from the string form, which includes all four fields.
        return 43 + hash(str(self))

    def __repr__(self):
        return f'tblockmatrix(element_type={self.element_type!r}, shape={self.shape!r}, ' \
            f'is_row_vector={self.is_row_vector!r}, block_size={self.block_size!r})'

    def __str__(self):
        # Closing delimiter is a brace to match the opening one (it was
        # previously a stray parenthesis).
        return f'blockmatrix {{element_type: {self.element_type}, shape: {self.shape}, ' \
            f'is_row_vector: {self.is_row_vector}, block_size: {self.block_size}}}'

    def pretty(self, indent=0, increment=4):
        """Return a multi-line rendering of this type.

        :param indent: number of spaces before the opening and closing lines.
        :param increment: additional spaces used for the nested entries.
        """
        parts = []
        parts.append(' ' * indent)
        parts.append('blockmatrix {\n')
        indent += increment

        parts.append(' ' * indent)
        parts.append('element_type: ')
        # The element type appends its own rendering to ``parts``.
        self.element_type._pretty(parts, indent, increment)
        parts.append(',\n')

        shape_str = ', '.join(str(dim) for dim in self.shape)
        parts.append(' ' * indent)
        parts.append(f'shape: [{shape_str}],\n')

        # ``is_row_vector`` and ``block_size`` are a plain bool and int, which
        # have no ``_pretty`` method; format them directly.
        parts.append(' ' * indent)
        parts.append(f'is_row_vector: {self.is_row_vector},\n')

        parts.append(' ' * indent)
        parts.append(f'block_size: {self.block_size}\n')

        indent -= increment
        parts.append(' ' * indent)
        parts.append('}')

        return ''.join(parts)
Ejemplo n.º 24
0
            **tm.proband_entry,
            **{phased_call_field: tm.__phased_GT[0]}
        ),
        father_entry=hl.struct(
            **tm.father_entry,
            **{phased_call_field: tm.__phased_GT[1]}
        ),
        mother_entry=hl.struct(
            **tm.mother_entry,
            **{phased_call_field: tm.__phased_GT[2]}
        )
    )


@typecheck(tm=MatrixTable,
           col_keys=sequenceof(str),
           keep_trio_cols=bool,
           keep_trio_entries=bool)
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s'], keep_trio_cols: bool = True, keep_trio_entries: bool = False) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
Ejemplo n.º 25
0
        })


def localize(mt):
    """Localize the entries of a ``MatrixTable``; pass anything else through.

    A ``MatrixTable`` is converted with ``_localize_entries`` using the field
    names ``'__entries'`` and ``'__cols'``. Any other object is returned
    unchanged.
    """
    if not isinstance(mt, MatrixTable):
        return mt
    return mt._localize_entries('__entries', '__cols')


def unlocalize(mt):
    """Undo entry localization on a ``Table``; pass anything else through.

    A ``Table`` is converted with ``_unlocalize_entries`` using the field
    names ``'__entries'`` and ``'__cols'`` and the column key ``['s']``. Any
    other object is returned unchanged.
    """
    if not isinstance(mt, Table):
        return mt
    return mt._unlocalize_entries('__entries', '__cols', ['s'])


@typecheck(mt=oneof(Table, MatrixTable), info_to_keep=sequenceof(str))
def transform_gvcf(mt, info_to_keep=[]) -> Table:
    """Transforms a gvcf into a sparse matrix table

    The input to this should be some result of either :func:`.import_vcf` or
    :func:`.import_gvcfs` with ``array_elements_required=False``.

    There is an assumption that this function will be called on a matrix table
    with one column (or a localized table version of the same).

    Parameters
    ----------
    mt : :obj:`Union[Table, MatrixTable]`
        The gvcf being transformed, if it is a table, then it must be a localized matrix table with
        the entries array named ``__entries``
    info_to_keep : :obj:`List[str]`
Ejemplo n.º 26
0
class tmatrix(object):
    """Type of a matrix table.

    Bundles the struct types of the global, column, row and entry fields
    together with the names of the column-key and row-key fields.
    """

    @staticmethod
    def _from_java(jtt):
        """Build a :class:`tmatrix` from a Java matrix-type object.

        Each component type is obtained by parsing its ``toString()`` form
        with ``dtype``; key collections are converted with
        ``jiterable_to_list``.
        """
        return tmatrix(dtype(jtt.globalType().toString()),
                       dtype(jtt.colType().toString()),
                       jiterable_to_list(jtt.colKey()),
                       dtype(jtt.rowType().toString()),
                       jiterable_to_list(jtt.rowKey()),
                       dtype(jtt.entryType().toString()))

    @staticmethod
    def _from_json(json):
        """Build a :class:`tmatrix` from a dict with the keys ``'global'``,
        ``'col'``, ``'col_key'``, ``'row'``, ``'row_key'`` and ``'entry'``.
        """
        return tmatrix(dtype(json['global']), dtype(json['col']),
                       json['col_key'], dtype(json['row']), json['row_key'],
                       dtype(json['entry']))

    @typecheck_method(global_type=tstruct,
                      col_type=tstruct,
                      col_key=sequenceof(str),
                      row_type=tstruct,
                      row_key=sequenceof(str),
                      entry_type=tstruct)
    def __init__(self, global_type, col_type, col_key, row_type, row_key,
                 entry_type):
        """Store the component struct types and key field names."""
        self.global_type = global_type
        self.col_type = col_type
        self.col_key = col_key
        self.row_type = row_type
        self.row_key = row_key
        self.entry_type = entry_type

    def __eq__(self, other):
        """Two matrix types are equal when all six components are equal."""
        return (isinstance(other, tmatrix)
                and self.global_type == other.global_type
                and self.col_type == other.col_type
                and self.col_key == other.col_key
                and self.row_type == other.row_type
                and self.row_key == other.row_key
                and self.entry_type == other.entry_type)

    def __hash__(self):
        """Hash derived from the ``str()`` form of this type."""
        return 43 + hash(str(self))

    def __repr__(self):
        return f'tmatrix(global_type={self.global_type!r}, col_type={self.col_type!r}, col_key={self.col_key!r}, row_type={self.row_type!r}, row_key={self.row_key!r}, entry_type={self.entry_type!r})'

    def _row_key_str(self):
        """Comma-separated, escaped row-key field names (no brackets)."""
        return ', '.join([escape_parsable(k) for k in self.row_key])

    def _col_key_str(self):
        """Comma-separated, escaped column-key field names (no brackets)."""
        return ', '.join([escape_parsable(k) for k in self.col_key])

    def __str__(self):
        """Compact single-line description of this type.

        Both key lists are wrapped in brackets, matching :meth:`pretty`.
        (``col_key`` used to be printed without brackets while ``row_key``
        had them.)
        """
        return (f'matrix {{global: {self.global_type}, col: {self.col_type}, '
                f'col_key: [{self._col_key_str()}], row: {self.row_type}, '
                f'row_key: [{self._row_key_str()}], entry: {self.entry_type}}}')

    def pretty(self, indent=0, increment=4):
        """Return a multi-line, indented description of this type.

        Parameters
        ----------
        indent : int
            Number of spaces placed before the opening and closing lines.
        increment : int
            Additional spaces used for the field lines inside the braces.

        Returns
        -------
        str
        """
        b = []
        b.append(' ' * indent)
        b.append('matrix {\n')
        indent += increment

        b.append(' ' * indent)
        b.append('global: ')
        self.global_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append('row: ')
        self.row_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append(f'row_key: [{self._row_key_str()}],\n')

        b.append(' ' * indent)
        b.append('col: ')
        self.col_type._pretty(b, indent, increment)
        b.append(',\n')

        b.append(' ' * indent)
        b.append(f'col_key: [{self._col_key_str()}],\n')

        b.append(' ' * indent)
        b.append('entry: ')
        self.entry_type._pretty(b, indent, increment)
        b.append('\n')

        indent -= increment
        b.append(' ' * indent)
        b.append('}')

        return ''.join(b)

    @property
    def col_key_type(self):
        """Column type restricted to the column-key fields."""
        return self.col_type._select_fields(self.col_key)

    @property
    def col_value_type(self):
        """Column type with the column-key fields dropped."""
        return self.col_type._drop_fields(set(self.col_key))

    @property
    def row_key_type(self):
        """Row type restricted to the row-key fields."""
        return self.row_type._select_fields(self.row_key)

    @property
    def row_value_type(self):
        """Row type with the row-key fields dropped."""
        return self.row_type._drop_fields(set(self.row_key))

    def _rename(self, global_map, col_map, row_map, entry_map):
        """Return a new :class:`tmatrix` with fields renamed.

        Each ``*_map`` is passed to the matching struct type's ``_rename``.
        Key names are looked up in ``col_map`` / ``row_map`` and kept
        unchanged when they have no entry there.
        """
        return tmatrix(self.global_type._rename(global_map),
                       self.col_type._rename(col_map),
                       [col_map.get(k, k) for k in self.col_key],
                       self.row_type._rename(row_map),
                       [row_map.get(k, k) for k in self.row_key],
                       self.entry_type._rename(entry_map))

    def global_env(self, default_value=None):
        """Environment for the global context.

        Maps ``'global'`` to the global type, or to ``default_value`` when
        one is given.
        """
        if default_value is None:
            return {'global': self.global_type}
        else:
            return {'global': default_value}

    def row_env(self, default_value=None):
        """Environment for the row context.

        Maps ``'global'`` and ``'va'`` to the global and row types, or both
        to ``default_value`` when one is given.
        """
        if default_value is None:
            return {'global': self.global_type, 'va': self.row_type}
        else:
            return {'global': default_value, 'va': default_value}

    def col_env(self, default_value=None):
        """Environment for the column context.

        Maps ``'global'`` and ``'sa'`` to the global and column types, or
        both to ``default_value`` when one is given.
        """
        if default_value is None:
            return {'global': self.global_type, 'sa': self.col_type}
        else:
            return {'global': default_value, 'sa': default_value}

    def entry_env(self, default_value=None):
        """Environment for the entry context.

        Maps ``'global'``, ``'va'``, ``'sa'`` and ``'g'`` to the global, row,
        column and entry types, or all of them to ``default_value`` when one
        is given.
        """
        if default_value is None:
            return {
                'global': self.global_type,
                'va': self.row_type,
                'sa': self.col_type,
                'g': self.entry_type
            }
        else:
            return {
                'global': default_value,
                'va': default_value,
                'sa': default_value,
                'g': default_value
            }
Ejemplo n.º 27
0
class ReferenceGenome(object):
    """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__.

    Examples
    --------

    >>> contigs = ["1", "X", "Y", "MT"]
    >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569}
    >>> par = [("X", 60001, 2699521)]
    >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par)

    Notes
    -----
    Hail comes with predefined reference genomes (case sensitive!):

     - GRCh37, Genome Reference Consortium Human Build 37
     - GRCh38, Genome Reference Consortium Human Build 38
     - GRCm38, Genome Reference Consortium Mouse Build 38
     - CanFam3, Canis lupus familiaris (dog)

    You can access these reference genome objects using :func:`~hail.get_reference`:

    >>> rg = hl.get_reference('GRCh37')
    >>> rg = hl.get_reference('GRCh38')
    >>> rg = hl.get_reference('GRCm38')
    >>> rg = hl.get_reference('CanFam3')

    Note that constructing a new reference genome, either by using the class
    constructor or by using `read` will add the reference genome to the list of
    known references; it is possible to access the reference genome using
    :func:`~hail.get_reference` anytime afterwards.

    Note
    ----
    Reference genome names must be unique. It is not possible to overwrite the
    built-in reference genomes.

    Parameters
    ----------
    name : :class:`str`
        Name of reference. Must be unique and NOT one of Hail's
        predefined references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``,
        ``'CanFam3'`` and ``'default'``.
    contigs : :obj:`list` of :class:`str`
        Contig names.
    lengths : :obj:`dict` of :class:`str` to :obj:`int`
        Dict of contig names to contig lengths.
    x_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)
    """

    # Registry of every constructed reference genome, keyed by name.
    _references = {}

    @classmethod
    def _from_config(cls, config, _builtin=False):
        """Construct a reference genome from a config dict.

        ``config`` uses the same layout as the JSON file described in
        :meth:`read`. ``_builtin`` is forwarded to the constructor, where it
        suppresses registration with the backend.
        """
        def par_tuple(p):
            # A pseudoautosomal interval must start and end on one contig.
            assert p['start']['contig'] == p['end']['contig']
            return (p['start']['contig'], p['start']['position'],
                    p['end']['position'])

        contigs = config['contigs']
        return ReferenceGenome(config['name'], [c['name'] for c in contigs],
                               {c['name']: c['length']
                                for c in contigs}, config['xContigs'],
                               config['yContigs'], config['mtContigs'],
                               [par_tuple(p) for p in config['par']], _builtin)

    @typecheck_method(name=str,
                      contigs=sequenceof(str),
                      lengths=dictof(str, int),
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)),
                      _builtin=bool)
    def __init__(self,
                 name,
                 contigs,
                 lengths,
                 x_contigs=[],
                 y_contigs=[],
                 mt_contigs=[],
                 par=[],
                 _builtin=False):
        super(ReferenceGenome, self).__init__()

        # Accept either a single contig name or a list of names.
        contigs = wrap_to_list(contigs)
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        # NOTE(review): the config's contig order follows ``lengths``
        # iteration order, while ``self._contigs`` keeps the order of the
        # ``contigs`` argument -- confirm the two are expected to agree.
        self._config = {
            'name':
            name,
            'contigs': [{
                'name': c,
                'length': l
            } for c, l in lengths.items()],
            'xContigs':
            x_contigs,
            'yContigs':
            y_contigs,
            'mtContigs':
            mt_contigs,
            'par': [{
                'start': {
                    'contig': c,
                    'position': s
                },
                'end': {
                    'contig': c,
                    'position': e
                }
            } for (c, s, e) in par]
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [
            hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
            for (c, s, e) in par
        ]
        # Computed lazily by _contig_global_position.
        self._global_positions = None

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        self._sequence_files = None
        self._liftovers = dict()

    def __str__(self):
        return self._config['name']

    def __repr__(self):
        return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \
               (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple)

    def __eq__(self, other):
        return isinstance(other,
                          ReferenceGenome) and self._config == other._config

    def __hash__(self):
        return hash(self.name)

    @property
    def name(self):
        """Name of reference genome.

        Returns
        -------
        :class:`str`
        """
        return self._config['name']

    @property
    def contigs(self):
        """Contig names.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._contigs

    @property
    def lengths(self):
        """Dict of contig name to contig length.

        Returns
        -------
        :obj:`dict` of :class:`str` to :obj:`int`
        """
        return self._lengths

    @property
    def x_contigs(self):
        """X contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['xContigs']

    @property
    def y_contigs(self):
        """Y contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['yContigs']

    @property
    def mt_contigs(self):
        """Mitochondrial contigs.

        Returns
        -------
        :obj:`list` of :class:`str`
        """
        return self._config['mtContigs']

    @property
    def par(self):
        """Pseudoautosomal regions.

        Returns
        -------
        :obj:`list` of :class:`.Interval`
        """

        return self._par

    @typecheck_method(contig=str)
    def contig_length(self, contig):
        """Contig length.

        Parameters
        ----------
        contig : :class:`str`
            Contig name.

        Returns
        -------
        :obj:`int`
            Length of contig.

        Raises
        ------
        KeyError
            If `contig` is not in this reference genome.
        """
        if contig in self.lengths:
            return self.lengths[contig]
        else:
            raise KeyError(
                "Contig `{}' is not in reference genome.".format(contig))

    @typecheck_method(contig=str)
    def _contig_global_position(self, contig):
        """Return the sum of the lengths of all contigs preceding `contig`.

        The offsets for every contig are computed on first use, in the order
        of :attr:`contigs`, and cached.
        """
        if self._global_positions is None:
            gp = {}
            lengths = self._lengths
            x = 0
            for c in self.contigs:
                gp[c] = x
                x += lengths[c]
            self._global_positions = gp
        return self._global_positions[contig]

    @classmethod
    @typecheck_method(path=str)
    def read(cls, path):
        """Load reference genome from a JSON file.

        Notes
        -----

        The JSON file must have the following format:

        .. code-block:: text

            {"name": "my_reference_genome",
             "contigs": [{"name": "1", "length": 10000000},
                         {"name": "2", "length": 20000000},
                         {"name": "X", "length": 19856300},
                         {"name": "Y", "length": 78140000},
                         {"name": "MT", "length": 532}],
             "xContigs": ["X"],
             "yContigs": ["Y"],
             "mtContigs": ["MT"],
             "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}},
                     {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}]
            }


        `name` must be unique and not overlap with Hail's pre-instantiated
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'``, and
        ``'default'``.
        The contig names in `xContigs`, `yContigs`, and `mtContigs` must be
        present in `contigs`. The intervals listed in `par` must have contigs in
        either `xContigs` or `yContigs` and must have positions between 0 and
        the contig length given in `contigs`.

        Parameters
        ----------
        path : :class:`str`
            Path to JSON file.

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        with hl.hadoop_open(path) as f:
            return ReferenceGenome._from_config(json.load(f))

    @typecheck_method(output=str)
    def write(self, output):
        """Write this reference genome to a file in JSON format.

        Examples
        --------

        >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200})
        >>> my_rg.write(f"output/new_reference.json")

        Notes
        -----

        Use :meth:`~hail.genetics.ReferenceGenome.read` to reimport the exported
        reference genome in a new HailContext session.

        Parameters
        ----------
        output : :class:`str`
            Path of JSON file to write.
        """
        with hl.utils.hadoop_open(output, 'w') as f:
            json.dump(self._config, f)

    @typecheck_method(fasta_file=str, index_file=nullable(str))
    def add_sequence(self, fasta_file, index_file=None):
        """Load the reference sequence from a FASTA file.

        Examples
        --------
        Access the GRCh37 reference genome using :func:`~hail.get_reference`:

        >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

        Add a sequence file:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
        ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

        Add a sequence file with the default index location:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP


        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :class:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :class:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, replace
            the fasta_file's extension with `fai`.
        """
        if index_file is None:
            # Swap the final extension of the FASTA path for ``.fai``.
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        self._sequence_files = (fasta_file, index_file)

    def has_sequence(self):
        """True if the reference sequence has been loaded.

        Returns
        -------
        :obj:`bool`
        """
        return self._sequence_files is not None

    def remove_sequence(self):
        """Remove the reference sequence."""
        self._sequence_files = None
        Env.backend().remove_sequence(self.name)

    @classmethod
    @typecheck_method(name=str,
                      fasta_file=str,
                      index_file=str,
                      x_contigs=oneof(str, sequenceof(str)),
                      y_contigs=oneof(str, sequenceof(str)),
                      mt_contigs=oneof(str, sequenceof(str)),
                      par=sequenceof(sized_tupleof(str, int, int)))
    def from_fasta_file(cls,
                        name,
                        fasta_file,
                        index_file,
                        x_contigs=[],
                        y_contigs=[],
                        mt_contigs=[],
                        par=[]):
        """Create reference genome from a FASTA file.

        Parameters
        ----------
        name: :class:`str`
            Name for new reference genome.
        fasta_file : :class:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :class:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :class:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        # A single contig name is allowed by the signature; normalize to
        # lists, as the constructor does, so the backend always receives
        # a list.
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        par_strings = [
            "{}:{}-{}".format(contig, start, end)
            for (contig, start, end) in par
        ]
        Env.backend().from_fasta_file(name, fasta_file, index_file, x_contigs,
                                      y_contigs, mt_contigs, par_strings)

        # The backend call above already created the reference, so build the
        # Python object with _builtin=True to skip a second add_reference.
        rg = ReferenceGenome._from_config(Env.backend().get_reference(name),
                                          _builtin=True)
        rg._sequence_files = (fasta_file, index_file)
        return rg

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def has_liftover(self, dest_reference_genome):
        """``True`` if a liftover chain file is available from this reference
        genome to the destination reference.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`

        Returns
        -------
        :obj:`bool`
        """
        return dest_reference_genome.name in self._liftovers

    @typecheck_method(dest_reference_genome=reference_genome_type)
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Does nothing if no liftover to that reference has been registered.

        Parameters
        ----------
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
        """
        if dest_reference_genome.name in self._liftovers:
            del self._liftovers[dest_reference_genome.name]
            Env.backend().remove_liftover(self.name,
                                          dest_reference_genome.name)

    @typecheck_method(chain_file=str,
                      dest_reference_genome=reference_genome_type)
    def add_liftover(self, chain_file, dest_reference_genome):
        """Register a chain file for liftover.

        Examples
        --------
        Access GRCh37 and GRCh38 using :func:`~hail.get_reference`:

        >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP
        >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP

        Add a chain file from 37 to 38:

        >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP

        Notes
        -----
        This method can only be run once per destination reference genome. Use
        :meth:`~has_liftover` to test whether a chain file has been registered.

        The chain file format is described
        `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__.

        Chain files are hosted on google cloud for some of Hail's built-in
        references:

        **GRCh37 to GRCh38**
        gs://hail-common/references/grch37_to_grch38.over.chain.gz

        **GRCh38 to GRCh37**
        gs://hail-common/references/grch38_to_grch37.over.chain.gz

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        chain_file : :class:`str`
            Path to chain file. Can be compressed (GZIP) or uncompressed.
        dest_reference_genome : :class:`str` or :class:`.ReferenceGenome`
            Reference genome to convert to.

        Raises
        ------
        KeyError
            If a liftover to `dest_reference_genome` is already registered.
        """
        dest_name = dest_reference_genome.name

        # Reject duplicates before touching the backend, so a failed call
        # leaves both the backend and this object unchanged.
        if dest_name in self._liftovers:
            raise KeyError(
                f"Liftover already exists from {self.name} to {dest_name}."
            )

        Env.backend().add_liftover(self.name, chain_file, dest_name)
        self._liftovers[dest_name] = chain_file
Ejemplo n.º 28
0
Archivo: nd.py Proyecto: saponas/hail
from functools import reduce

import hail as hl
from hail.expr.functions import _ndarray
from hail.expr.functions import array as aarray
from hail.expr.types import HailType, tfloat64, ttuple, tndarray
from hail.typecheck import typecheck, nullable, oneof, tupleof, sequenceof
from hail.expr.expressions import (expr_int32, expr_int64, expr_tuple,
                                   expr_any, expr_array, expr_ndarray,
                                   expr_numeric, Int64Expression, cast_expr,
                                   construct_expr)
from hail.expr.expressions.typed_expressions import NDArrayNumericExpression
from hail.ir import NDArrayQR, NDArrayInv, NDArrayConcat, NDArraySVD, Apply

# Typechecker for a collection of ndarrays: either a Python sequence of ndarray
# expressions or a single array expression whose elements are ndarrays.
tsequenceof_nd = oneof(sequenceof(expr_ndarray()), expr_array(expr_ndarray()))
# Typechecker for a shape argument: a single int64 expression, a Python tuple
# of int64 expressions, or a tuple expression.
shape_type = oneof(expr_int64, tupleof(expr_int64), expr_tuple())


def array(input_array, dtype=None):
    """Construct an :class:`.NDArrayExpression`

    Examples
    --------

    >>> hl.eval(hl.nd.array([1, 2, 3, 4]))
    array([1, 2, 3, 4], dtype=int32)

    >>> hl.eval(hl.nd.array([[1, 2, 3], [4, 5, 6]]))
    array([[1, 2, 3],
       [4, 5, 6]], dtype=int32)
Ejemplo n.º 29
0
        _col_val=hl.array([hl.array([field, ht[field]]) for field in fields]))
    ht = ht.drop(*fields)
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0], value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)


@typecheck(ht=Table,
           field=str,
           value=str,
           key=nullable(oneof(str, sequenceof(str))))
def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in R's
    `tidyr` package. This is a way to turn "long" format data into "wide"
    format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to the
    ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row field
    for each unique value of ``field``, where the row field values are given by the
    corresponding ``value`` in the original ``ht``.
Ejemplo n.º 30
0
    ht = ht.explode(ht['_col_val'])
    ht = ht.annotate(**{key: ht['_col_val'][0],
                        value: ht['_col_val'][1]})
    ht = ht.drop('_col_val')

    ht_tmp = new_temp_file()
    ht.write(ht_tmp)

    return hl.read_table(ht_tmp)


@typecheck(ht=Table,
           field=str,
           value=str,
           key=nullable(oneof(str,
                              sequenceof(str))))
def spread(ht, field, value, key=None) -> Table:
    """Spread a key-value pair of fields across multiple fields.

    :func:`.spread` mimics the functionality of the `spread()` function in R's
    `tidyr` package. This is a way to turn "long" format data into "wide"
    format data.

    Given a ``field``, :func:`.spread` will create a new table by grouping
    ``ht`` by its row key and, optionally, any additional fields passed to the
    ``key`` argument.

    After collapsing ``ht`` by these keys, :func:`.spread` creates a new row field
    for each unique value of ``field``, where the row field values are given by the
    corresponding ``value`` in the original ``ht``.