Example #1
def hadoop_copy(src, dest):
    """Copy a file through the Hadoop filesystem API.
    Supports distributed file systems like hdfs, gs, and s3.

    Examples
    --------
    
    Copy a file from Google Cloud Storage to a local file:

    >>> hadoop_copy('gs://hail-common/LCR.interval_list',
    ...             'file:///mnt/data/LCR.interval_list') # doctest: +SKIP

    Notes
    -----

    Try :func:`.hadoop_open` first; it's simpler, but not great
    for large data. For example:

    >>> with hadoop_open('gs://my_bucket/results.csv', 'w') as f:  # doctest: +SKIP
    ...     pandas_df.to_csv(f)

    The provided source and destination file paths must be URIs
    (uniform resource identifiers).

    Parameters
    ----------
    src: :obj:`str`
        Source file URI.
    dest: :obj:`str`
        Destination file URI.
    """
    Env.jutils().copyFile(src, dest, Env.hc()._jhc)
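A quick usage sketch (hypothetical paths; assumes a running Hail session and that this function is exported as ``hl.hadoop_copy``):

>>> import hail as hl  # doctest: +SKIP
>>> hl.hadoop_copy('file:///tmp/example.txt',
...                'gs://my-bucket/example.txt')  # doctest: +SKIP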
Example #2
def copy_log(path: str) -> None:
    """Attempt to copy the session log to a hadoop-API-compatible location.

    Examples
    --------
    Specify a manual path:

    >>> hl.copy_log('gs://my-bucket/analysis-10-jan19.log')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/analysis-10-jan19.log'...

    Copy to a directory:

    >>> hl.copy_log('gs://my-bucket/')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/hail-20180924-2018-devel-46e5fad57524.log'...

    Notes
    -----
    Since Hail cannot currently log directly to distributed file systems, this
    function is provided as a utility for offloading logs from ephemeral nodes.

    If `path` is a directory, then the log file will be copied using its
    base name to the directory (e.g. ``/home/hail.log`` would be copied as
    ``gs://my-bucket/hail.log`` if `path` is ``gs://my-bucket``).

    Parameters
    ----------
    path: :obj:`str`
    """
    Env.fs().copy_log(path)
Example #3
def hadoop_ls(path: str) -> List[Dict]:
    """Returns information about files at `path`.

    Notes
    -----
    Raises an error if `path` does not exist.

    If `path` is a file, returns a list with one element. If `path` is a
    directory, returns an element for each file contained in `path` (does not
    search recursively).

    Each dict element of the result list contains the following data:

    - is_dir (:obj:`bool`) -- Path is a directory.
    - size_bytes (:obj:`int`) -- Size in bytes.
    - size (:obj:`str`) -- Size as a readable string.
    - modification_time (:obj:`str`) -- Time of last file modification.
    - owner (:obj:`str`) -- Owner.
    - path (:obj:`str`) -- Path.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`List[Dict]`
    """
    r = Env.jutils().ls(path, Env.hc()._jhc)
    return json.loads(r)
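For example, a sketch that lists a bucket prefix and prints the size of each file (hypothetical path; assumes a running Hail session):

>>> for entry in hl.hadoop_ls('gs://my-bucket/data/'):  # doctest: +SKIP
...     if not entry['is_dir']:
...         print(entry['path'], entry['size'])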
Example #4
    def _convert_to_j(self, annotation):
        if annotation is not None:
            return scala_object(Env.hail().annotations, 'Annotation').fromSeq(
                Env.jutils().arrayListToISeq(
                    [t._convert_to_j(annotation.get(f)) for f, t in self.items()]))
        else:
            return None
Example #5
def get_reference(name) -> 'hail.ReferenceGenome':
    """Returns the reference genome corresponding to `name`.

    Notes
    -----

    Hail's built-in references are ``'GRCh37'``, ``'GRCh38'``, and ``'GRCm38'``.
    The contig names and lengths come from the GATK resource bundle:
    `human_g1k_v37.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`__
    and `Homo_sapiens_assembly38.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`__.

    If ``name='default'``, the value of :func:`.default_reference` is returned.

    Parameters
    ----------
    name : :obj:`str`
        Name of a previously loaded reference genome or one of Hail's built-in
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and ``'default'``.

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    Env.hc()  # ensure a Hail context exists so built-in references are registered
    if name == 'default':
        return default_reference()
    else:
        return ReferenceGenome._references[name]
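A minimal usage sketch (assumes a running Hail session; the contig names shown are from the built-in GRCh37 definition):

>>> rg = hl.get_reference('GRCh37')  # doctest: +SKIP
>>> rg.contigs[:3]  # doctest: +SKIP
['1', '2', '3']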
Example #6
def hadoop_stat(path: str) -> Dict:
    """Returns information about the file or directory at a given path.

    Notes
    -----
    Raises an error if `path` does not exist.

    The resulting dictionary contains the following data:

    - is_dir (:obj:`bool`) -- Path is a directory.
    - size_bytes (:obj:`int`) -- Size in bytes.
    - size (:obj:`str`) -- Size as a readable string.
    - modification_time (:obj:`str`) -- Time of last file modification.
    - owner (:obj:`str`) -- Owner.
    - path (:obj:`str`) -- Path.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`Dict`
    """
    return json.loads(Env.jutils().stat(path, Env.hc()._jhc))
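For example, to report the size of a single file (hypothetical path; assumes a running Hail session):

>>> stat = hl.hadoop_stat('gs://my-bucket/data/results.csv')  # doctest: +SKIP
>>> if not stat['is_dir']:
...     print(stat['path'], stat['size'])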
Example #7
    def __del__(self):
        try:
            Env.hc()._jhc.pyRemoveIrVector(self.jid)
        # there is only so much we can do if the attempt to remove the unused IR fails,
        # especially since this will often get called during interpreter shutdown.
        except Exception:
            pass
Example #8
def disable_pipeline_upload():
    """Disable the uploading of pipelines.  By default, pipeline upload is
    disabled.

    """
    
    Env.hc()._jhc.disablePipelineUpload()
Example #9
    def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
        super(ReferenceGenome, self).__init__()
        
        contigs = wrap_to_list(contigs)
        x_contigs = wrap_to_list(x_contigs)
        y_contigs = wrap_to_list(y_contigs)
        mt_contigs = wrap_to_list(mt_contigs)

        self._config = {
            'name': name,
            'contigs': [{'name': c, 'length': l} for c, l in lengths.items()],
            'xContigs': x_contigs,
            'yContigs': y_contigs,
            'mtContigs': mt_contigs,
            'par': [{'start': {'contig': c, 'position': s}, 'end': {'contig': c, 'position': e}} for (c, s, e) in par]
        }

        self._contigs = contigs
        self._lengths = lengths
        self._par_tuple = par
        self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self)) for (c, s, e) in par]

        ReferenceGenome._references[name] = self

        if not _builtin:
            Env.backend().add_reference(self._config)

        hl.ir.register_reference_genome_functions(name)

        self._has_sequence = False
        self._liftovers = set()
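As a sketch of the constructor above, a toy custom reference (all contig names and lengths are made up; ``hl.ReferenceGenome`` is assumed to be this class's public entry point):

>>> toy = hl.ReferenceGenome(name='toy',  # doctest: +SKIP
...                          contigs=['chr1', 'chrX'],
...                          lengths={'chr1': 1000, 'chrX': 500},
...                          x_contigs=['chrX'],
...                          par=[('chrX', 1, 100)])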
Example #10
    def from_fasta_file(cls, name, fasta_file, index_file,
                        x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
        """Create reference genome from a FASTA file.
        
        Parameters
        ----------
        name: :obj:`str`
            Name for new reference genome.
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`str`
            Path to FASTA index file. Must be uncompressed.
        x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as X chromosomes.
        y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as Y chromosomes.
        mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
            Contigs to be treated as mitochondrial DNA.
        par : :obj:`list` of :obj:`tuple` of (str, int, int)
            List of tuples with (contig, start, end)

        Returns
        -------
        :class:`.ReferenceGenome`
        """
        par_strings = ["{}:{}-{}".format(contig, start, end) for (contig, start, end) in par]
        Env.backend().from_fasta_file(name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par_strings)
        
        rg = ReferenceGenome._from_config(Env.backend().get_reference(name), _builtin=True)
        rg._has_sequence = True
        return rg
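A hedged call sketch (hypothetical paths for the FASTA file and its ``.fai`` index):

>>> rg = hl.ReferenceGenome.from_fasta_file(  # doctest: +SKIP
...     'my_genome',
...     'gs://my-bucket/my_genome.fasta.gz',
...     'gs://my-bucket/my_genome.fasta.fai')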
Example #11
def eval_timed(expression):
    """Evaluate a Hail expression, returning the result and the times taken for
    each stage in the evaluation process.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings
    """
    from hail.utils.java import Env

    analyze('eval_timed', expression, Indices(expression._indices.source))

    if expression._indices.source is None:
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression.dtype:
            raise ExpressionException(f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}')
        return Env.backend().execute(expression._ir, True)
    else:
        uid = Env.get_uid()
        ir = expression._indices.source.select_globals(**{uid: expression}).index_globals()[uid]._ir
        return Env.backend().execute(ir, True)
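A minimal usage sketch (assumes this function is exported as ``hl.eval_timed``; the exact keys of the timings dict depend on the backend):

>>> result, timings = hl.eval_timed(hl.literal(5) + 5)  # doctest: +SKIP
>>> result  # doctest: +SKIP
10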
Example #12
    def test_seeding_is_consistent(self):
        hl.set_global_seed(0)
        a = [Env.next_seed() for _ in range(10)]
        hl.set_global_seed(0)
        b = [Env.next_seed() for _ in range(10)]

        self.assertEqual(len(set(a)), 10)
        self.assertEqual(a, b)
Example #13
def blocking_execute(code):
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    typ = hl.dtype(jir.typ().toString())
    result = Env.hail().backend.spark.SparkBackend.executeJSON(jir)
    return {
        'type': str(typ),
        'result': result
    }
Example #14
    def remove_sequence(self):
        """Remove the reference sequence.

        Returns
        -------
        :obj:`bool`
        """
        self._has_sequence = False
        Env.backend().remove_sequence(self.name)
Example #15
    def remove_liftover(self, dest_reference_genome):
        """Remove liftover to `dest_reference_genome`.

        Parameters
        ----------
        dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
        """
        if dest_reference_genome.name in self._liftovers:
            self._liftovers.remove(dest_reference_genome.name)
            Env.backend().remove_liftover(self.name, dest_reference_genome.name)
Example #16
File: fs.py Project: jigold/hail
    def copy_log(self, path: str) -> None:
        log = Env.hc()._log
        try:
            if self.is_dir(path):
                _, tail = os.path.split(log)
                path = os.path.join(path, tail)
            info(f"copying log to {repr(path)}...")
            self.copy(local_path_uri(log), path)
        except Exception as e:
            sys.stderr.write(f'Could not copy log: encountered error:\n  {e}\n')
Example #17
def upload_log():
    """Uploads the Hail log to the Hail team.

    Warning
    -------
    Shares potentially sensitive data with the Hail team.

    """

    Env.hc()._jhc.uploadLog()
Example #18
    def stop(self):
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None
        Env._seed_generator = None
Example #19
def set_global_seed(seed):
    """Sets Hail's global seed to `seed`.

    Parameters
    ----------
    seed : :obj:`int`
        Integer used to seed Hail's random number generator
    """

    Env.set_seed(seed)
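For example, re-seeding should make random expressions reproducible across runs (a sketch; assumes ``hl.rand_norm`` draws from the seeded generator):

>>> hl.set_global_seed(0)  # doctest: +SKIP
>>> a = hl.eval(hl.rand_norm())  # doctest: +SKIP
>>> hl.set_global_seed(0)  # doctest: +SKIP
>>> b = hl.eval(hl.rand_norm())  # doctest: +SKIP
>>> a == b  # doctest: +SKIP
True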
Example #20
def _set_flags(**flags):
    available = set(Env.hc()._jhc.flags().available())
    invalid = []
    for flag, value in flags.items():
        if flag in available:
            Env.hc()._jhc.flags().set(flag, value)
        else:
            invalid.append(flag)
    if len(invalid) != 0:
        raise FatalError("Flags {} not valid. Valid flags: \n    {}"
                         .format(', '.join(invalid), '\n    '.join(available)))
Example #21
def hadoop_exists(path: str) -> bool:
    """Returns ``True`` if `path` exists.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    return Env.jutils().exists(path, Env.hc()._jhc)
Example #22
def hadoop_is_file(path: str) -> bool:
    """Returns ``True`` if `path` both exists and is a file.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    return Env.jutils().isFile(path, Env.hc()._jhc)
Example #23
def hadoop_is_dir(path: str) -> bool:
    """Returns ``True`` if `path` both exists and is a directory.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    return Env.jutils().isDir(path, Env.hc()._jhc)
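These three predicates compose naturally, e.g. to guard a read (hypothetical path; uses :func:`.hadoop_open` from Example #1's notes):

>>> path = 'gs://my-bucket/data/results.csv'
>>> if hl.hadoop_exists(path) and hl.hadoop_is_file(path):  # doctest: +SKIP
...     with hl.hadoop_open(path) as f:
...         print(f.readline())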
Example #24
    def stop(self):
        Env.hail().HailContext.clear()
        self.sc.stop()
        self.sc = None
        Env._jvm = None
        Env._gateway = None
        Env._hc = None
        uninstall_exception_handler()
        Env._dummy_table = None
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}
Example #25
    def add_sequence(self, fasta_file, index_file=None):
        """Load the reference sequence from a FASTA file.

        Examples
        --------
        Access the GRCh37 reference genome using :func:`.get_reference`:

        >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

        Add a sequence file:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
        ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

        Add a sequence file with the default index location:

        >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP


        Notes
        -----
        This method can only be run once per reference genome. Use
        :meth:`~has_sequence` to test whether a sequence is loaded.

        FASTA and index files are hosted on Google Cloud Storage for some of Hail's built-in
        references:

        **GRCh37**

        - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
        - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

        **GRCh38**

        - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
        - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

        Public download links are available
        `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

        Parameters
        ----------
        fasta_file : :obj:`str`
            Path to FASTA file. Can be compressed (GZIP) or uncompressed.
        index_file : :obj:`None` or :obj:`str`
            Path to FASTA index file. Must be uncompressed. If `None`, replace
            the fasta_file's extension with `fai`.
        """
        if index_file is None:
            index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file)
        Env.backend().add_sequence(self.name, fasta_file, index_file)
        self._has_sequence = True
Example #26
    def test_parses(self):
        env = {'c': hl.tbool,
               'a': hl.tarray(hl.tint32),
               'aa': hl.tarray(hl.tarray(hl.tint32)),
               'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
               'v': hl.tint32,
               's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
               't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
               'call': hl.tcall,
               'x': hl.tint32}
        env = {name: t._jtype for name, t in env.items()}
        for x in self.value_irs():
            Env.hail().expr.ir.IRParser.parse_value_ir(str(x), env, {})
Example #27
    def test_matrix_ir_parses(self):
        hl.index_bgen(resource('example.8bits.bgen'),
                      reference_genome=hail.get_reference('GRCh37'),
                      contig_recoding={'01': '1'})

        collect_sig = ir.AggSignature('Collect', [], None, [hl.tint32])
        collect = ir.MakeStruct([('x', ir.ApplyAggOp([], None, [ir.I32(0)], collect_sig))])

        matrix_read = ir.MatrixRead(
            resource('backward_compatability/1.0.0/matrix_table/0.hmt'), False, False)
        table_read = ir.TableRead(resource('backward_compatability/1.0.0/table/0.ht'), False, None)

        matrix_irs = [
            ir.MatrixRepartition(ir.MatrixRange(5, 5, 1), 100, True),
            ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1), ir.MatrixRange(5, 5, 1)),
            ir.MatrixDistinctByRow(ir.MatrixRange(5, 5, 1)),
            ir.CastTableToMatrix(
                ir.CastMatrixToTable(matrix_read, '__entries', '__cols'),
                '__entries',
                '__cols',
                []),
            ir.MatrixAggregateRowsByKey(matrix_read, collect, collect),
            ir.MatrixAggregateColsByKey(matrix_read, collect, collect),
            ir.MatrixRange(1, 1, 10),
            ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None, False, ['GT'],
                               hail.get_reference('GRCh37'), {}, True, False),
            ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'], resource('example.sample'), {}, 10, 1,
                                ['varid'], None),
            ir.MatrixFilterRows(matrix_read, ir.FalseIR()),
            ir.MatrixFilterCols(matrix_read, ir.FalseIR()),
            ir.MatrixFilterEntries(matrix_read, ir.FalseIR()),
            ir.MatrixChooseCols(matrix_read, [1, 0]),
            ir.MatrixMapCols(matrix_read, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
            ir.MatrixKeyRowsBy(matrix_read, ['row_i64'], False),
            ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_read, []), ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapEntries(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
            ir.MatrixMapGlobals(matrix_read, ir.MakeStruct([('x', ir.I64(20))])),
            ir.TableToMatrixTable(table_read, ['f32', 'i64'], ['m', 'astruct'], ['aset'], ['mset'], 100),
            ir.MatrixCollectColsByKey(matrix_read),
            ir.MatrixExplodeRows(matrix_read, ['row_aset']),
            ir.MatrixExplodeCols(matrix_read, ['col_aset']),
            ir.MatrixAnnotateRowsTable(matrix_read, table_read, '__foo', None),
            ir.MatrixAnnotateColsTable(matrix_read, table_read, '__foo'),
        ]

        for x in matrix_irs:
            try:
                Env.hail().expr.ir.IRParser.parse_matrix_ir(str(x))
            except Exception as e:
                raise ValueError(str(x)) from e
Example #28
def set_global_seed(seed):
    """Sets Hail's global seed to `seed`.

    Parameters
    ----------
    seed : :obj:`int`
        Integer used to seed Hail's random number generator
    """

    Env.set_seed(seed)
Example #29
def enable_pipeline_upload():
    """Upload all subsequent pipelines to the Hail team in order to
    help improve Hail.
    
    Pipeline upload can also be enabled by setting the environment
    variable `HAIL_ENABLE_PIPELINE_UPLOAD` or the Spark configuration
    property `hail.enablePipelineUpload` to `true`.

    Warning
    -------
    Shares potentially sensitive data with the Hail team.

    """
    
    Env.hc()._jhc.enablePipelineUpload()
Example #30
    def localize_entries_with_none_entries_changes_no_rows(self):
        mt = hl.utils.range_matrix_table(10, 10)
        mt = mt.select_entries(x=mt.row_idx * mt.col_idx)
        localized = mt.localize_entries(entries_array_field_name=None,
                                        columns_array_field_name=Env.get_uid())
        rows_table = mt.rows()
        assert rows_table.collect() == localized.collect()
Example #31
File: ldscsim.py Project: zscu/hail
def multitrait_inf(mt, h2=None, rg=None, cov_matrix=None, seed=None):
    """Generates correlated betas for multi-trait infinitesimal simulations for 
    any number of phenotypes.
    
    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable for simulated phenotype.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Desired SNP-based heritability (:math:`h^2`) of simulated traits. 
        If `h2` is ``None``, :math:`h^2` is based on diagonal of `cov_matrix`.
    rg : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Desired genetic correlation (:math:`r_g`) between simulated traits. 
        If simulating more than two correlated traits, `rg` should be a list 
        of :math:`rg` values corresponding to the upper right triangle of the 
        covariance matrix. If `rg` is ``None`` and `cov_matrix` is ``None``, :math:`r_g` 
        is assumed to be 0 between traits. If `rg` and `cov_matrix` are both
        not None, :math:`r_g` values from `cov_matrix` take precedence.
    cov_matrix : :class:`numpy.ndarray`, optional
        Covariance matrix for traits, **unscaled by :math:`M`**, the number of SNPs. 
        Overrides `h2` and `rg` even when `h2` or `rg` are not ``None``.
    seed : :obj:`int`, optional
        Seed for random number generator. If `seed` is ``None``, `seed` is set randomly.
    
    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated SNP effects as a row field of arrays.
    """
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5)
    )  # "temporary id" -- random string to identify temporary intermediate fields generated by this method
    h2 = [h2] if type(h2) is not list else h2
    rg = [rg] if type(rg) is not list else rg
    assert h2 != [None] or cov_matrix is not None, 'h2 and cov_matrix cannot both be None'
    if h2 != [None]:
        assert all(0 <= x <= 1 for x in h2), 'h2 values must be between 0 and 1'
    seed = seed if seed is not None else int(str(Env.next_seed())[:8])
    M = mt.count_rows()
    if cov_matrix is not None:
        n_phens = cov_matrix.shape[0]
    else:
        n_phens = len(h2)
        if rg == [None]:
            print(f'Assuming rg=0 for all {n_phens} traits')
            rg = [0] * int((n_phens**2 - n_phens) / 2)
        assert (all(-1 <= x <= 1
                    for x in rg)), 'rg values must be between -1 and 1'
        cov_matrix = create_cov_matrix(h2, rg)
    cov_matrix = (1 / M) * cov_matrix
    # seed the random state for replicability
    randstate = np.random.RandomState(int(seed))
    betas = randstate.multivariate_normal(mean=np.zeros(n_phens),
                                          cov=cov_matrix,
                                          size=M)
    df = pd.DataFrame([0] * M, columns=['beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])
    mt = mt.add_row_index(name='row_idx' + tid)
    mt = mt.annotate_rows(beta=tb[mt['row_idx' + tid]]['beta'])
    mt = _clean_fields(mt, tid)
    return mt
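A hedged usage sketch (``hl.balding_nichols_model`` only supplies a toy genotype matrix here; the ``h2``/``rg`` values are illustrative):

>>> mt = hl.balding_nichols_model(n_populations=1, n_samples=100, n_variants=1000)  # doctest: +SKIP
>>> mt = multitrait_inf(mt, h2=[0.3, 0.4], rg=[0.1], seed=42)  # doctest: +SKIP
>>> mt.beta.show()  # doctest: +SKIP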
Example #32
def vep(dataset: Union[Table, MatrixTable],
        config,
        block_size=1000,
        name='vep',
        csq=False):
    """Annotate variants with VEP.

    .. include:: ../_templates/req_tvariant.rst

    :func:`.vep` runs `Variant Effect Predictor
    <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ on the
    current dataset and adds the result as a row field.

    Examples
    --------

    Add VEP annotations to the dataset:

    >>> result = hl.vep(dataset, "data/vep-configuration.json") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.vep` needs a configuration file to tell it how to run VEP.
    The format of the configuration file is JSON, and :func:`.vep`
    expects a JSON object with three fields:

    - `command` (array of string) -- The VEP command line to run.  The string literal `__OUTPUT_FORMAT_FLAG__` is replaced with `--json` or `--vcf` depending on `csq`.
    - `env` (object) -- A map of environment variables to values to add to the environment when invoking the command.  The value of each object member must be a string.
    - `vep_json_schema` (string) -- The type of the VEP JSON schema (as produced by VEP when invoked with the `--json` option).  Note: this is the old-style 'parseable' Hail type syntax.  This will change.

    Here is an example configuration file for invoking VEP release 85
    installed in `/vep` with the Loftee plugin:

    .. code-block:: text

        {
        	"command": [
        		"/vep",
        		"--format", "vcf",
        		"__OUTPUT_FORMAT_FLAG__",
        		"--everything",
        		"--allele_number",
        		"--no_stats",
        		"--cache", "--offline",
        		"--minimal",
        		"--assembly", "GRCh37",
        		"--plugin", "LoF,human_ancestor_fa:/root/.vep/loftee_data/human_ancestor.fa.gz,filter_position:0.05,min_intron_size:15,conservation_file:/root/.vep/loftee_data/phylocsf_gerp.sql,gerp_file:/root/.vep/loftee_data/GERP_scores.final.sorted.txt.gz",
        		"-o", "STDOUT"
        	],
        	"env": {
        		"PERL5LIB": "/vep_data/loftee"
        	},
        	"vep_json_schema": "Struct{assembly_name:String,allele_string:String,ancestral:String,colocated_variants:Array[Struct{aa_allele:String,aa_maf:Float64,afr_allele:String,afr_maf:Float64,allele_string:String,amr_allele:String,amr_maf:Float64,clin_sig:Array[String],end:Int32,eas_allele:String,eas_maf:Float64,ea_allele:String,ea_maf:Float64,eur_allele:String,eur_maf:Float64,exac_adj_allele:String,exac_adj_maf:Float64,exac_allele:String,exac_afr_allele:String,exac_afr_maf:Float64,exac_amr_allele:String,exac_amr_maf:Float64,exac_eas_allele:String,exac_eas_maf:Float64,exac_fin_allele:String,exac_fin_maf:Float64,exac_maf:Float64,exac_nfe_allele:String,exac_nfe_maf:Float64,exac_oth_allele:String,exac_oth_maf:Float64,exac_sas_allele:String,exac_sas_maf:Float64,id:String,minor_allele:String,minor_allele_freq:Float64,phenotype_or_disease:Int32,pubmed:Array[Int32],sas_allele:String,sas_maf:Float64,somatic:Int32,start:Int32,strand:Int32}],context:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],impact:String,minimised:Int32,variant_allele:String}],most_severe_consequence:String,motif_feature_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],high_inf_pos:String,impact:String,minimised:Int32,motif_feature_id:String,motif_name:String,motif_pos:Int32,motif_score_change:Float64,strand:Int32,variant_allele:String}],regulatory_feature_consequences:Array[Struct{allele_num:Int32,biotype:String,consequence_terms:Array[String],impact:String,minimised:Int32,regulatory_feature_id:String,variant_allele:String}],seq_region_name:String,start:Int32,strand:Int32,transcript_consequences:Array[Struct{allele_num:Int32,amino_acids:String,biotype:String,canonical:Int32,ccds:String,cdna_start:Int32,cdna_end:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String],distance:Int32,domains:Array[Struct{db:String,name:String}],exon:String,gene_id:String,gene_pheno:Int32,gene_symbol:String,gene_symbol_source:String,hgnc_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact:String,intron:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,minimised:Int32,polyphen_prediction:String,polyphen_score:Float64,protein_end:Int32,protein_start:Int32,protein_id:String,sift_prediction:String,sift_score:Float64,strand:Int32,swissprot:String,transcript_id:String,trembl:String,uniparc:String,variant_allele:String}],variant_class:String}"
        }

    **Annotations**

    A new row field is added in the location specified by `name` with type given
    by the `vep_json_schema` (if `csq` is ``False``) or :py:data:`.tstr` (if
    `csq` is ``True``).

    If `csq` is ``True``, then the CSQ header string is also added as a global
    field with name ``name + '_csq_header'``.

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to VEP configuration file.
    block_size : :obj:`int`
        Number of rows to process per VEP invocation.
    name : :obj:`str`
        Name for resulting row field.
    csq : :obj:`bool`
        If ``True``, annotates with the VCF CSQ field as a :py:data:`.tstr`.
        If ``False``, annotates as the `vep_json_schema`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing VEP annotations.

    """
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'vep')
        ht = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'vep')
        ht = dataset.select()

    annotations = Table(Env.hail().methods.VEP.apply(ht._jt, config, csq,
                                                     block_size))

    if csq:
        dataset = dataset.annotate_globals(**{
            name + '_csq_header':
            annotations.index_globals()['vep_csq_header']
        })

    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(
            **{name: annotations[dataset.row_key].vep})
    else:
        return dataset.annotate(**{name: annotations[dataset.key].vep})
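For the `csq` mode described in the notes, a hedged sketch (same hypothetical config path as the example above):

>>> result = hl.vep(dataset, "data/vep-configuration.json", csq=True)  # doctest: +SKIP
>>> hl.eval(result.vep_csq_header)  # doctest: +SKIP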
Example #33
    def __init__(self, path, buffer_size):
        self._jfile = Env.jutils().readFile(path, Env.hc()._jhc, buffer_size)
        super(HadoopReader, self).__init__()
Example #34
def nirvana(dataset: Union[MatrixTable, Table],
            config,
            block_size=500000,
            name='nirvana'):
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana
    <https://github.com/Illumina/Nirvana>`_ on the current dataset and adds a
    new row field in the location specified by `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where each
    line defines a property as a key-value pair of the form ``key = value``.
    :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp2.0/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                hgvsg: str,
                isRecomposedVariant: bool,
                isDecomposedVariant: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    type: str,
                    consequence: set<str>
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                gnomad: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    failedFilter: bool
                },
                gnomadExome: struct {
                    coverage: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    allHc: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    afrHc: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    amrHc: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    easHc: int32,
                    finAf: float64,
                    finAc: int32,
                    finAn: int32,
                    finHc: int32,
                    nfeAf: float64,
                    nfeAc: int32,
                    nfeAn: int32,
                    nfeHc: int32,
                    othAf: float64,
                    othAc: int32,
                    othAn: int32,
                    othHc: int32,
                    asjAf: float64,
                    asjAc: int32,
                    asjAn: int32,
                    asjHc: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32,
                    sasHc: int32,
                    failedFilter: bool
                },
                topmed: struct {
                    failedFilter: bool,
                    allAc: int32,
                    allAn: int32,
                    allAf: float64,
                    allHc: int32
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64,
                    allAc: int32,
                    allAn: int32,
                    afrAf: float64,
                    afrAc: int32,
                    afrAn: int32,
                    amrAf: float64,
                    amrAc: int32,
                    amrAn: int32,
                    easAf: float64,
                    easAc: int32,
                    easAn: int32,
                    eurAf: float64,
                    eurAc: int32,
                    eurAn: int32,
                    sasAf: float64,
                    sasAc: int32,
                    sasAn: int32
                },
                mitomap: array<struct {
                    refAllele: str,
                    altAllele: str,
                    diseases : array<str>,
                    hasHomoplasmy: bool,
                    hasHeteroplasmy: bool,
                    status: str,
                    clinicalSignificance: str,
                    scorePercentile: float64,
                    isAlleleSpecific: bool,
                    chromosome: str,
                    begin: int32,
                    end: int32,
                    variantType: str
                }>,
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                overlappingGenes: array<str>
            }>,
            genes: array<struct {
                name: str,
                omim: array<struct {
                    mimNumber: int32,
                    hgnc: str,
                    description: str,
                    phenotypes: array<struct {
                        mimNumber: int32,
                        phenotype: str,
                        mapping: str,
                        inheritance: array<str>,
                        comments: str
                    }>
                }>,
                exac: struct {
                    pLi: float64,
                    pRec: float64,
                    pNull: float64
                }
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'nirvana')
        ht = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'nirvana')
        ht = dataset.select()

    annotations = Table(Env.hail().methods.Nirvana.apply(
        ht._jt, config, block_size))

    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(
            **{name: annotations[dataset.row_key].nirvana})
    else:
        return dataset.annotate(**{name: annotations[dataset.key].nirvana})
Example #35
def stop():
    """Stop the currently running Hail session."""
    if Env._hc:
        Env.hc().stop()
Example #36
    def parse(self, code, ref_map={}, ir_map={}):
        return Env.hail().expr.ir.IRParser.parse_value_ir(
            code, {k: t._parsable_string()
                   for k, t in ref_map.items()}, ir_map)
Example #37
def loop(f: Callable, typ, *args):
    r"""Define and call a tail-recursive function with given arguments.

    Notes
    -----
    The argument `f` must be a function where the first argument defines the
    recursive call, and the remaining arguments are the arguments to the
    recursive function, e.g. to define the recursive function

    .. math::

        f(x, y) = \begin{cases}
        y & \textrm{if } x \equiv 0 \\
        f(x - 1, y + x) & \textrm{otherwise}
        \end{cases}


    we would write:

    >>> f = lambda recur, x, y: hl.if_else(x == 0, y, recur(x - 1, y + x))

    Full recursion is not supported, and any non-tail-recursive methods will
    throw an error when called.

    This means that the result of any recursive call within the function must
    also be the result of the entire function, without modification. Let's
    consider two different recursive definitions for the triangle function
    :math:`f(x) = 0 + 1 + \dots + x`:

    >>> def triangle1(x):
    ...     if x == 1:
    ...         return x
    ...     return x + triangle1(x - 1)

    >>> def triangle2(x, total):
    ...     if x == 0:
    ...         return total
    ...     return triangle2(x - 1, total + x)

    The first function definition, `triangle1`, will call itself and then add x.
    This is an example of a non-tail recursive function, since `triangle1(9)`
    needs to modify the result of the inner recursive call to `triangle1(8)` by
    adding 9 to the result.

    The second function is tail recursive: the result of `triangle2(9, 0)` is
    the same as the result of the inner recursive call, `triangle2(8, 9)`.

    Example
    -------
    To find the sum of all the numbers from n=1...10:

    >>> triangle_f = lambda f, x, total: hl.if_else(x == 0, total, f(x - 1, total + x))
    >>> x = hl.experimental.loop(triangle_f, hl.tint32, 10, 0)
    >>> hl.eval(x)
    55

    Let's say we want to find the root of a polynomial equation:

    >>> def polynomial(x):
    ...     return 5 * x**3 - 2 * x - 1

    We'll use `Newton's method <https://en.wikipedia.org/wiki/Newton%27s_method>`__
    to find it, so we'll also define the derivative:

    >>> def derivative(x):
    ...     return 15 * x**2 - 2

    and starting at :math:`x_0 = 0`, we'll compute the next step :math:`x_{i+1} = x_i - \frac{f(x_i)}{f'(x_i)}`
    until the difference between :math:`x_{i}` and :math:`x_{i+1}` falls below
    our convergence threshold:

    >>> threshold = 0.005
    >>> def find_root(f, guess, error):
    ...     converged = hl.is_defined(error) & (error < threshold)
    ...     new_guess = guess - (polynomial(guess) / derivative(guess))
    ...     new_error = hl.abs(new_guess - guess)
    ...     return hl.if_else(converged, guess, f(new_guess, new_error))
    >>> x = hl.experimental.loop(find_root, hl.tfloat, 0.0, hl.missing(hl.tfloat))
    >>> hl.eval(x)
    0.8052291984599675

    Warning
    -------
    Using arguments of a type other than numeric types and booleans can cause
    memory issues if you expect the recursive call to happen many times.

    Parameters
    ----------
    f : function ( (marker, \*args) -> :class:`.Expression` )
        Function of one callable marker, denoting where the recursive call (or
        calls) is located, and many `args`, the loop variables.
    typ : :class:`str` or :class:`.HailType`
        Type the loop returns.
    args : variable-length args of :class:`.Expression`
        Expressions to initialize the loop values.

    Returns
    -------
    :class:`.Expression`
        Result of the loop with `args` as initial loop values.
    """

    loop_name = Env.get_uid()

    def contains_recursive_call(non_recursive):
        if isinstance(non_recursive,
                      ir.Recur) and non_recursive.name == loop_name:
            return True
        return any(
            [contains_recursive_call(c) for c in non_recursive.children])

    def check_tail_recursive(loop_ir):
        if isinstance(loop_ir, ir.If):
            if contains_recursive_call(loop_ir.cond):
                raise TypeError(
                    "branch condition can't contain recursive call!")
            check_tail_recursive(loop_ir.cnsq)
            check_tail_recursive(loop_ir.altr)
        elif isinstance(loop_ir, ir.Let):
            if contains_recursive_call(loop_ir.value):
                raise TypeError(
                    "bound value used in other expression can't contain recursive call!"
                )
            check_tail_recursive(loop_ir.body)
        elif isinstance(loop_ir, ir.TailLoop):
            if any(contains_recursive_call(x) for n, x in loop_ir.params):
                raise TypeError(
                    "parameters passed to inner loop can't contain recursive call!"
                )
        elif not isinstance(loop_ir,
                            ir.Recur) and contains_recursive_call(loop_ir):
            raise TypeError(
                "found recursive expression outside of tail position!")

    @typecheck(recur_exprs=expr_any)
    def make_loop(*recur_exprs):
        if len(recur_exprs) != len(args):
            raise TypeError(
                'Recursive call in loop has wrong number of arguments')
        err = None
        for i, (rexpr, expr) in enumerate(zip(recur_exprs, args)):
            if rexpr.dtype != expr.dtype:
                if err is None:
                    err = 'Type error in recursive call,'
                err += f'\n  at argument index {i}, loop arg type: {expr.dtype}, '
                err += f'recur arg type: {rexpr.dtype}'
        if err is not None:
            raise TypeError(err)
        irs = [expr._ir for expr in recur_exprs]
        indices, aggregations = unify_all(*recur_exprs)
        return construct_expr(ir.Recur(loop_name, irs, typ), typ, indices,
                              aggregations)

    uid_irs = []
    loop_vars = []

    for expr in args:
        uid = Env.get_uid()
        loop_vars.append(
            construct_variable(uid, expr._type, expr._indices,
                               expr._aggregations))
        uid_irs.append((uid, expr._ir))

    loop_f = to_expr(f(make_loop, *loop_vars))
    if loop_f.dtype != typ:
        raise TypeError(
            f"requested type {typ} does not match inferred type {loop_f.dtype}"
        )
    check_tail_recursive(loop_f._ir)
    indices, aggregations = unify_all(*args, loop_f)

    return construct_expr(ir.TailLoop(loop_name, loop_f._ir, uid_irs),
                          loop_f.dtype, indices, aggregations)
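As one more sketch in the same tail-recursive style (assuming the same ``hl.experimental.loop`` entry point as the docstring), a Fibonacci accumulator:

>>> fib_f = lambda f, n, a, b: hl.if_else(n == 0, a, f(n - 1, b, a + b))
>>> hl.eval(hl.experimental.loop(fib_f, hl.tint32, 10, 0, 1))  # doctest: +SKIP
55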
Example #38
    def __init__(self, path, exclusive=False):
        self._jfile = Env.jutils().writeFile(path, Env.hc()._jhc, exclusive)
        super(HadoopWriter, self).__init__()
Example #39
    def copy(self, src: str, dest: str):
        Env.jutils().copyFile(src, dest, Env.hc()._jhc)
Example #40
    def parse(self, code, ref_map={}, ir_map={}):
        return Env.hail().expr.ir.IRParser.parse_blockmatrix_ir(
            code, ref_map, ir_map)
Example #41
    def parse(self, code, ref_map={}, ir_map={}):
        return Env.hail().expr.ir.IRParser.parse_table_ir(
            code, ref_map, ir_map)
Example #42
    def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
        r"""Fit and test alternative model for each augmented design matrix in parallel.

        Notes
        -----
        The alternative model is fit using REML constrained to the value of
        :math:`\gamma` set by :meth:`fit`.

        The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
        uses (non-restricted) maximum likelihood:

        .. math::

          \chi^2 = 2 \log\left(\frac{
          \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
          (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I))}
          {\max_{\beta, \sigma^2} \mathrm{N}
          (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I))}
          \right)

        The p-value is given by the tail probability under a chi-squared
        distribution with one degree of freedom.

        The resulting table has the following fields:

        .. list-table::
          :header-rows: 1

          * - Field
            - Type
            - Value
          * - `idx`
            - int64
            - Index of augmented design matrix.
          * - `beta`
            - float64
            - :math:`\beta_\star`
          * - `sigma_sq`
            - float64
            - :math:`\sigma^2`
          * - `chi_sq`
            - float64
            - :math:`\chi^2`
          * - `p_value`
            - float64
            - p-value

        :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number
        of rows (augmentations). These rows are grouped into partitions for
        parallel processing. The number of partitions equals the ceiling of
        ``n_rows / partition_size``, and should be at least the number of cores
        to make use of all cores. By default, there is one partition per row of
        blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
        (rather than approximate) divisor or multiple of the block size reduces
        superfluous shuffling of data.

        The number of columns in each block matrix must be less than :math:`2^{31}`.

        Warning
        -------
        The block matrices must be stored in row-major format, as results
        from :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
        :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method
        will produce an error message.

        Parameters
        ----------
        pa_t_path: :obj:`str`
            Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
            Each row is a projected augmentation :math:`P_r x_\star` of :math:`P_r X`.
        a_t_path: :obj:`str`, optional
            Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
            Each row is an augmentation :math:`x_\star` of :math:`X`.
            Include for low-rank inference.
        partition_size: :obj:`int`, optional
            Number of rows to process per partition.
            Default given by block size of :math:`(P_r A)^T`.

        Returns
        -------
        :class:`.Table`
            Table of results for each augmented design matrix.
        """
        from hail.table import Table

        self._check_dof(self.f + 1)

        if self.low_rank and a_t_path is None:
            raise ValueError('model is low-rank so a_t is required.')
        elif not (self.low_rank or a_t_path is None):
            raise ValueError('model is full-rank so a_t must not be set.')

        if self._scala_model is None:
            self._set_scala_model()

        if partition_size is None:
            block_size = Env.hail().linalg.BlockMatrix.readMetadata(
                Env.hc()._jhc, pa_t_path).blockSize()
            partition_size = block_size
        elif partition_size <= 0:
            raise ValueError(
                f'partition_size must be positive, found {partition_size}')

        jpa_t = Env.hail().linalg.RowMatrix.readBlockMatrix(
            Env.hc()._jhc, pa_t_path, jsome(partition_size))

        if a_t_path is None:
            maybe_ja_t = jnone()
        else:
            maybe_ja_t = jsome(Env.hail().linalg.RowMatrix.readBlockMatrix(
                Env.hc()._jhc, a_t_path, jsome(partition_size)))

        return Table._from_java(self._scala_model.fit(jpa_t, maybe_ja_t))
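A hedged call sketch (hypothetical path; ``model`` stands for a :class:`.LinearMixedModel` on which :meth:`fit` has already been run):

>>> ht = model.fit_alternatives('gs://my-bucket/pa_t.bm')  # doctest: +SKIP
>>> ht.filter(ht.p_value < 5e-8).show()  # doctest: +SKIP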
Example #43
def _get_flags(*flags):
    return {flag: Env.backend()._jhc.flags().get(flag) for flag in flags}
Example #44
def current_backend():
    return Env.hc()._backend
Example #45
def uri_path(uri):
    return Env.jutils().uriPath(uri)
Example #46
    def exists(self, path: str) -> bool:
        return Env.jutils().exists(path, Env.hc()._jhc)
Example #47
def new_temp_file(suffix=None, prefix=None, n_char=10):
    return Env.hc()._jhc.getTemporaryFile(n_char, joption(prefix),
                                          joption(suffix))
Example #48
def require_biallelic(dataset, method) -> MatrixTable:
    require_row_key_variant(dataset, method)
    dataset = MatrixTable(Env.hail().methods.VerifyBiallelic.apply(
        dataset._jvds, method))
    return dataset
Example #49
    def to_hql(self):
        return '("{regex}" ~ {string})'.format(
            regex=Env.jutils().escapePyString(self.regex),
            string=self.string.to_hql())
Example #50
    def test_type_jvm_roundtrip(self):
        ts = self.types_to_test()
        for t in ts:
            rev_str = t._parsable_string()
            jtyp = Env.hail().expr.ir.IRParser.parseType(rev_str)
            self.assertEqual(t, dtype(jtyp.toString()))
Example #51
def filter_intervals(ds, intervals, keep=True) -> MatrixTable:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all.  This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset, even
    on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable`
        Dataset.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. If there is only one row partition key, the
        point type of the interval can be the type of the first partition key.
        Otherwise, the interval point type must be a :class:`.Struct` matching
        the row partition key schema.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in `intervals`.
        If ``False``, keep only rows that fall outside all intervals in
        `intervals`.

    Returns
    -------
    :class:`.MatrixTable`
    """

    n_pk = len(ds.partition_key)
    pk_type = ds.partition_key.dtype
    point_type = intervals.dtype.element_type.point_type

    if point_type == pk_type:
        needs_wrapper = False
    elif n_pk == 1 and point_type == ds.partition_key[0].dtype:
        needs_wrapper = True
    else:
        raise TypeError(
            "The point type does not match the row partition key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end), interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    intervals = [wrap_input(x)._jrep for x in intervals.value]
    jmt = Env.hail().methods.FilterIntervals.apply(ds._jvds, intervals, keep)
    return MatrixTable(jmt)
Example #52
0
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.
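
    For example (a sketch, assuming `DP` is present and the default field name
    used in the examples above), the cohort-wide average of per-sample mean
    depth can be computed with a column aggregation:

    >>> dataset.aggregate_cols(hl.agg.mean(dataset.sample_qc.dp_stats.mean))  # doctest: +SKIP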

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls not missing or filtered.
      Equivalent to `n_called` divided by :meth:`.count_rows`.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of `n_het` and `n_hom_var`.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.
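
    For example, because `r_ti_tv` is missing when `n_transversion` is zero, a
    filter on that ratio can be guarded with :func:`.is_defined` (a sketch,
    assuming the default field name):

    >>> defined_ti_tv = hl.is_defined(dataset.sample_qc.r_ti_tv)  # doctest: +SKIP
    >>> filtered = dataset.filter_cols(defined_ti_tv & (dataset.sample_qc.r_ti_tv > 1.5))  # doctest: +SKIP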

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.cond(
                at == allele_ints['SNP'],
                hl.cond(hl.is_transition(ref, alt), allele_ints['Transition'],
                        allele_ints['Transversion']), at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(
        **{
            variant_ac:
            hl.agg.call_stats(mt.GT, mt.alleles).AC,
            variant_atypes:
            mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
        })

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'sample_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))

    n_rows_ref = hl.expr.construct_expr(
        hl.ir.Ref('n_rows'), hl.tint64, mt._col_indices,
        hl.utils.LinkedList(hl.expr.Aggregation))
    bound_exprs['n_filtered'] = n_rows_ref - hl.agg.count()
    bound_exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    bound_exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.sum(
            hl.range(0, mt['GT'].ploidy).map(
                lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1],
                       hl.null(hl.tint32))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0,
                 mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    zero = hl.int64(0)

    result_struct = hl.rbind(hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                **gq_dp_exprs,
                'call_rate': hl.float64(x.n_called) / (x.n_called + x.n_not_called + x.n_filtered),
                'n_called': x.n_called,
                'n_not_called': x.n_not_called,
                'n_filtered': x.n_filtered,
                'n_hom_ref': x.n_hom_ref,
                'n_het': x.n_het,
                'n_hom_var': x.n_called - x.n_hom_ref - x.n_het,
                'n_non_ref': x.n_called - x.n_hom_ref,
                'n_singleton': x.n_singleton,
                'n_snp': x.allele_type_counts.get(allele_ints["Transition"], zero) + \
                         x.allele_type_counts.get(allele_ints["Transversion"], zero),
                'n_insertion': x.allele_type_counts.get(allele_ints["Insertion"], zero),
                'n_deletion': x.allele_type_counts.get(allele_ints["Deletion"], zero),
                'n_transition': x.allele_type_counts.get(allele_ints["Transition"], zero),
                'n_transversion': x.allele_type_counts.get(allele_ints["Transversion"], zero),
                'n_star': x.allele_type_counts.get(allele_ints["Star"], zero)
            }),
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion)
            )))

    mt = mt.annotate_cols(**{name: result_struct})
    mt = mt.drop(variant_ac, variant_atypes)

    return mt
Example #53
0
 def is_file(self, path: str) -> bool:
     return Env.jutils().isFile(path, Env.hc()._jhc)
Example #54
0
 def test_parses(self):
     for x in self.value_irs():
         Env.hail().expr.Parser.parse_value_ir(str(x))
Example #55
0
 def __init__(self, value: Any, dtype: 'hail.HailType'):
     super(Broadcast, self).__init__()
     self.value = value
     self.dtype = dtype
     self.uid = Env.get_uid()
Example #56
0
 def is_dir(self, path: str) -> bool:
     return Env.jutils().isDir(path, Env.hc()._jhc)
Example #57
0
 def stat(self, path: str) -> Dict:
     return json.loads(Env.jutils().stat(path, Env.hc()._jhc))
Example #58
0
 def ls(self, path: str) -> List[Dict]:
     r = Env.jutils().ls(path, Env.hc()._jhc)
     return json.loads(r)
Example #59
0
 def _compute_type(self):
     self._type = Env.backend().table_type(self)
Example #60
0
def concordance(left, right) -> Tuple[List[List[int]], Table, Table]:
    """Calculate call concordance with another dataset.

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_unphased_diploid_gt.rst

    Examples
    --------

    Compute concordance between two datasets and output the global concordance
    statistics and two tables with concordance computed per column key and per
    row key:

    >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2)

    Notes
    -----

    This method computes the genotype call concordance (from the entry
    field **GT**) between two biallelic variant datasets.  It requires
    unique sample IDs and performs an inner join on samples (only
    samples in both datasets will be considered). In addition, all genotype
    calls must be **diploid** and **unphased**.
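
    If a dataset might contain phased or non-diploid calls, one way to satisfy
    this requirement is to filter such entries out beforehand (a sketch, not
    part of this method; note that filtered entries are treated as "no data"):

    >>> dataset = dataset.filter_entries(dataset.GT.is_diploid() & ~dataset.GT.phased)  # doctest: +SKIP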

    It performs an ordered zip join of the variants.  That means the
    variants of each dataset are sorted, with duplicate variants
    appearing in some random relative order, and then zipped together.
    When a variant appears a different number of times between the two
    datasets, the dataset with the fewer number of instances is padded
    with "no data".  For example, if a variant is only in one dataset,
    then each genotype is treated as "no data" in the other.

    This method returns a tuple of three objects: a nested list of
    list of int with global concordance summary statistics, a table
    with concordance statistics per column key, and a table with
    concordance statistics per row key.

    **Using the global summary result**

    The global summary is a list of list of int (conceptually a 5 by 5 matrix),
    where the indices have special meaning:

    0. No Data (missing variant)
    1. No Call (missing genotype call)
    2. Hom Ref
    3. Heterozygous
    4. Hom Var

    The first index is the state in the left dataset and the second index is
    the state in the right dataset. Typical uses of the summary list are shown
    below.

    >>> summary, samples, variants = hl.concordance(dataset, dataset2)
    >>> left_homref_right_homvar = summary[2][4]
    >>> left_het_right_missing = summary[3][1]
    >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3]
    >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
    >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant

    **Using the table results**

    Table 1: Concordance statistics by column

    This table contains the column key field of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of
          the global summary defined above.

    Table 2: Concordance statistics by row

    This table contains the row key fields of `left`, and the following fields:

        - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see below for
          full definition).
        - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`) --
          Array of concordance per state on left and right, matching the structure of the
          global summary defined above.

    In these tables, the column **n_discordant** is provided as a convenience,
    because this is often one of the most useful concordance statistics. This
    value is the number of genotypes which were called (homozygous reference,
    heterozygous, or homozygous variant) in both datasets, but where the call
    did not match between the two.

    The column `concordance` matches the structure of the global summary,
    which is detailed above. Once again, the first index into this array is the
    state on the left, and the second index is the state on the right. For
    example, ``concordance[1][4]`` is the number of "no call" genotypes on the
    left that were called homozygous variant on the right.
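
    As a usage sketch (assuming the variables from the example above), the
    per-column table can be sorted to surface the most discordant samples:

    >>> worst_samples = cols_conc.order_by(hl.desc(cols_conc.n_discordant)).take(5)  # doctest: +SKIP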

    Parameters
    ----------
    left : :class:`.MatrixTable`
        First dataset to compare.
    right : :class:`.MatrixTable`
        Second dataset to compare.

    Returns
    -------
    (list of list of int, :class:`.Table`, :class:`.Table`)
        The global concordance statistics, a table with concordance statistics
        per column key, and a table with concordance statistics per row key.

    """

    require_col_key_str(left, 'concordance, left')
    require_col_key_str(right, 'concordance, right')
    left = left.select_rows().select_cols().select_globals().select_entries(
        'GT')
    right = right.select_rows().select_cols().select_globals().select_entries(
        'GT')
    left = require_biallelic(left, "concordance, left")
    right = require_biallelic(right, "concordance, right")

    r = Env.hail().methods.CalculateConcordance.apply(left._jvds, right._jvds)
    j_global_conc = r._1()
    col_conc = Table(r._2())
    row_conc = Table(r._3())
    global_conc = [[j_global_conc.apply(j).apply(i) for i in range(5)]
                   for j in range(5)]

    return global_conc, col_conc, row_conc