Example #1
class ExportType:
    CONCATENATED = "concatenated"
    PARALLEL_SEPARATE_HEADER = "separate_header"
    PARALLEL_HEADER_IN_SHARD = "header_per_shard"

    checker = enumeration("concatenated", "separate_header", "header_per_shard")

    @staticmethod
    def default(export_type):
        if export_type is None:
            return ExportType.CONCATENATED
        else:
            return export_type
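
Example #1 binds an enumeration checker to ExportType.checker and falls back to CONCATENATED when no value is supplied. Below is a minimal sketch, not taken from any example on this page, of the same pattern applied directly to a function argument; the name describe_export and its return string are hypothetical, while the imports and decorator usage mirror the examples.

from hail.typecheck import typecheck, enumeration


@typecheck(export_type=enumeration("concatenated", "separate_header", "header_per_shard"))
def describe_export(export_type="concatenated"):
    # The decorator validates export_type before the body runs; a value
    # outside the enumerated strings is rejected (this typically surfaces
    # as a TypeError from the typecheck machinery).
    return "exporting with layout: " + export_type

# describe_export("separate_header")   # accepted
# describe_export("zip")               # rejected by the enumeration checker
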
Example #2
                       matrix_table_source)
from hail.expr.types import tarray
from hail import ir
from hail.linalg import BlockMatrix
from hail.table import Table
from hail.typecheck import typecheck, nullable, numeric, enumeration

from ..pca import hwe_normalized_pca


@typecheck(call_expr=expr_call,
           min_individual_maf=numeric,
           k=nullable(int),
           scores_expr=nullable(expr_array(expr_float64)),
           min_kinship=nullable(numeric),
           statistics=enumeration('kin', 'kin2', 'kin20', 'all'),
           block_size=nullable(int),
           include_self_kinship=bool)
def pc_relate(call_expr,
              min_individual_maf,
              *,
              k=None,
              scores_expr=None,
              min_kinship=None,
              statistics="all",
              block_size=None,
              include_self_kinship=False) -> Table:
    r"""Compute relatedness estimates between individuals using a variant of the
    PC-Relate method.

    .. include:: ../_templates/req_diploid_gt.rst
Example #3
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=nullable(str),
           default_reference=enumeration(*BUILTIN_REFERENCES),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
           _optimizer_iterations=nullable(int))
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
         log=None,
         quiet=False,
         append=False,
         min_block_size=0,
         branching_factor=50,
Example #4

def new_local_temp_dir(suffix=None, prefix=None, dir=None):
    local_temp_dir = tempfile.mkdtemp(suffix, prefix, dir)
    atexit.register(shutil.rmtree, local_temp_dir)
    return local_temp_dir


def new_local_temp_file(filename="temp"):
    local_temp_dir = new_local_temp_dir()
    path = local_temp_dir + "/" + filename
    return path


storage_level = enumeration('NONE', 'DISK_ONLY', 'DISK_ONLY_2', 'MEMORY_ONLY',
                            'MEMORY_ONLY_2', 'MEMORY_ONLY_SER', 'MEMORY_ONLY_SER_2',
                            'MEMORY_AND_DISK', 'MEMORY_AND_DISK_2', 'MEMORY_AND_DISK_SER',
                            'MEMORY_AND_DISK_SER_2', 'OFF_HEAP')


def run_command(args):
    import subprocess as sp
    try:
        sp.check_output(args, stderr=sp.STDOUT)
    except sp.CalledProcessError as e:
        print(e.output)
        raise e


def plural(orig, n, alternate=None):
    if n == 1:
        return orig
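
In Example #4, storage_level is an enumeration checker bound to a module-level name, presumably so that @typecheck decorators elsewhere can share one definition of the allowed Spark storage levels. A minimal sketch of that reuse pattern follows; compression_checker and the two write_* functions are made-up names.

from hail.typecheck import typecheck, enumeration

# Hypothetical reusable checker, analogous to storage_level above.
compression_checker = enumeration('gz', 'bgz')


@typecheck(path=str, compression=compression_checker)
def write_table_file(path, compression='bgz'):
    ...


@typecheck(path=str, compression=compression_checker)
def write_matrix_file(path, compression='bgz'):
    ...
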
Example #5
    def upload_log(self):
        self._jhc.uploadLog()


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           idempotent=bool,
           global_seed=nullable(int),
           _backend=nullable(Backend))
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
         log=None,
         quiet=False,
         append=False,
         min_block_size=1,
         branching_factor=50,
         tmp_dir='/tmp',
         default_reference='GRCh37',
         idempotent=False,
Example #6
@typecheck(bms=sequenceof(BlockMatrix), prefix=str, overwrite=bool)
def block_matrices_tofiles(bms: List[BlockMatrix],
                           prefix: str,
                           overwrite: bool = False):
    writer = BlockMatrixBinaryMultiWriter(prefix, overwrite)
    Env.backend().execute(
        BlockMatrixMultiWrite([bm._bmir for bm in bms], writer))


@typecheck(bms=sequenceof(BlockMatrix),
           prefix=str,
           overwrite=bool,
           delimiter=str,
           header=nullable(str),
           add_index=bool,
           compression=nullable(enumeration('gz', 'bgz')),
           custom_filenames=nullable(sequenceof(str)))
def export_block_matrices(bms: List[BlockMatrix],
                          prefix: str,
                          overwrite: bool = False,
                          delimiter: str = '\t',
                          header: Optional[str] = None,
                          add_index: bool = False,
                          compression: Optional[str] = None,
                          custom_filenames=None):

    if custom_filenames:
        assert len(custom_filenames) == len(
            bms
        ), "Number of block matrices and number of custom filenames must be equal"
Example #7
from hail.utils.java import Env
from hail.typecheck import typecheck, enumeration
from typing import Dict, List


@typecheck(path=str,
           mode=enumeration('r', 'w', 'x', 'rb', 'wb', 'xb'),
           buffer_size=int)
def hadoop_open(path: str, mode: str = 'r', buffer_size: int = 8192):
    """Open a file through the Hadoop filesystem API. Supports distributed
    file systems like hdfs, gs, and s3.

    Warning
    -------
    Due to an implementation limitation, :func:`hadoop_open` may be quite
    slow for large data sets (anything larger than 50 MB).

    Examples
    --------
    Write a Pandas DataFrame as a CSV directly into Google Cloud Storage:

    >>> with hadoop_open('gs://my-bucket/df.csv', 'w') as f: # doctest: +SKIP
    ...     pandas_df.to_csv(f)

    Read and print the lines of a text file stored in Google Cloud Storage:

    >>> with hadoop_open('gs://my-bucket/notes.txt') as f: # doctest: +SKIP
    ...     for line in f:
    ...         print(line.strip())

    Write two lines directly to a file in Google Cloud Storage:
Example #8
        Env._seed_generator = None
        hail.ir.clear_session_functions()
        ReferenceGenome._references = {}


@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38', 'GRCm38',
                                         'CanFam3'),
           idempotent=bool,
           global_seed=nullable(int),
           spark_conf=nullable(dictof(str, str)),
           skip_logging_configuration=bool,
           local_tmpdir=nullable(str),
           _optimizer_iterations=nullable(int))
def init(sc=None,
         app_name='Hail',
         master=None,
         local='local[*]',
         log=None,
         quiet=False,
         append=False,
         min_block_size=0,
         branching_factor=50,
Example #9
        Env._seed_generator = None

    def upload_log(self):
        self._jhc.uploadLog()

@typecheck(sc=nullable(SparkContext),
           app_name=str,
           master=nullable(str),
           local=str,
           log=nullable(str),
           quiet=bool,
           append=bool,
           min_block_size=int,
           branching_factor=int,
           tmp_dir=str,
           default_reference=enumeration('GRCh37', 'GRCh38'),
           idempotent=bool,
           global_seed=nullable(int),
           _backend=nullable(Backend))
def init(sc=None, app_name='Hail', master=None, local='local[*]',
         log=None, quiet=False, append=False,
         min_block_size=1, branching_factor=50, tmp_dir='/tmp',
         default_reference='GRCh37', idempotent=False,
         global_seed=6348563392232659379, _backend=None):
    """Initialize Hail and Spark.

    Parameters
    ----------
    sc : pyspark.SparkContext, optional
        Spark context. By default, a Spark context will be created.
    app_name : :obj:`str`
Example #10
    Returns
    -------
    :class:`.VariantDataset`.
    """
    if keep:
        variant_data = vds.variant_data.semi_join_rows(variants_table)
    else:
        variant_data = vds.variant_data.anti_join_rows(variants_table)
    return VariantDataset(vds.reference_data, variant_data)


@typecheck(vds=VariantDataset,
           intervals=oneof(Table, expr_array(expr_interval(expr_any))),
           keep=bool,
           mode=enumeration('variants_only', 'split_at_boundaries',
                            'unchecked_filter_both'))
def _parameterized_filter_intervals(vds: 'VariantDataset', intervals,
                                    keep: bool, mode: str) -> 'VariantDataset':
    intervals_table = None
    if isinstance(intervals, Table):
        expected = hl.tinterval(hl.tlocus(vds.reference_genome))
        if len(intervals.key) != 1 or intervals.key[0].dtype != hl.tinterval(
                hl.tlocus(vds.reference_genome)):
            raise ValueError(
                f"'filter_intervals': expect a table with a single key of type {expected}; "
                f"found {list(intervals.key.dtype.values())}")
        intervals_table = intervals
        intervals = intervals.aggregate(hl.agg.collect(intervals.key[0]))

    if mode == 'variants_only':
        variant_data = hl.filter_intervals(vds.variant_data, intervals, keep)