Code Example #1
File: py4j_backend.py Project: chrisvittal/hail
    def deco(*args, **kwargs):
        import pyspark
        try:
            return f(*args, **kwargs)
        except py4j.protocol.Py4JJavaError as e:
            s = e.java_exception.toString()

            # py4j catches NoSuchElementExceptions to stop array iteration
            if s.startswith('java.util.NoSuchElementException'):
                raise

            tpl = Env.jutils().handleForPython(e.java_exception)
            deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()

            if error_id != -1:
                raise FatalError('Error summary: %s' % (deepest, ),
                                 error_id) from None
            else:
                raise FatalError(
                    '%s\n\nJava stack trace:\n%s\n'
                    'Hail version: %s\n'
                    'Error summary: %s' %
                    (deepest, full, hail.__version__, deepest),
                    error_id) from None
        except pyspark.sql.utils.CapturedException as e:
            raise FatalError(
                '%s\n\nJava stack trace:\n%s\n'
                'Hail version: %s\n'
                'Error summary: %s' %
                (e.desc, e.stackTrace, hail.__version__, e.desc)) from None
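
The snippet above is the inner function of a decorator: f is the wrapped backend method, captured by closure, and any Py4J error it raises is translated into a Python-side FatalError. A minimal sketch of the enclosing pattern, using a hypothetical outer name (handle_java_exceptions is illustrative, not necessarily the project's identifier):

import functools

def handle_java_exceptions(f):  # hypothetical name for the enclosing decorator
    @functools.wraps(f)
    def deco(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except Exception:
            # the real wrapper inspects py4j.protocol.Py4JJavaError here,
            # as in the snippet above, and re-raises a FatalError
            raise
    return deco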
Code Example #2
    def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
        """Read a PLINK .fam file and return a pedigree object.

        **Examples**

        >>> ped = hl.Pedigree.read('data/test.fam')

        Notes
        -----

        See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
        the required format.

        :param str fam_path: path to .fam file.

        :param str delimiter: Field delimiter.

        :rtype: :class:`.Pedigree`
        """

        trios = []
        missing_sex_count = 0
        missing_sex_values = set()
        with Env.fs().open(fam_path) as file:
            for line in file:
                split_line = re.split(delimiter, line.strip())
                num_fields = len(split_line)
                if num_fields != 6:
                    raise FatalError(
                        "Require 6 fields per line in .fam, but this line has {}: {}"
                        .format(num_fields, line))
                (fam, kid, dad, mom, sex, _) = tuple(split_line)
                # 1 is male, 2 is female, 0 is unknown.
                is_female = sex == "2" if sex == "1" or sex == "2" else None

                if is_female is None:
                    missing_sex_count += 1
                    missing_sex_values.add(kid)

                trio = Trio(kid, fam if fam != "0" else None,
                            dad if dad != "0" else None,
                            mom if mom != "0" else None, is_female)
                trios.append(trio)

        only_ids = [trio.s for trio in trios]
        duplicate_ids = [
            id for id, count in Counter(only_ids).items() if count > 1
        ]
        if duplicate_ids:
            raise FatalError(
                "Invalid pedigree: found duplicate proband IDs\n{}".format(
                    duplicate_ids))

        if missing_sex_count > 0:
            warning(
                "Found {} samples with missing sex information (not 1 or 2).\n"
                "Missing samples: [{}]".format(
                    missing_sex_count, ', '.join(missing_sex_values)))

        return Pedigree(trios)
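
Each .fam line carries six whitespace-separated fields: family ID, proband ID, father ID, mother ID, sex (1 male, 2 female, 0 unknown), and phenotype, with "0" meaning missing for the ID fields. A self-contained sketch of the same parsing logic, using a hypothetical record:

line = "fam1 kid1 dad1 mom1 2 -9"  # hypothetical .fam record
fam, kid, dad, mom, sex, _ = line.split()
is_female = sex == "2" if sex in ("1", "2") else None
print(kid, is_female)  # kid1 True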
Code Example #3
    def from_numpy(cls, ndarray, block_size=None):
        """Distributes a `NumPy ndarray
        <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html>`__
        as a block matrix.

        Examples
        --------

        >>> import numpy as np
        >>> a = np.random.rand(10, 20)
        >>> bm = BlockMatrix.from_numpy(a)

        Notes
        -----
        The ndarray must have two dimensions, each of non-zero size.

        The number of entries must be less than :math:`2^{31}`.

        Parameters
        ----------
        ndarray: :class:`numpy.ndarray`
            ndarray with two dimensions, each of non-zero size.
        block_size: :obj:`int`, optional
            Block size. Default given by :meth:`default_block_size`.

        Returns
        -------
        :class:`.BlockMatrix`
        """
        if not block_size:
            block_size = BlockMatrix.default_block_size()

        if ndarray.ndim != 2:
            raise FatalError(
                "from_numpy: ndarray must have two axes, found shape {}".
                format(ndarray.shape))
        n_rows, n_cols = ndarray.shape
        if n_rows == 0 or n_cols == 0:
            raise FatalError(
                "from_numpy: ndarray dimensions must be non-zero, found shape {}"
                .format(ndarray.shape))
        if ndarray.dtype != np.float64:
            ndarray = ndarray.astype(np.float64)

        local_temp_dir = new_local_temp_dir()
        path = local_temp_dir + '/binary'
        uri = local_path_uri(path)

        ndarray.tofile(path)
        return cls.fromfile(uri, n_rows, n_cols, block_size)
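
A quick round-trip check, using the to_numpy counterpart referenced in this API's See Also sections (skipped in doctests since it launches a Spark job):

>>> import numpy as np
>>> a = np.random.rand(10, 20)
>>> bm = BlockMatrix.from_numpy(a)
>>> np.allclose(bm.to_numpy(), a)  # doctest: +SKIP
True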
Code Example #4
File: context.py Project: pblh123/hail
def _set_flags(**flags):
    available = set(Env.hc()._jhc.flags().available())
    invalid = []
    for flag, value in flags.items():
        if flag in available:
            Env.hc()._jhc.flags().set(flag, value)
        else:
            invalid.append(flag)
    if len(invalid) != 0:
        raise FatalError("Flags {} not valid. Valid flags: \n    {}".format(
            ', '.join(invalid), '\n    '.join(available)))
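
A usage sketch for _set_flags; both flag names below are hypothetical, since the valid set is reported by the JVM at runtime:

_set_flags(some_flag='1')   # hypothetical flag; set only if the JVM lists it as available
_set_flags(not_a_flag='1')  # otherwise raises FatalError naming the valid flags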
Code Example #5
    def tofile(self, uri):
        """Collects and writes data to a binary file.

        Examples
        --------

        >>> from hail.linalg import BlockMatrix
        >>> import numpy as np
        >>> bm = BlockMatrix.random(10, 20)
        >>> bm.tofile('file:///local/file') # doctest: +SKIP

        To create a :class:`numpy.ndarray` of the same dimensions:

        >>> a = np.fromfile('/local/file').reshape((10, 20)) # doctest: +SKIP

        Notes
        -----
        This method, analogous to `numpy.tofile
        <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tofile.html>`__,
        produces a binary file of float64 values in row-major order, which can
        be read by functions such as `numpy.fromfile
        <https://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html>`__
        (if a local file) and :meth:`BlockMatrix.fromfile`.

        Binary files produced and consumed by :meth:`.tofile` and
        :meth:`.fromfile` are not platform independent, so should only be used
        for inter-operating with NumPy, not storage. Use
        :meth:`BlockMatrix.write` and :meth:`BlockMatrix.read` to save and load
        block matrices, since these methods write and read blocks in parallel
        and are platform independent.

        The number of entries must be less than :math:`2^{31}`.

        Parameters
        ----------
        uri: :obj:`str`, optional
            URI of binary output file.

        See Also
        --------
        :meth:`.to_numpy`
        """
        n_entries = self.n_rows * self.n_cols
        if n_entries >= 1 << 31:  # 1 << 31 == 2**31, matching the error message below
            raise FatalError(
                'Number of entries must be less than 2^31, found {}'.format(
                    n_entries))

        bdm = self._jbm.toBreezeMatrix()
        hc = Env.hc()
        row_major = Env.hail(
        ).utils.richUtils.RichDenseMatrixDouble.exportToDoubles(
            hc._jhc, uri, bdm, True)
        assert row_major
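
Because the file is written in row-major order, reading it back with numpy.fromfile yields the entries row by row; a small illustration with concrete values and a hypothetical local path:

>>> import numpy as np
>>> bm = BlockMatrix.from_numpy(np.arange(6, dtype=np.float64).reshape(2, 3))
>>> bm.tofile('file:///tmp/bm.bin')  # doctest: +SKIP
>>> np.fromfile('/tmp/bm.bin')  # doctest: +SKIP
array([0., 1., 2., 3., 4., 5.])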
Code Example #6
File: py4j_backend.py Project: chrisvittal/hail
    def set_flags(self, **flags: str):
        available = self._jbackend.availableFlags()
        invalid = []
        for flag, value in flags.items():
            if flag in available:
                self._jbackend.setFlag(flag, value)
            else:
                invalid.append(flag)
        if len(invalid) != 0:
            raise FatalError(
                "Flags {} not valid. Valid flags: \n    {}".format(
                    ', '.join(invalid), '\n    '.join(available)))
Code Example #7
File: local_backend.py Project: theferrit32/hail
    def deco(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except py4j.protocol.Py4JJavaError as e:
            s = e.java_exception.toString()

            # py4j catches NoSuchElementExceptions to stop array iteration
            if s.startswith('java.util.NoSuchElementException'):
                raise

            tpl = Env.jutils().handleForPython(e.java_exception)
            deepest, full = tpl._1(), tpl._2()
            raise FatalError('%s\n\nJava stack trace:\n%s\n'
                             'Hail version: %s\n'
                             'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
Code Example #8
File: context.py Project: pblh123/hail
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log=None,
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 idempotent=False,
                 global_seed=6348563392232659379,
                 _backend=None):

        if Env._hc:
            if idempotent:
                return
            else:
                raise FatalError(
                    'Hail has already been initialized, restart session '
                    'or stop Hail to change configuration.')

        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
            conf = SparkConf()
            conf.set('spark.driver.extraClassPath', hail_jar_path)
            conf.set('spark.executor.extraClassPath', hail_jar_path)
            SparkContext._ensure_initialized(conf=conf)
        else:
            SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        if _backend is None:
            _backend = SparkBackend()
        self._backend = _backend

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

        version = read_version_info()
        hail.__version__ = version

        if log is None:
            log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                            suffix=f'-{version}.log')
        self._log = log

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        if idempotent:
            self._jhc = self._hail.HailContext.getOrCreate(
                jsc, app_name, joption(master), local, log, True, append,
                min_block_size, branching_factor, tmp_dir)
        else:
            self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                     joption(master), local,
                                                     log, True, append,
                                                     min_block_size,
                                                     branching_factor, tmp_dir)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        self._default_ref = None
        Env.hail().variant.ReferenceGenome.setDefaultReference(
            self._jhc, default_reference)

        jar_version = self._jhc.version()

        if jar_version != version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {version}")

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        install_exception_handler()
        Env.set_seed(global_seed)
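
In user code this constructor is normally reached through Hail's top-level entry point rather than instantiated directly; a hedged sketch, assuming the hl.init signature of this era mirrors the parameters above:

import hail as hl
hl.init(app_name='Hail', default_reference='GRCh37', quiet=True)  # constructs the HailContext shown above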
Code Example #9
File: context.py Project: konradjk/hail
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log='hail.log',
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 force_ir=False):

        if Env._hc:
            raise FatalError(
                'Hail has already been initialized, restart session '
                'or stop Hail to change configuration.')

        SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                 joption(master), local, log,
                                                 True, append, min_block_size,
                                                 branching_factor, tmp_dir,
                                                 force_ir)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jsql_context = self._jhc.sqlContext()
        self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)
        self._counter = 1

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        self._default_ref = None
        Env.hail().variant.ReferenceGenome.setDefaultReference(
            self._jhc, default_reference)

        version = self._jhc.version()
        hail.__version__ = version

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

            if version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')

        install_exception_handler()
Code Example #10
File: context.py Project: nawatts/hail
    def __init__(self,
                 sc=None,
                 app_name="Hail",
                 master=None,
                 local='local[*]',
                 log=None,
                 quiet=False,
                 append=False,
                 min_block_size=1,
                 branching_factor=50,
                 tmp_dir=None,
                 default_reference="GRCh37",
                 idempotent=False,
                 global_seed=6348563392232659379,
                 spark_conf=None,
                 optimizer_iterations=None,
                 _backend=None):

        if Env._hc:
            if idempotent:
                return
            else:
                raise FatalError(
                    'Hail has already been initialized, restart session '
                    'or stop Hail to change configuration.')

        if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
            hail_jar_path = pkg_resources.resource_filename(
                __name__, "hail-all-spark.jar")
            assert os.path.exists(
                hail_jar_path), f'{hail_jar_path} does not exist'
            conf = SparkConf()

            base_conf = spark_conf or {}
            for k, v in base_conf.items():
                conf.set(k, v)

            jars = [hail_jar_path]

            if os.environ.get('HAIL_SPARK_MONITOR'):
                import sparkmonitor
                jars.append(
                    os.path.join(os.path.dirname(sparkmonitor.__file__),
                                 'listener.jar'))
                conf.set("spark.extraListeners",
                         "sparkmonitor.listener.JupyterSparkMonitorListener")

            conf.set('spark.jars', ','.join(jars))
            conf.set('spark.driver.extraClassPath', ','.join(jars))
            conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
            if sc is None:
                SparkContext._ensure_initialized(conf=conf)
            else:
                import warnings
                warnings.warn(
                    'pip-installed Hail requires additional configuration options in Spark referring\n'
                    '  to the path to the Hail Python module directory HAIL_DIR,\n'
                    '  e.g. /path/to/python/site-packages/hail:\n'
                    '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                    '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                    '    spark.executor.extraClassPath=./hail-all-spark.jar')
        else:
            SparkContext._ensure_initialized()

        self._gateway = SparkContext._gateway
        self._jvm = SparkContext._jvm

        # hail package
        self._hail = getattr(self._jvm, 'is').hail

        self._warn_cols_order = True
        self._warn_entries_order = True

        Env._jvm = self._jvm
        Env._gateway = self._gateway

        jsc = sc._jsc.sc() if sc else None

        if _backend is None:
            if os.environ.get('HAIL_APISERVER_URL') is not None:
                _backend = ServiceBackend()
            else:
                _backend = SparkBackend()
        self._backend = _backend

        tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')
        optimizer_iterations = get_env_or_default(optimizer_iterations,
                                                  'HAIL_OPTIMIZER_ITERATIONS',
                                                  3)

        py_version = version()

        if log is None:
            log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                            suffix=f'-{py_version}.log')
        self._log = log

        # we always pass 'quiet' to the JVM because stderr output needs
        # to be routed through Python separately.
        if idempotent:
            self._jhc = self._hail.HailContext.getOrCreate(
                jsc, app_name, joption(master), local, log, True, append,
                min_block_size, branching_factor, tmp_dir,
                optimizer_iterations)
        else:
            self._jhc = self._hail.HailContext.apply(jsc, app_name,
                                                     joption(master), local,
                                                     log, True, append,
                                                     min_block_size,
                                                     branching_factor, tmp_dir,
                                                     optimizer_iterations)

        self._jsc = self._jhc.sc()
        self.sc = sc if sc else SparkContext(
            gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc))
        self._jspark_session = self._jhc.sparkSession()
        self._spark_session = SparkSession(self.sc, self._jhc.sparkSession())

        super(HailContext, self).__init__()

        # do this at the end in case something errors, so we don't raise the above error without a real HC
        Env._hc = self

        ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
        ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

        if default_reference in ReferenceGenome._references:
            self._default_ref = ReferenceGenome._references[default_reference]
        else:
            self._default_ref = ReferenceGenome.read(default_reference)

        jar_version = self._jhc.version()

        if jar_version != py_version:
            raise RuntimeError(
                f"Hail version mismatch between JAR and Python library\n"
                f"  JAR:    {jar_version}\n"
                f"  Python: {py_version}")

        if not quiet:
            sys.stderr.write('Running on Apache Spark version {}\n'.format(
                self.sc.version))
            if self._jsc.uiWebUrl().isDefined():
                sys.stderr.write('SparkUI available at {}\n'.format(
                    self._jsc.uiWebUrl().get()))

            connect_logger('localhost', 12888)

            self._hail.HailContext.startProgressBar(self._jsc)

            sys.stderr.write(
                'Welcome to\n'
                '     __  __     <>__\n'
                '    / /_/ /__  __/ /\n'
                '   / __  / _ `/ / /\n'
                '  /_/ /_/\\_,_/_/_/   version {}\n'.format(py_version))

            if py_version.startswith('devel'):
                sys.stderr.write(
                    'NOTE: This is a beta version. Interfaces may change\n'
                    '  during the beta period. We recommend pulling\n'
                    '  the latest changes weekly.\n')
            sys.stderr.write(f'LOGGING: writing to {log}\n')

        install_exception_handler()
        Env.set_seed(global_seed)
Code Example #11
    def fromfile(cls, uri, n_rows, n_cols, block_size=None):
        """Creates a block matrix from a binary file.

        Examples
        --------

        >>> import numpy as np
        >>> a = np.random.rand(10, 20)
        >>> a.tofile('/local/file') # doctest: +SKIP

        To create a block matrix of the same dimensions:

        >>> from hail.linalg import BlockMatrix
        >>> bm = BlockMatrix.fromfile('file:///local/file', 10, 20) # doctest: +SKIP

        Notes
        -----
        This method, analogous to `numpy.fromfile
        <https://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html>`__,
        reads a binary file of float64 values in row-major order, such as that
        produced by `numpy.tofile
        <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tofile.html>`__
        or :meth:`BlockMatrix.tofile`.

        Binary files produced and consumed by :meth:`.tofile` and
        :meth:`.fromfile` are not platform independent, so should only be used
        for inter-operating with NumPy, not storage. Use
        :meth:`BlockMatrix.write` and :meth:`BlockMatrix.read` to save and load
        block matrices, since these methods write and read blocks in parallel
        and are platform independent.

        A NumPy ndarray must have type float64 for the output of
        :func:`numpy.tofile` to be a valid binary input to :meth:`.fromfile`.
        This is not checked.

        The number of entries must be less than :math:`2^{31}`.

        Parameters
        ----------
        uri: :obj:`str`, optional
            URI of binary input file.
        n_rows: :obj:`int`
            Number of rows.
        n_cols: :obj:`int`
            Number of columns.
        block_size: :obj:`int`, optional
            Block size. Default given by :meth:`default_block_size`.

        See Also
        --------
        :meth:`.from_numpy`
        """
        if not block_size:
            block_size = BlockMatrix.default_block_size()

        n_entries = n_rows * n_cols
        if n_entries >= 1 << 31:  # 1 << 31 == 2**31, matching the error message below
            raise FatalError(
                'Number of entries must be less than 2^31, found {}'.format(
                    n_entries))

        hc = Env.hc()
        bdm = Env.hail(
        ).utils.richUtils.RichDenseMatrixDouble.importFromDoubles(
            hc._jhc, uri, n_rows, n_cols, True)

        return cls(Env.hail().linalg.BlockMatrix.fromBreezeMatrix(
            hc._jsc, bdm, block_size))
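
As the notes advise, platform-independent persistence should go through the block-parallel writer and reader rather than these binary files; a hedged sketch with a hypothetical path:

>>> bm.write('output/bm')  # doctest: +SKIP
>>> bm2 = BlockMatrix.read('output/bm')  # doctest: +SKIP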