def deco(*args, **kwargs):
    import pyspark
    try:
        return f(*args, **kwargs)
    except py4j.protocol.Py4JJavaError as e:
        s = e.java_exception.toString()

        # py4j catches NoSuchElementExceptions to stop array iteration
        if s.startswith('java.util.NoSuchElementException'):
            raise

        tpl = Env.jutils().handleForPython(e.java_exception)
        deepest, full, error_id = tpl._1(), tpl._2(), tpl._3()
        if error_id != -1:
            raise FatalError('Error summary: %s' % (deepest,), error_id) from None
        else:
            raise FatalError('%s\n\nJava stack trace:\n%s\n'
                             'Hail version: %s\n'
                             'Error summary: %s' % (deepest, full, hail.__version__, deepest),
                             error_id) from None
    except pyspark.sql.utils.CapturedException as e:
        raise FatalError('%s\n\nJava stack trace:\n%s\n'
                         'Hail version: %s\n'
                         'Error summary: %s' % (e.desc, e.stackTrace, hail.__version__, e.desc)) from None
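# A minimal sketch of the decorator-factory pattern this body belongs to: `deco`
# closes over `f`, so an enclosing function takes `f` and returns `deco`, and
# backend exceptions are translated into a single, chain-suppressed error.
# The names `handle_backend_exception`, `TranslatedError`, and the use of
# ValueError as the caught type are illustrative stand-ins, not Hail's API.
import functools


class TranslatedError(Exception):
    pass


def handle_backend_exception(f):
    @functools.wraps(f)
    def deco(*args, **kwargs):
        try:
            return f(*args, **kwargs)
        except ValueError as e:  # stand-in for py4j.protocol.Py4JJavaError
            # 'from None' hides the raw backend traceback, as above
            raise TranslatedError('Error summary: %s' % e) from None
    return deco


@handle_backend_exception
def parse_positive(s):
    n = int(s)
    if n <= 0:
        raise ValueError('expected a positive integer, got %d' % n)
    return n


assert parse_positive('3') == 3
# parse_positive('-1')  # would raise TranslatedError with only the summary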
def read(cls, fam_path, delimiter='\\s+') -> 'Pedigree':
    """Read a PLINK .fam file and return a pedigree object.

    **Examples**

    >>> ped = hl.Pedigree.read('data/test.fam')

    Notes
    -----
    See `PLINK .fam file <https://www.cog-genomics.org/plink2/formats#fam>`_ for
    the required format.

    :param str fam_path: path to .fam file.
    :param str delimiter: Field delimiter.

    :rtype: :class:`.Pedigree`
    """
    trios = []
    missing_sex_count = 0
    missing_sex_values = set()
    with Env.fs().open(fam_path) as file:
        for line in file:
            split_line = re.split(delimiter, line.strip())
            num_fields = len(split_line)
            if num_fields != 6:
                raise FatalError("Require 6 fields per line in .fam, but this line has {}: {}"
                                 .format(num_fields, line))
            (fam, kid, dad, mom, sex, _) = tuple(split_line)
            # 1 is male, 2 is female, 0 is unknown.
            is_female = sex == "2" if sex == "1" or sex == "2" else None
            if is_female is None:
                missing_sex_count += 1
                missing_sex_values.add(kid)
            trio = Trio(kid,
                        fam if fam != "0" else None,
                        dad if dad != "0" else None,
                        mom if mom != "0" else None,
                        is_female)
            trios.append(trio)

    only_ids = [trio.s for trio in trios]
    duplicate_ids = [id for id, count in Counter(only_ids).items() if count > 1]
    if duplicate_ids:
        raise FatalError("Invalid pedigree: found duplicate proband IDs\n{}".format(duplicate_ids))

    if missing_sex_count > 0:
        warning("Found {} samples with missing sex information (not 1 or 2).\n"
                " Missing samples: [{}]".format(missing_sex_count, missing_sex_values))

    return Pedigree(trios)
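# A self-contained sketch of the per-line parsing rules above, assuming the
# standard 6-column .fam layout (family, id, father, mother, sex, phenotype).
# The sample line and variable names are illustrative, not taken from Hail.
import re

fam_line = "fam1 kid1 dad1 0 2 -9"
fields = re.split(r'\s+', fam_line.strip())
assert len(fields) == 6, "a .fam line must have exactly 6 fields"
fam, kid, dad, mom, sex, _pheno = fields

# "0" means unknown for the family and parent columns; sex 1=male, 2=female.
father = dad if dad != "0" else None
mother = mom if mom != "0" else None
is_female = sex == "2" if sex in ("1", "2") else None
print(kid, father, mother, is_female)  # kid1 dad1 None True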
def from_numpy(cls, ndarray, block_size=None):
    """Distributes a `NumPy ndarray
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.html>`__
    as a block matrix.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.random.rand(10, 20)
    >>> bm = BlockMatrix.from_numpy(a)

    Notes
    -----
    The ndarray must have two dimensions, each of non-zero size.

    The number of entries must be less than :math:`2^{31}`.

    Parameters
    ----------
    ndarray: :class:`numpy.ndarray`
        ndarray with two dimensions, each of non-zero size.
    block_size: :obj:`int`, optional
        Block size. Default given by :meth:`default_block_size`.

    Returns
    -------
    :class:`.BlockMatrix`
    """
    if not block_size:
        block_size = BlockMatrix.default_block_size()

    if ndarray.ndim != 2:
        raise FatalError("from_numpy: ndarray must have two axes, found shape {}"
                         .format(ndarray.shape))
    n_rows, n_cols = ndarray.shape
    if n_rows == 0 or n_cols == 0:
        raise FatalError("from_numpy: ndarray dimensions must be non-zero, found shape {}"
                         .format(ndarray.shape))

    if ndarray.dtype != np.float64:
        ndarray = ndarray.astype(np.float64)

    local_temp_dir = new_local_temp_dir()
    path = local_temp_dir + '/binary'
    uri = local_path_uri(path)
    ndarray.tofile(path)
    return cls.fromfile(uri, n_rows, n_cols, block_size)
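# A NumPy-only sketch of the handoff from_numpy performs: cast to float64,
# dump the raw buffer to a local binary file, then reconstruct by shape. The
# temp path is illustrative; in from_numpy the read side is BlockMatrix.fromfile.
import os
import tempfile
import numpy as np

a = np.arange(6, dtype=np.int32).reshape(2, 3)
path = os.path.join(tempfile.mkdtemp(), 'binary')
a.astype(np.float64).tofile(path)  # row-major float64, as fromfile expects

b = np.fromfile(path, dtype=np.float64).reshape(2, 3)
assert np.array_equal(a, b)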
def _set_flags(**flags):
    available = set(Env.hc()._jhc.flags().available())
    invalid = []
    for flag, value in flags.items():
        if flag in available:
            Env.hc()._jhc.flags().set(flag, value)
        else:
            invalid.append(flag)
    if len(invalid) != 0:
        raise FatalError("Flags {} not valid. Valid flags: \n    {}"
                         .format(', '.join(invalid), '\n    '.join(available)))
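# A minimal sketch of the validate-then-report pattern used above: unknown
# keys are collected so the caller sees every bad flag in one error rather
# than failing on the first. The flag names here are made up for illustration.
AVAILABLE = {'lower', 'no_whole_stage_codegen'}  # hypothetical flag names


def set_flags_demo(**flags):
    invalid = [flag for flag in flags if flag not in AVAILABLE]
    if invalid:
        raise ValueError("Flags {} not valid. Valid flags:\n    {}"
                         .format(', '.join(invalid), '\n    '.join(sorted(AVAILABLE))))
    return dict(flags)


set_flags_demo(lower='1')              # ok
# set_flags_demo(typo='1', other='2')  # would name both bad flags in one error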
def tofile(self, uri):
    """Collects and writes data to a binary file.

    Examples
    --------
    >>> from hail.linalg import BlockMatrix
    >>> import numpy as np
    >>> bm = BlockMatrix.random(10, 20)
    >>> bm.tofile('file:///local/file') # doctest: +SKIP

    To create a :class:`numpy.ndarray` of the same dimensions:

    >>> a = np.fromfile('/local/file').reshape((10, 20)) # doctest: +SKIP

    Notes
    -----
    This method, analogous to `numpy.tofile
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tofile.html>`__,
    produces a binary file of float64 values in row-major order, which can
    be read by functions such as `numpy.fromfile
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html>`__
    (if a local file) and :meth:`BlockMatrix.fromfile`.

    Binary files produced and consumed by :meth:`.tofile` and
    :meth:`.fromfile` are not platform independent, so should only be used
    for inter-operating with NumPy, not storage. Use
    :meth:`BlockMatrix.write` and :meth:`BlockMatrix.read` to save and load
    block matrices, since these methods write and read blocks in parallel
    and are platform independent.

    The number of entries must be less than :math:`2^{31}`.

    Parameters
    ----------
    uri: :obj:`str`
        URI of binary output file.

    See Also
    --------
    :meth:`.to_numpy`
    """
    n_entries = self.n_rows * self.n_cols
    if n_entries >= 1 << 31:  # 1 << 31 == 2^31; '2 << 31' would allow up to 2^32
        raise FatalError('Number of entries must be less than 2^31, found {}'.format(n_entries))

    bdm = self._jbm.toBreezeMatrix()
    hc = Env.hc()
    row_major = Env.hail().utils.richUtils.RichDenseMatrixDouble.exportToDoubles(hc._jhc, uri, bdm, True)
    assert row_major
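# A short NumPy demonstration of the row-major float64 layout described
# above: bytes written by C-order tofile come back in the same order, so
# reshape recovers the matrix. Paths and shapes are illustrative.
import os
import tempfile
import numpy as np

m = np.array([[1.0, 2.0, 3.0],
              [4.0, 5.0, 6.0]])  # float64, C (row-major) order
path = os.path.join(tempfile.mkdtemp(), 'm.bin')
m.tofile(path)                   # raw buffer only: no header, dtype, or shape

flat = np.fromfile(path)         # dtype defaults to float64
assert flat.tolist() == [1.0, 2.0, 3.0, 4.0, 5.0, 6.0]  # row-major order
assert np.array_equal(flat.reshape(2, 3), m)            # shape must be supplied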
def set_flags(self, **flags: str):
    available = self._jbackend.availableFlags()
    invalid = []
    for flag, value in flags.items():
        if flag in available:
            self._jbackend.setFlag(flag, value)
        else:
            invalid.append(flag)
    if len(invalid) != 0:
        raise FatalError("Flags {} not valid. Valid flags: \n    {}"
                         .format(', '.join(invalid), '\n    '.join(available)))
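# A small sketch of why the annotation above is `str` rather than
# `Mapping[str, str]`: under PEP 484, a `**kwargs` annotation describes each
# value, and the parameter as a whole is then typed Dict[str, str]. The
# function and keys below are illustrative.
from typing import Dict


def set_many(**options: str) -> Dict[str, str]:
    # `options` itself is a Dict[str, str]; each value is annotated as str.
    return dict(options)


settings = set_many(log_level='debug', mode='strict')  # illustrative keys
assert settings['mode'] == 'strict'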
def deco(*args, **kwargs):
    try:
        return f(*args, **kwargs)
    except py4j.protocol.Py4JJavaError as e:
        s = e.java_exception.toString()

        # py4j catches NoSuchElementExceptions to stop array iteration
        if s.startswith('java.util.NoSuchElementException'):
            raise

        tpl = Env.jutils().handleForPython(e.java_exception)
        deepest, full = tpl._1(), tpl._2()
        raise FatalError('%s\n\nJava stack trace:\n%s\n'
                         'Hail version: %s\n'
                         'Error summary: %s' % (deepest, full, hail.__version__, deepest)) from None
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log=None, quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379, _backend=None):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        sys.stderr.write(f'using hail jar at {hail_jar_path}\n')
        conf = SparkConf()
        conf.set('spark.driver.extraClassPath', hail_jar_path)
        conf.set('spark.executor.extraClassPath', hail_jar_path)
        SparkContext._ensure_initialized(conf=conf)
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    self._warn_cols_order = True
    self._warn_entries_order = True

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    if _backend is None:
        _backend = SparkBackend()
    self._backend = _backend

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    version = read_version_info()
    hail.__version__ = version

    if log is None:
        log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                        suffix=f'-{version}.log')
    self._log = log

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway,
                                         jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    self._default_ref = None
    Env.hail().variant.ReferenceGenome.setDefaultReference(self._jhc, default_reference)

    jar_version = self._jhc.version()
    if jar_version != version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {version}")

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

        if version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')
        sys.stderr.write(f'LOGGING: writing to {log}\n')

    install_exception_handler()
    Env.set_seed(global_seed)
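# A stripped-down sketch of the idempotent-initialization guard used above:
# a second construction is a no-op when idempotent=True and an error
# otherwise. The module-level `_instance` and `InitError` are stand-ins for
# Env._hc and FatalError, not Hail's actual names.
class InitError(Exception):
    pass


_instance = None


def init_context(idempotent=False):
    global _instance
    if _instance is not None:
        if idempotent:
            return _instance
        raise InitError('already initialized; stop it to change configuration')
    _instance = object()  # stand-in for the real context
    return _instance


ctx = init_context()
assert init_context(idempotent=True) is ctx  # second call is a no-op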
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log='hail.log', quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", force_ir=False):

    if Env._hc:
        raise FatalError('Hail has already been initialized, restart session '
                         'or stop Hail to change configuration.')

    SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    self._jhc = self._hail.HailContext.apply(
        jsc, app_name, joption(master), local, log, True, append,
        min_block_size, branching_factor, tmp_dir, force_ir)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway,
                                         jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jsql_context = self._jhc.sqlContext()
    self._sql_context = SQLContext(self.sc, jsqlContext=self._jsql_context)
    self._counter = 1

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    self._default_ref = None
    Env.hail().variant.ReferenceGenome.setDefaultReference(self._jhc, default_reference)

    version = self._jhc.version()
    hail.__version__ = version

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(version))

        if version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')

    install_exception_handler()
def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]',
             log=None, quiet=False, append=False,
             min_block_size=1, branching_factor=50, tmp_dir=None,
             default_reference="GRCh37", idempotent=False,
             global_seed=6348563392232659379, spark_conf=None,
             optimizer_iterations=None, _backend=None):

    if Env._hc:
        if idempotent:
            return
        else:
            raise FatalError('Hail has already been initialized, restart session '
                             'or stop Hail to change configuration.')

    if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"):
        hail_jar_path = pkg_resources.resource_filename(__name__, "hail-all-spark.jar")
        assert os.path.exists(hail_jar_path), f'{hail_jar_path} does not exist'
        conf = SparkConf()

        base_conf = spark_conf or {}
        for k, v in base_conf.items():
            conf.set(k, v)

        jars = [hail_jar_path]

        if os.environ.get('HAIL_SPARK_MONITOR'):
            import sparkmonitor
            jars.append(os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar'))
            conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener")

        conf.set('spark.jars', ','.join(jars))
        conf.set('spark.driver.extraClassPath', ','.join(jars))
        conf.set('spark.executor.extraClassPath', './hail-all-spark.jar')
        if sc is None:
            SparkContext._ensure_initialized(conf=conf)
        else:
            import warnings
            warnings.warn(
                'pip-installed Hail requires additional configuration options in Spark referring\n'
                '  to the path to the Hail Python module directory HAIL_DIR,\n'
                '  e.g. /path/to/python/site-packages/hail:\n'
                '    spark.jars=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n'
                '    spark.executor.extraClassPath=./hail-all-spark.jar')
    else:
        SparkContext._ensure_initialized()

    self._gateway = SparkContext._gateway
    self._jvm = SparkContext._jvm

    # hail package
    self._hail = getattr(self._jvm, 'is').hail

    self._warn_cols_order = True
    self._warn_entries_order = True

    Env._jvm = self._jvm
    Env._gateway = self._gateway

    jsc = sc._jsc.sc() if sc else None

    if _backend is None:
        if os.environ.get('HAIL_APISERVER_URL') is not None:
            _backend = ServiceBackend()
        else:
            _backend = SparkBackend()
    self._backend = _backend

    tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp')
    optimizer_iterations = get_env_or_default(optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3)

    py_version = version()
    if log is None:
        log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'),
                                        suffix=f'-{py_version}.log')
    self._log = log

    # we always pass 'quiet' to the JVM because stderr output needs
    # to be routed through Python separately.
    if idempotent:
        self._jhc = self._hail.HailContext.getOrCreate(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)
    else:
        self._jhc = self._hail.HailContext.apply(
            jsc, app_name, joption(master), local, log, True, append,
            min_block_size, branching_factor, tmp_dir, optimizer_iterations)

    self._jsc = self._jhc.sc()
    self.sc = sc if sc else SparkContext(gateway=self._gateway,
                                         jsc=self._jvm.JavaSparkContext(self._jsc))
    self._jspark_session = self._jhc.sparkSession()
    self._spark_session = SparkSession(self.sc, self._jhc.sparkSession())

    super(HailContext, self).__init__()

    # do this at the end in case something errors, so we don't raise the above error without a real HC
    Env._hc = self

    ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True)
    ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True)

    if default_reference in ReferenceGenome._references:
        self._default_ref = ReferenceGenome._references[default_reference]
    else:
        self._default_ref = ReferenceGenome.read(default_reference)

    jar_version = self._jhc.version()
    if jar_version != py_version:
        raise RuntimeError(f"Hail version mismatch between JAR and Python library\n"
                           f"  JAR:    {jar_version}\n"
                           f"  Python: {py_version}")

    if not quiet:
        sys.stderr.write('Running on Apache Spark version {}\n'.format(self.sc.version))
        if self._jsc.uiWebUrl().isDefined():
            sys.stderr.write('SparkUI available at {}\n'.format(self._jsc.uiWebUrl().get()))

        connect_logger('localhost', 12888)

        self._hail.HailContext.startProgressBar(self._jsc)

        sys.stderr.write(
            'Welcome to\n'
            '     __  __     <>__\n'
            '    / /_/ /__  __/ /\n'
            '   / __  / _ `/ / /\n'
            '  /_/ /_/\\_,_/_/_/   version {}\n'.format(py_version))

        if py_version.startswith('devel'):
            sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n'
                             '  during the beta period. We recommend pulling\n'
                             '  the latest changes weekly.\n')
        sys.stderr.write(f'LOGGING: writing to {log}\n')

    install_exception_handler()
    Env.set_seed(global_seed)
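# A hedged sketch of the SparkConf handoff the pip-installed path performs
# above: extra jars and classpaths must be set on a SparkConf before the
# SparkContext exists, because JVM classpath settings cannot be changed on a
# running context. The jar path is illustrative, not a real install location.
from pyspark import SparkConf, SparkContext

jar = '/path/to/site-packages/hail/hail-all-spark.jar'  # illustrative path
conf = (SparkConf()
        .set('spark.jars', jar)                    # ship the jar to executors
        .set('spark.driver.extraClassPath', jar)   # driver JVM classpath
        .set('spark.executor.extraClassPath', './hail-all-spark.jar'))

# The context must be created with this conf; setting these keys after
# SparkContext has started has no effect, hence the warning emitted above
# when a pre-built `sc` is passed in.
sc = SparkContext(conf=conf)  # doctest: +SKIP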
def fromfile(cls, uri, n_rows, n_cols, block_size=None):
    """Creates a block matrix from a binary file.

    Examples
    --------
    >>> import numpy as np
    >>> a = np.random.rand(10, 20)
    >>> a.tofile('/local/file') # doctest: +SKIP

    To create a block matrix of the same dimensions:

    >>> from hail.linalg import BlockMatrix
    >>> bm = BlockMatrix.fromfile('file:///local/file', 10, 20) # doctest: +SKIP

    Notes
    -----
    This method, analogous to `numpy.fromfile
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.fromfile.html>`__,
    reads a binary file of float64 values in row-major order, such as that
    produced by `numpy.tofile
    <https://docs.scipy.org/doc/numpy/reference/generated/numpy.ndarray.tofile.html>`__
    or :meth:`BlockMatrix.tofile`.

    Binary files produced and consumed by :meth:`.tofile` and
    :meth:`.fromfile` are not platform independent, so should only be used
    for inter-operating with NumPy, not storage. Use
    :meth:`BlockMatrix.write` and :meth:`BlockMatrix.read` to save and load
    block matrices, since these methods write and read blocks in parallel
    and are platform independent.

    A NumPy ndarray must have type float64 for the output of
    :func:`numpy.tofile` to be a valid binary input to :meth:`.fromfile`.
    This is not checked.

    The number of entries must be less than :math:`2^{31}`.

    Parameters
    ----------
    uri: :obj:`str`
        URI of binary input file.
    n_rows: :obj:`int`
        Number of rows.
    n_cols: :obj:`int`
        Number of columns.
    block_size: :obj:`int`, optional
        Block size. Default given by :meth:`default_block_size`.

    See Also
    --------
    :meth:`.from_numpy`
    """
    if not block_size:
        block_size = BlockMatrix.default_block_size()

    n_entries = n_rows * n_cols
    if n_entries >= 1 << 31:  # 1 << 31 == 2^31; '2 << 31' would allow up to 2^32
        raise FatalError('Number of entries must be less than 2^31, found {}'.format(n_entries))

    hc = Env.hc()
    bdm = Env.hail().utils.richUtils.RichDenseMatrixDouble.importFromDoubles(hc._jhc, uri, n_rows, n_cols, True)
    return cls(Env.hail().linalg.BlockMatrix.fromBreezeMatrix(hc._jsc, bdm, block_size))
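# A NumPy-only sketch of the float64 caveat called out above: the raw file
# carries no dtype, so bytes written as float32 are silently misread when
# consumed as float64. Paths and shapes are illustrative.
import os
import tempfile
import numpy as np

path = os.path.join(tempfile.mkdtemp(), 'f32.bin')
a32 = np.ones((4, 2), dtype=np.float32)
a32.tofile(path)                 # 8 float32 values = 32 bytes on disk

wrong = np.fromfile(path)        # read as float64: half as many values, all wrong
assert wrong.size == 4 and not np.allclose(wrong, 1.0)

ok = np.fromfile(path, dtype=np.float32).reshape(4, 2)  # correct dtype recovers it
assert np.array_equal(ok, a32)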