@typecheck(sc=nullable(SparkContext), app_name=str, master=nullable(str), local=str, log=nullable(str), quiet=bool, append=bool, min_block_size=int, branching_factor=int, tmp_dir=nullable(str), default_reference=enumeration(*BUILTIN_REFERENCES), idempotent=bool, global_seed=nullable(int), spark_conf=nullable(dictof(str, str)), skip_logging_configuration=bool, local_tmpdir=nullable(str), _optimizer_iterations=nullable(int)) def init(sc=None, app_name='Hail', master=None, local='local[*]', log=None, quiet=False, append=False, min_block_size=0, branching_factor=50, tmp_dir=None, default_reference='GRCh37', idempotent=False,
class HailContext(object): @typecheck_method(sc=nullable(SparkContext), app_name=str, master=nullable(str), local=str, log=nullable(str), quiet=bool, append=bool, min_block_size=int, branching_factor=int, tmp_dir=nullable(str), default_reference=str, idempotent=bool, global_seed=nullable(int), spark_conf=nullable(dictof(str, str)), optimizer_iterations=nullable(int), _backend=nullable(Backend)) def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]', log=None, quiet=False, append=False, min_block_size=1, branching_factor=50, tmp_dir=None, default_reference="GRCh37", idempotent=False, global_seed=6348563392232659379, spark_conf=None, optimizer_iterations=None, _backend=None): if Env._hc: if idempotent: return else: raise FatalError( 'Hail has already been initialized, restart session ' 'or stop Hail to change configuration.') if pkg_resources.resource_exists(__name__, "hail-all-spark.jar"): hail_jar_path = pkg_resources.resource_filename( __name__, "hail-all-spark.jar") assert os.path.exists( hail_jar_path), f'{hail_jar_path} does not exist' conf = SparkConf() base_conf = spark_conf or {} for k, v in base_conf.items(): conf.set(k, v) jars = [hail_jar_path] if os.environ.get('HAIL_SPARK_MONITOR'): import sparkmonitor jars.append( os.path.join(os.path.dirname(sparkmonitor.__file__), 'listener.jar')) conf.set("spark.extraListeners", "sparkmonitor.listener.JupyterSparkMonitorListener") conf.set('spark.jars', ','.join(jars)) conf.set('spark.driver.extraClassPath', ','.join(jars)) conf.set('spark.executor.extraClassPath', './hail-all-spark.jar') if sc is None: SparkContext._ensure_initialized(conf=conf) else: import warnings warnings.warn( 'pip-installed Hail requires additional configuration options in Spark referring\n' ' to the path to the Hail Python module directory HAIL_DIR,\n' ' e.g. /path/to/python/site-packages/hail:\n' ' spark.jars=HAIL_DIR/hail-all-spark.jar\n' ' spark.driver.extraClassPath=HAIL_DIR/hail-all-spark.jar\n' ' spark.executor.extraClassPath=./hail-all-spark.jar') else: SparkContext._ensure_initialized() self._gateway = SparkContext._gateway self._jvm = SparkContext._jvm # hail package self._hail = getattr(self._jvm, 'is').hail self._warn_cols_order = True self._warn_entries_order = True Env._jvm = self._jvm Env._gateway = self._gateway jsc = sc._jsc.sc() if sc else None if _backend is None: if os.environ.get('HAIL_APISERVER_URL') is not None: _backend = ServiceBackend() else: _backend = SparkBackend() self._backend = _backend tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp') optimizer_iterations = get_env_or_default(optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3) py_version = version() if log is None: log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'), suffix=f'-{py_version}.log') self._log = log # we always pass 'quiet' to the JVM because stderr output needs # to be routed through Python separately. # if idempotent: if idempotent: self._jhc = self._hail.HailContext.getOrCreate( jsc, app_name, joption(master), local, log, True, append, min_block_size, branching_factor, tmp_dir, optimizer_iterations) else: self._jhc = self._hail.HailContext.apply(jsc, app_name, joption(master), local, log, True, append, min_block_size, branching_factor, tmp_dir, optimizer_iterations) self._jsc = self._jhc.sc() self.sc = sc if sc else SparkContext( gateway=self._gateway, jsc=self._jvm.JavaSparkContext(self._jsc)) self._jspark_session = self._jhc.sparkSession() self._spark_session = SparkSession(self.sc, self._jhc.sparkSession()) super(HailContext, self).__init__() # do this at the end in case something errors, so we don't raise the above error without a real HC Env._hc = self ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True) ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True) ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True) if default_reference in ReferenceGenome._references: self._default_ref = ReferenceGenome._references[default_reference] else: self._default_ref = ReferenceGenome.read(default_reference) jar_version = self._jhc.version() if jar_version != py_version: raise RuntimeError( f"Hail version mismatch between JAR and Python library\n" f" JAR: {jar_version}\n" f" Python: {py_version}") if not quiet: sys.stderr.write('Running on Apache Spark version {}\n'.format( self.sc.version)) if self._jsc.uiWebUrl().isDefined(): sys.stderr.write('SparkUI available at {}\n'.format( self._jsc.uiWebUrl().get())) connect_logger('localhost', 12888) self._hail.HailContext.startProgressBar(self._jsc) sys.stderr.write( 'Welcome to\n' ' __ __ <>__\n' ' / /_/ /__ __/ /\n' ' / __ / _ `/ / /\n' ' /_/ /_/\\_,_/_/_/ version {}\n'.format(py_version)) if py_version.startswith('devel'): sys.stderr.write( 'NOTE: This is a beta version. Interfaces may change\n' ' during the beta period. We recommend pulling\n' ' the latest changes weekly.\n') sys.stderr.write(f'LOGGING: writing to {log}\n') install_exception_handler() Env.set_seed(global_seed) @property def default_reference(self): return self._default_ref def stop(self): Env.hail().HailContext.clear() self.sc.stop() self.sc = None Env._jvm = None Env._gateway = None Env._hc = None uninstall_exception_handler() Env._dummy_table = None Env._seed_generator = None hail.ir.clear_session_functions() ReferenceGenome._references = {}
class ReferenceGenome(object): """An object that represents a `reference genome <https://en.wikipedia.org/wiki/Reference_genome>`__. Examples -------- >>> contigs = ["1", "X", "Y", "MT"] >>> lengths = {"1": 249250621, "X": 155270560, "Y": 59373566, "MT": 16569} >>> par = [("X", 60001, 2699521)] >>> my_ref = hl.ReferenceGenome("my_ref", contigs, lengths, "X", "Y", "MT", par) Notes ----- Hail comes with predefined reference genomes (case sensitive!): - GRCh37, Genome Reference Consortium Human Build 37 - GRCh38, Genome Reference Consortium Human Build 38 - GRCm38, Genome Reference Consortium Mouse Build 38 - CanFam3, Canis lupus familiaris (dog) You can access these reference genome objects using :func:`~hail.get_reference`: >>> rg = hl.get_reference('GRCh37') >>> rg = hl.get_reference('GRCh38') >>> rg = hl.get_reference('GRCm38') >>> rg = hl.get_reference('CanFam3') Note that constructing a new reference genome, either by using the class constructor or by using `read` will add the reference genome to the list of known references; it is possible to access the reference genome using :func:`~hail.get_reference` anytime afterwards. Note ---- Reference genome names must be unique. It is not possible to overwrite the built-in reference genomes. Parameters ---------- name : :class:`str` Name of reference. Must be unique and NOT one of Hail's predefined references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'`` and ``'default'``. contigs : :obj:`list` of :class:`str` Contig names. lengths : :obj:`dict` of :class:`str` to :obj:`int` Dict of contig names to contig lengths. x_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as X chromosomes. y_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as Y chromosomes. mt_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as mitochondrial DNA. par : :obj:`list` of :obj:`tuple` of (str, int, int) List of tuples with (contig, start, end) """ _references = {} @classmethod def _from_config(cls, config, _builtin=False): def par_tuple(p): assert p['start']['contig'] == p['end']['contig'] return (p['start']['contig'], p['start']['position'], p['end']['position']) contigs = config['contigs'] return ReferenceGenome(config['name'], [c['name'] for c in contigs], {c['name']: c['length'] for c in contigs}, config['xContigs'], config['yContigs'], config['mtContigs'], [par_tuple(p) for p in config['par']], _builtin) @typecheck_method(name=str, contigs=sequenceof(str), lengths=dictof(str, int), x_contigs=oneof(str, sequenceof(str)), y_contigs=oneof(str, sequenceof(str)), mt_contigs=oneof(str, sequenceof(str)), par=sequenceof(sized_tupleof(str, int, int)), _builtin=bool) def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False): super(ReferenceGenome, self).__init__() contigs = wrap_to_list(contigs) x_contigs = wrap_to_list(x_contigs) y_contigs = wrap_to_list(y_contigs) mt_contigs = wrap_to_list(mt_contigs) self._config = { 'name': name, 'contigs': [{ 'name': c, 'length': l } for c, l in lengths.items()], 'xContigs': x_contigs, 'yContigs': y_contigs, 'mtContigs': mt_contigs, 'par': [{ 'start': { 'contig': c, 'position': s }, 'end': { 'contig': c, 'position': e } } for (c, s, e) in par] } self._contigs = contigs self._lengths = lengths self._par_tuple = par self._par = [ hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self)) for (c, s, e) in par ] self._global_positions = None ReferenceGenome._references[name] = self if not _builtin: Env.backend().add_reference(self._config) self._sequence_files = None self._liftovers = dict() def __str__(self): return self._config['name'] def __repr__(self): return 'ReferenceGenome(name=%s, contigs=%s, lengths=%s, x_contigs=%s, y_contigs=%s, mt_contigs=%s, par=%s)' % \ (self.name, self.contigs, self.lengths, self.x_contigs, self.y_contigs, self.mt_contigs, self._par_tuple) def __eq__(self, other): return isinstance(other, ReferenceGenome) and self._config == other._config def __hash__(self): return hash(self.name) @property def name(self): """Name of reference genome. Returns ------- :class:`str` """ return self._config['name'] @property def contigs(self): """Contig names. Returns ------- :obj:`list` of :class:`str` """ return self._contigs @property def lengths(self): """Dict of contig name to contig length. Returns ------- :obj:`dict` of :class:`str` to :obj:`int` """ return self._lengths @property def x_contigs(self): """X contigs. Returns ------- :obj:`list` of :class:`str` """ return self._config['xContigs'] @property def y_contigs(self): """Y contigs. Returns ------- :obj:`list` of :class:`str` """ return self._config['yContigs'] @property def mt_contigs(self): """Mitochondrial contigs. Returns ------- :obj:`list` of :class:`str` """ return self._config['mtContigs'] @property def par(self): """Pseudoautosomal regions. Returns ------- :obj:`list` of :class:`.Interval` """ return self._par @typecheck_method(contig=str) def contig_length(self, contig): """Contig length. Parameters ---------- contig : :class:`str` Contig name. Returns ------- :obj:`int` Length of contig. """ if contig in self.lengths: return self.lengths[contig] else: raise KeyError( "Contig `{}' is not in reference genome.".format(contig)) @typecheck_method(contig=str) def _contig_global_position(self, contig): if self._global_positions is None: gp = {} lengths = self._lengths x = 0 for c in self.contigs: gp[c] = x x += lengths[c] self._global_positions = gp return self._global_positions[contig] @classmethod @typecheck_method(path=str) def read(cls, path): """Load reference genome from a JSON file. Notes ----- The JSON file must have the following format: .. code-block:: text {"name": "my_reference_genome", "contigs": [{"name": "1", "length": 10000000}, {"name": "2", "length": 20000000}, {"name": "X", "length": 19856300}, {"name": "Y", "length": 78140000}, {"name": "MT", "length": 532}], "xContigs": ["X"], "yContigs": ["Y"], "mtContigs": ["MT"], "par": [{"start": {"contig": "X","position": 60001},"end": {"contig": "X","position": 2699521}}, {"start": {"contig": "Y","position": 10001},"end": {"contig": "Y","position": 2649521}}] } `name` must be unique and not overlap with Hail's pre-instantiated references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, ``'CanFam3'``, and ``'default'``. The contig names in `xContigs`, `yContigs`, and `mtContigs` must be present in `contigs`. The intervals listed in `par` must have contigs in either `xContigs` or `yContigs` and must have positions between 0 and the contig length given in `contigs`. Parameters ---------- path : :class:`str` Path to JSON file. Returns ------- :class:`.ReferenceGenome` """ with hl.hadoop_open(path) as f: return ReferenceGenome._from_config(json.load(f)) @typecheck_method(output=str) def write(self, output): """"Write this reference genome to a file in JSON format. Examples -------- >>> my_rg = hl.ReferenceGenome("new_reference", ["x", "y", "z"], {"x": 500, "y": 300, "z": 200}) >>> my_rg.write(f"output/new_reference.json") Notes ----- Use :meth:`~hail.genetics.ReferenceGenome.read` to reimport the exported reference genome in a new HailContext session. Parameters ---------- output : :class:`str` Path of JSON file to write. """ with hl.utils.hadoop_open(output, 'w') as f: json.dump(self._config, f) @typecheck_method(fasta_file=str, index_file=nullable(str)) def add_sequence(self, fasta_file, index_file=None): """Load the reference sequence from a FASTA file. Examples -------- Access the GRCh37 reference genome using :func:`~hail.get_reference`: >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP Add a sequence file: >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz', ... 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP Add a sequence file with the default index location: >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP Notes ----- This method can only be run once per reference genome. Use :meth:`~has_sequence` to test whether a sequence is loaded. FASTA and index files are hosted on google cloud for some of Hail's built-in references: **GRCh37** - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz`` - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai`` **GRCh38** - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz`` - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai`` Public download links are available `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__. Parameters ---------- fasta_file : :class:`str` Path to FASTA file. Can be compressed (GZIP) or uncompressed. index_file : :obj:`None` or :class:`str` Path to FASTA index file. Must be uncompressed. If `None`, replace the fasta_file's extension with `fai`. """ if index_file is None: index_file = re.sub(r'\.[^.]*$', '.fai', fasta_file) Env.backend().add_sequence(self.name, fasta_file, index_file) self._sequence_files = (fasta_file, index_file) def has_sequence(self): """True if the reference sequence has been loaded. Returns ------- :obj:`bool` """ return self._sequence_files is not None def remove_sequence(self): """Remove the reference sequence.""" self._sequence_files = None Env.backend().remove_sequence(self.name) @classmethod @typecheck_method(name=str, fasta_file=str, index_file=str, x_contigs=oneof(str, sequenceof(str)), y_contigs=oneof(str, sequenceof(str)), mt_contigs=oneof(str, sequenceof(str)), par=sequenceof(sized_tupleof(str, int, int))) def from_fasta_file(cls, name, fasta_file, index_file, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]): """Create reference genome from a FASTA file. Parameters ---------- name: :class:`str` Name for new reference genome. fasta_file : :class:`str` Path to FASTA file. Can be compressed (GZIP) or uncompressed. index_file : :class:`str` Path to FASTA index file. Must be uncompressed. x_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as X chromosomes. y_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as Y chromosomes. mt_contigs : :class:`str` or :obj:`list` of :obj:`str` Contigs to be treated as mitochondrial DNA. par : :obj:`list` of :obj:`tuple` of (str, int, int) List of tuples with (contig, start, end) Returns ------- :class:`.ReferenceGenome` """ par_strings = [ "{}:{}-{}".format(contig, start, end) for (contig, start, end) in par ] Env.backend().from_fasta_file(name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par_strings) rg = ReferenceGenome._from_config(Env.backend().get_reference(name), _builtin=True) rg._sequence_files = (fasta_file, index_file) return rg @typecheck_method(dest_reference_genome=reference_genome_type) def has_liftover(self, dest_reference_genome): """``True`` if a liftover chain file is available from this reference genome to the destination reference. Parameters ---------- dest_reference_genome : :class:`str` or :class:`.ReferenceGenome` Returns ------- :obj:`bool` """ return dest_reference_genome.name in self._liftovers @typecheck_method(dest_reference_genome=reference_genome_type) def remove_liftover(self, dest_reference_genome): """Remove liftover to `dest_reference_genome`. Parameters ---------- dest_reference_genome : :class:`str` or :class:`.ReferenceGenome` """ if dest_reference_genome.name in self._liftovers: del self._liftovers[dest_reference_genome.name] Env.backend().remove_liftover(self.name, dest_reference_genome.name) @typecheck_method(chain_file=str, dest_reference_genome=reference_genome_type) def add_liftover(self, chain_file, dest_reference_genome): """Register a chain file for liftover. Examples -------- Access GRCh37 and GRCh38 using :func:`~hail.get_reference`: >>> rg37 = hl.get_reference('GRCh37') # doctest: +SKIP >>> rg38 = hl.get_reference('GRCh38') # doctest: +SKIP Add a chain file from 37 to 38: >>> rg37.add_liftover('gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38) # doctest: +SKIP Notes ----- This method can only be run once per reference genome. Use :meth:`~has_liftover` to test whether a chain file has been registered. The chain file format is described `here <https://genome.ucsc.edu/goldenpath/help/chain.html>`__. Chain files are hosted on google cloud for some of Hail's built-in references: **GRCh37 to GRCh38** gs://hail-common/references/grch37_to_grch38.over.chain.gz **GRCh38 to GRCh37** gs://hail-common/references/grch38_to_grch37.over.chain.gz Public download links are available `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__. Parameters ---------- chain_file : :class:`str` Path to chain file. Can be compressed (GZIP) or uncompressed. dest_reference_genome : :class:`str` or :class:`.ReferenceGenome` Reference genome to convert to. """ Env.backend().add_liftover(self.name, chain_file, dest_reference_genome.name) if dest_reference_genome.name in self._liftovers: raise KeyError( f"Liftover already exists from {self.name} to {dest_reference_genome.name}." ) self._liftovers[dest_reference_genome.name] = chain_file
class HailContext(object): @typecheck_method(sc=nullable(SparkContext), app_name=str, master=nullable(str), local=str, log=nullable(str), quiet=bool, append=bool, min_block_size=int, branching_factor=int, tmp_dir=nullable(str), default_reference=str, idempotent=bool, global_seed=nullable(int), spark_conf=nullable(dictof(str, str)), optimizer_iterations=nullable(int), _backend=nullable(Backend)) def __init__(self, sc=None, app_name="Hail", master=None, local='local[*]', log=None, quiet=False, append=False, min_block_size=1, branching_factor=50, tmp_dir=None, default_reference="GRCh37", idempotent=False, global_seed=6348563392232659379, spark_conf=None, optimizer_iterations=None, _backend=None): if Env._hc: if idempotent: return else: warn('Hail has already been initialized. If this call was intended to change configuration,' ' close the session with hl.stop() first.') py_version = version() if log is None: log = hail.utils.timestamp_path(os.path.join(os.getcwd(), 'hail'), suffix=f'-{py_version}.log') self._log = log tmp_dir = get_env_or_default(tmp_dir, 'TMPDIR', '/tmp') self.tmp_dir = tmp_dir optimizer_iterations = get_env_or_default(optimizer_iterations, 'HAIL_OPTIMIZER_ITERATIONS', 3) if _backend is None: if os.environ.get('HAIL_APISERVER_URL') is not None: _backend = ServiceBackend() else: _backend = SparkBackend( idempotent, sc, spark_conf, app_name, master, local, log, quiet, append, min_block_size, branching_factor, tmp_dir, optimizer_iterations) self._backend = _backend self._warn_cols_order = True self._warn_entries_order = True super(HailContext, self).__init__() # do this at the end in case something errors, so we don't raise the above error without a real HC Env._hc = self ReferenceGenome._from_config(_backend.get_reference('GRCh37'), True) ReferenceGenome._from_config(_backend.get_reference('GRCh38'), True) ReferenceGenome._from_config(_backend.get_reference('GRCm38'), True) ReferenceGenome._from_config(_backend.get_reference('CanFam3'), True) if default_reference in ReferenceGenome._references: self._default_ref = ReferenceGenome._references[default_reference] else: self._default_ref = ReferenceGenome.read(default_reference) if not quiet: sys.stderr.write( 'Welcome to\n' ' __ __ <>__\n' ' / /_/ /__ __/ /\n' ' / __ / _ `/ / /\n' ' /_/ /_/\\_,_/_/_/ version {}\n'.format(py_version)) if py_version.startswith('devel'): sys.stderr.write('NOTE: This is a beta version. Interfaces may change\n' ' during the beta period. We recommend pulling\n' ' the latest changes weekly.\n') sys.stderr.write(f'LOGGING: writing to {log}\n') install_exception_handler() Env.set_seed(global_seed) @property def default_reference(self): return self._default_ref def stop(self): self._backend.stop() self._backend = None Env._jvm = None Env._hc = None uninstall_exception_handler() Env._dummy_table = None Env._seed_generator = None hail.ir.clear_session_functions() ReferenceGenome._references = {}