def hadoop_copy(src, dest):
    """Copy a file through the Hadoop filesystem API.

    Supports distributed file systems like hdfs, gs, and s3.

    Examples
    --------
    Copy a file from Google Cloud Storage to a local file:

    >>> hadoop_copy('gs://hail-common/LCR.interval_list',
    ...             'file:///mnt/data/LCR.interval_list') # doctest: +SKIP

    Notes
    -----
    Try using :func:`.hadoop_open` first, it's simpler, but not great
    for large data! For example:

    >>> with hadoop_open('gs://my_bucket/results.csv', 'w') as f: #doctest: +SKIP
    ...     pandas_df.to_csv(f)

    The provided source and destination file paths must be URIs
    (uniform resource identifiers).

    Parameters
    ----------
    src: :obj:`str`
        Source file URI.
    dest: :obj:`str`
        Destination file URI.
    """
    # The copy itself is delegated to the JVM-side utilities, which need the
    # Java HailContext handle.
    java_utils = Env.jutils()
    java_context = Env.hc()._jhc
    java_utils.copyFile(src, dest, java_context)
def copy_log(path: str) -> None:
    """Attempt to copy the session log to a hadoop-API-compatible location.

    Examples
    --------
    Specify a manual path:

    >>> hl.copy_log('gs://my-bucket/analysis-10-jan19.log')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/analysis-10-jan19.log'...

    Copy to a directory:

    >>> hl.copy_log('gs://my-bucket/')  # doctest: +SKIP
    INFO: copying log to 'gs://my-bucket/hail-20180924-2018-devel-46e5fad57524.log'...

    Notes
    -----
    Since Hail cannot currently log directly to distributed file systems, this
    function is provided as a utility for offloading logs from ephemeral nodes.

    If `path` is a directory, then the log file will be copied using its
    base name to the directory (e.g. ``/home/hail.log`` would be copied as
    ``gs://my-bucket/hail.log`` if `path` is ``gs://my-bucket``).

    Parameters
    ----------
    path: :obj:`str`
    """
    # All of the work happens in the active file system object.
    file_system = Env.fs()
    file_system.copy_log(path)
def hadoop_ls(path: str) -> List[Dict]:
    """Returns information about files at `path`.

    Notes
    -----
    Raises an error if `path` does not exist.

    If `path` is a file, returns a list with one element. If `path` is a
    directory, returns an element for each file contained in `path` (does not
    search recursively).

    Each dict element of the result list contains the following data:

    - is_dir (:obj:`bool`) -- Path is a directory.
    - size_bytes (:obj:`int`) -- Size in bytes.
    - size (:obj:`str`) -- Size as a readable string.
    - modification_time (:obj:`str`) -- Time of last file modification.
    - owner (:obj:`str`) -- Owner.
    - path (:obj:`str`) -- Path.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`List[Dict]`
    """
    # The JVM side hands the listing back as a JSON document; decode it here.
    return json.loads(Env.jutils().ls(path, Env.hc()._jhc))
def _convert_to_j(self, annotation):
    """Convert a Python annotation of this type into its Java representation.

    Returns ``None`` when `annotation` is ``None``; otherwise each field is
    converted by its own type (in the order given by ``self.items()``) and the
    results are passed to the Scala ``Annotation.fromSeq``.
    """
    # A missing value stays missing on the Java side.
    if annotation is None:
        return None

    annotation_companion = scala_object(Env.hail().annotations, 'Annotation')
    java_utils = Env.jutils()

    converted_fields = []
    for field_name, field_type in self.items():
        field_value = annotation.get(field_name)
        converted_fields.append(field_type._convert_to_j(field_value))

    return annotation_companion.fromSeq(java_utils.arrayListToISeq(converted_fields))
def get_reference(name) -> 'hail.ReferenceGenome':
    """Returns the reference genome corresponding to `name`.

    Notes
    -----

    Hail's built-in references are ``'GRCh37'``, ``'GRCh38'``, and ``'GRCm38'``.
    The contig names and lengths come from the GATK resource bundle:
    `human_g1k_v37.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/b37/human_g1k_v37.dict>`__
    and `Homo_sapiens_assembly38.dict
    <ftp://gsapubftp-anonymous@ftp.broadinstitute.org/bundle/hg38/Homo_sapiens_assembly38.dict>`__.

    If ``name='default'``, the value of :func:`.default_reference` is returned.

    Parameters
    ----------
    name : :obj:`str`
        Name of a previously loaded reference genome or one of Hail's built-in
        references: ``'GRCh37'``, ``'GRCh38'``, ``'GRCm38'``, and ``'default'``.

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    # The context is requested purely for its side effect and the result is
    # discarded -- presumably this makes sure Hail is initialized (and the
    # built-in references registered) before the lookup; confirm against Env.
    Env.hc()

    if name != 'default':
        return ReferenceGenome._references[name]
    return default_reference()
def hadoop_stat(path: str) -> Dict:
    """Returns information about the file or directory at a given path.

    Notes
    -----
    Raises an error if `path` does not exist.

    The resulting dictionary contains the following data:

    - is_dir (:obj:`bool`) -- Path is a directory.
    - size_bytes (:obj:`int`) -- Size in bytes.
    - size (:obj:`str`) -- Size as a readable string.
    - modification_time (:obj:`str`) -- Time of last file modification.
    - owner (:obj:`str`) -- Owner.
    - path (:obj:`str`) -- Path.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`Dict`
    """
    # The JVM side reports the file status as a JSON document.
    status_json = Env.jutils().stat(path, Env.hc()._jhc)
    return json.loads(status_json)
def __del__(self):
    """Best-effort removal of the JVM-side IR vector registered under ``self.jid``."""
    try:
        java_context = Env.hc()._jhc
        java_context.pyRemoveIrVector(self.jid)
    except Exception:
        # There is only so much we can do if the attempt to remove the unused
        # IR fails, especially since this will often get called during
        # interpreter shutdown -- so the failure is deliberately ignored.
        pass
def disable_pipeline_upload():
    """Disable the uploading of pipelines.

    By default, pipeline upload is disabled.
    """
    java_context = Env.hc()._jhc
    java_context.disablePipelineUpload()
def __init__(self, name, contigs, lengths, x_contigs=[], y_contigs=[], mt_contigs=[], par=[], _builtin=False):
    """Build a reference genome and register it under `name`.

    Parameters
    ----------
    name : :obj:`str`
        Name of the reference genome; used as the key in
        ``ReferenceGenome._references``.
    contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contig names.
    lengths : :obj:`dict`
        Mapping from contig name to contig length.
    x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end).
    _builtin : :obj:`bool`
        If ``True``, the configuration is not sent to the backend and no
        reference genome functions are registered.
    """
    super(ReferenceGenome, self).__init__()

    contigs = wrap_to_list(contigs)
    # Copy every list that ends up stored on the instance. The ``[]`` defaults
    # in the signature are single objects shared by all calls, so storing them
    # (or a caller's list) directly would let a later mutation of one
    # reference genome's state leak into others.
    x_contigs = list(wrap_to_list(x_contigs))
    y_contigs = list(wrap_to_list(y_contigs))
    mt_contigs = list(wrap_to_list(mt_contigs))
    par = list(par)

    # NOTE(review): the contig entries follow the iteration order of `lengths`,
    # not the order of `contigs` -- confirm that this is intended.
    self._config = {
        'name': name,
        'contigs': [{'name': contig, 'length': length} for contig, length in lengths.items()],
        'xContigs': x_contigs,
        'yContigs': y_contigs,
        'mtContigs': mt_contigs,
        'par': [{'start': {'contig': c, 'position': s},
                 'end': {'contig': c, 'position': e}}
                for (c, s, e) in par],
    }

    self._contigs = contigs
    self._lengths = lengths
    self._par_tuple = par
    self._par = [hl.Interval(hl.Locus(c, s, self), hl.Locus(c, e, self))
                 for (c, s, e) in par]

    ReferenceGenome._references[name] = self

    if not _builtin:
        Env.backend().add_reference(self._config)
        hl.ir.register_reference_genome_functions(name)

    self._has_sequence = False
    self._liftovers = set()
def from_fasta_file(cls, name, fasta_file, index_file, x_contigs=[], y_contigs=[], mt_contigs=[], par=[]):
    """Create reference genome from a FASTA file.

    Parameters
    ----------
    name: :obj:`str`
        Name for new reference genome.
    fasta_file : :obj:`str`
        Path to FASTA file. Can be compressed (GZIP) or uncompressed.
    index_file : :obj:`str`
        Path to FASTA index file. Must be uncompressed.
    x_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as X chromosomes.
    y_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as Y chromosomes.
    mt_contigs : :obj:`str` or :obj:`list` of :obj:`str`
        Contigs to be treated as mitochondrial DNA.
    par : :obj:`list` of :obj:`tuple` of (str, int, int)
        List of tuples with (contig, start, end)

    Returns
    -------
    :class:`.ReferenceGenome`
    """
    # The backend receives each PAR interval as a "contig:start-end" string.
    par_strings = []
    for contig, start, end in par:
        par_strings.append(f"{contig}:{start}-{end}")

    Env.backend().from_fasta_file(
        name, fasta_file, index_file, x_contigs, y_contigs, mt_contigs, par_strings)

    # Build the Python-side object from the configuration the backend now
    # holds; `_builtin=True` is passed so the constructor does not send the
    # reference to the backend a second time.
    backend_config = Env.backend().get_reference(name)
    reference = ReferenceGenome._from_config(backend_config, _builtin=True)
    reference._has_sequence = True
    return reference
def eval_timed(expression):
    """Evaluate a Hail expression, returning the result and the times taken for
    each stage in the evaluation process.

    Parameters
    ----------
    expression : :class:`.Expression`
        Any expression, or a Python value that can be implicitly interpreted as an expression.

    Returns
    -------
    (Any, dict)
        Result of evaluating `expression` and a dictionary of the timings
    """
    from hail.utils.java import Env

    source = expression._indices.source
    analyze('eval_timed', expression, Indices(source))

    if source is not None:
        # The expression refers to a table's globals: attach it as a
        # temporary global field and evaluate the IR that reads it back.
        uid = Env.get_uid()
        ir = source.select_globals(**{uid: expression}).index_globals()[uid]._ir
    else:
        # Stand-alone expression: check that the Python-side type agrees with
        # the IR type before executing.
        ir_type = expression._ir.typ
        expression_type = expression.dtype
        if ir_type != expression_type:
            raise ExpressionException(f'Expression type and IR type differed: \n{ir_type}\n vs \n{expression_type}')
        ir = expression._ir

    # The second argument is presumably the "timed" switch of the backend --
    # confirm against the backend's `execute` signature.
    return Env.backend().execute(ir, True)
def test_seeding_is_consistent(self):
    """Resetting the global seed must replay the same sequence of seeds."""
    n_draws = 10

    hl.set_global_seed(0)
    first_run = [Env.next_seed() for _ in range(n_draws)]

    hl.set_global_seed(0)
    second_run = [Env.next_seed() for _ in range(n_draws)]

    # Seeds within one run are all distinct ...
    self.assertEqual(len(set(first_run)), 10)
    # ... and the two runs are identical.
    self.assertEqual(first_run, second_run)
def blocking_execute(code):
    """Parse `code` as a value IR, execute it on the Spark backend, and return
    a dict holding the result type (as a string) and the result.
    """
    parsed_ir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    result_type = hl.dtype(parsed_ir.typ().toString())
    execution_result = Env.hail().backend.spark.SparkBackend.executeJSON(parsed_ir)

    response = {}
    response['type'] = str(result_type)
    response['result'] = execution_result
    return response
def remove_sequence(self):
    """Remove the reference sequence.

    The backend is asked to drop the sequence first; the local
    ``_has_sequence`` flag is cleared only once that call has succeeded, so a
    failed removal does not leave this object claiming the sequence is gone.

    Returns
    -------
    ``None``
    """
    Env.backend().remove_sequence(self.name)
    self._has_sequence = False
def remove_liftover(self, dest_reference_genome):
    """Remove liftover to `dest_reference_genome`.

    Does nothing if no liftover to that reference genome is registered on
    this object.

    Parameters
    ----------
    dest_reference_genome : :obj:`str` or :class:`.ReferenceGenome`
    """
    # Accept either a reference genome object or its name, as documented.
    if isinstance(dest_reference_genome, str):
        dest_name = dest_reference_genome
    else:
        dest_name = dest_reference_genome.name

    if dest_name in self._liftovers:
        # Talk to the backend first so that a failed removal leaves the local
        # bookkeeping untouched.
        Env.backend().remove_liftover(self.name, dest_name)
        self._liftovers.remove(dest_name)
def copy_log(self, path: str) -> None:
    """Copy the session log to `path` using this file system.

    If `path` is a directory, the log keeps its base name and is placed inside
    that directory. Failures while copying are reported on standard error
    rather than raised, so that a failed log copy never aborts the caller.

    Parameters
    ----------
    path : :obj:`str`
        Destination file or directory.
    """
    log = Env.hc()._log
    try:
        if self.is_dir(path):
            path = os.path.join(path, os.path.basename(log))
        info(f"copying log to {repr(path)}...")
        self.copy(local_path_uri(log), path)
    except Exception as e:
        # Deliberately best-effort: report the problem and carry on. The
        # trailing newline keeps later stderr output on its own line.
        sys.stderr.write(f'Could not copy log: encountered error:\n {e}\n')
def upload_log():
    """Uploads the Hail log to the Hail team.

    Warning
    -------
    Shares potentially sensitive data with the Hail team.
    """
    java_context = Env.hc()._jhc
    java_context.uploadLog()
def stop(self):
    """Shut this context down and reset the process-wide ``Env`` state."""
    # Clear the JVM-side context while the JVM handles are still available.
    Env.hail().HailContext.clear()

    self.sc.stop()
    self.sc = None

    # Drop the cached JVM / gateway / context handles.
    Env._jvm = Env._gateway = Env._hc = None
    uninstall_exception_handler()

    # Drop the remaining cached session objects.
    Env._dummy_table = Env._seed_generator = None
def set_global_seed(seed):
    """Sets Hail's global seed to `seed`.

    Parameters
    ----------
    seed : :obj:`int`
        Integer used to seed Hail's random number generator
    """
    # The seed state lives on the shared environment object.
    Env.set_seed(seed)
def _set_flags(**flags):
    """Set Hail feature flags on the running context.

    Every flag that the context reports as available is set to the given
    value. Unknown flags are collected and reported together; note that the
    valid flags in the same call have already been applied by then.

    Parameters
    ----------
    **flags
        Flag names mapped to the values to set.

    Raises
    ------
    FatalError
        If one or more of the given flag names is not available.
    """
    # One lookup of the JVM flags object instead of one per flag.
    java_flags = Env.hc()._jhc.flags()
    available = set(java_flags.available())

    invalid = []
    for flag, value in flags.items():
        if flag in available:
            java_flags.set(flag, value)
        else:
            invalid.append(flag)

    if invalid:
        # Sort the valid flags so the message is the same on every run.
        raise FatalError("Flags {} not valid. Valid flags: \n {}"
                         .format(', '.join(invalid), '\n '.join(sorted(available))))
def hadoop_exists(path: str) -> bool:
    """Returns ``True`` if `path` exists.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    java_utils = Env.jutils()
    java_context = Env.hc()._jhc
    return java_utils.exists(path, java_context)
def hadoop_is_file(path: str) -> bool:
    """Returns ``True`` if `path` both exists and is a file.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    java_utils = Env.jutils()
    java_context = Env.hc()._jhc
    return java_utils.isFile(path, java_context)
def hadoop_is_dir(path) -> bool:
    """Returns ``True`` if `path` both exists and is a directory.

    Parameters
    ----------
    path : :obj:`str`

    Returns
    -------
    :obj:`bool`
    """
    java_utils = Env.jutils()
    java_context = Env.hc()._jhc
    return java_utils.isDir(path, java_context)
def stop(self):
    """Shut this context down and reset all process-wide session state."""
    # Clear the JVM-side context while the JVM handles are still available.
    Env.hail().HailContext.clear()

    self.sc.stop()
    self.sc = None

    # Drop the cached JVM / gateway / context handles.
    Env._jvm = Env._gateway = Env._hc = None
    uninstall_exception_handler()

    # Drop the remaining cached session objects.
    Env._dummy_table = Env._seed_generator = None

    # Forget functions and reference genomes registered during the session.
    hail.ir.clear_session_functions()
    ReferenceGenome._references = {}
def add_sequence(self, fasta_file, index_file=None):
    """Load the reference sequence from a FASTA file.

    Examples
    --------
    Access the GRCh37 reference genome using :func:`.get_reference`:

    >>> rg = hl.get_reference('GRCh37') # doctest: +SKIP

    Add a sequence file:

    >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz',
    ...                 'gs://hail-common/references/human_g1k_v37.fasta.fai') # doctest: +SKIP

    Add a sequence file with the default index location:

    >>> rg.add_sequence('gs://hail-common/references/human_g1k_v37.fasta.gz') # doctest: +SKIP

    Notes
    -----
    This method can only be run once per reference genome. Use
    :meth:`~has_sequence` to test whether a sequence is loaded.

    FASTA and index files are hosted on google cloud for some of Hail's built-in
    references:

    **GRCh37**

    - FASTA file: ``gs://hail-common/references/human_g1k_v37.fasta.gz``
    - Index file: ``gs://hail-common/references/human_g1k_v37.fasta.fai``

    **GRCh38**

    - FASTA file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.gz``
    - Index file: ``gs://hail-common/references/Homo_sapiens_assembly38.fasta.fai``

    Public download links are available
    `here <https://console.cloud.google.com/storage/browser/hail-common/references/>`__.

    Parameters
    ----------
    fasta_file : :obj:`str`
        Path to FASTA file. Can be compressed (GZIP) or uncompressed.
    index_file : :obj:`None` or :obj:`str`
        Path to FASTA index file. Must be uncompressed. If `None`, replace
        the fasta_file's extension with `fai`.
    """
    if index_file is None:
        # Replace the last extension of the file name with ``.fai``. The
        # pattern is a raw string (``\.`` is not a valid escape in a normal
        # string literal) and excludes ``/`` so that a dot in a directory
        # name is never mistaken for the start of the extension.
        index_file = re.sub(r'\.[^./]*$', '.fai', fasta_file)

    # Flip the flag only after the backend has accepted the sequence.
    Env.backend().add_sequence(self.name, fasta_file, index_file)
    self._has_sequence = True
def test_parses(self):
    """Every value IR from ``self.value_irs()`` must be accepted by the JVM parser."""
    hail_types = {
        'c': hl.tbool,
        'a': hl.tarray(hl.tint32),
        'aa': hl.tarray(hl.tarray(hl.tint32)),
        'da': hl.tarray(hl.ttuple(hl.tint32, hl.tstr)),
        'v': hl.tint32,
        's': hl.tstruct(x=hl.tint32, y=hl.tint64, z=hl.tfloat64),
        't': hl.ttuple(hl.tint32, hl.tint64, hl.tfloat64),
        'call': hl.tcall,
        'x': hl.tint32,
    }

    # The parser is given the Java type object of each reference.
    java_env = {}
    for ref_name, hail_type in hail_types.items():
        java_env[ref_name] = hail_type._jtype

    for value_ir in self.value_irs():
        Env.hail().expr.ir.IRParser.parse_value_ir(str(value_ir), java_env, {})
def test_matrix_ir_parses(self):
    """Every matrix IR below must be accepted by the JVM matrix-IR parser."""
    # The BGEN file is indexed up front; it is read by MatrixImportBGEN below.
    hl.index_bgen(resource('example.8bits.bgen'),
                  reference_genome=hail.get_reference('GRCh37'),
                  contig_recoding={'01': '1'})

    collect_signature = ir.AggSignature('Collect', [], None, [hl.tint32])
    collect_struct = ir.MakeStruct(
        [('x', ir.ApplyAggOp([], None, [ir.I32(0)], collect_signature))])

    matrix_source = ir.MatrixRead(
        resource('backward_compatability/1.0.0/matrix_table/0.hmt'), False, False)
    table_source = ir.TableRead(
        resource('backward_compatability/1.0.0/table/0.ht'), False, None)

    candidates = [
        ir.MatrixRepartition(ir.MatrixRange(5, 5, 1), 100, True),
        ir.MatrixUnionRows(ir.MatrixRange(5, 5, 1), ir.MatrixRange(5, 5, 1)),
        ir.MatrixDistinctByRow(ir.MatrixRange(5, 5, 1)),
        ir.CastTableToMatrix(
            ir.CastMatrixToTable(matrix_source, '__entries', '__cols'),
            '__entries',
            '__cols',
            []),
        ir.MatrixAggregateRowsByKey(matrix_source, collect_struct, collect_struct),
        ir.MatrixAggregateColsByKey(matrix_source, collect_struct, collect_struct),
        ir.MatrixRange(1, 1, 10),
        ir.MatrixImportVCF([resource('sample.vcf')], False, False, None, None, False, ['GT'],
                           hail.get_reference('GRCh37'), {}, True, False),
        ir.MatrixImportBGEN([resource('example.8bits.bgen')], ['GP'],
                            resource('example.sample'), {}, 10, 1, ['varid'], None),
        ir.MatrixFilterRows(matrix_source, ir.FalseIR()),
        ir.MatrixFilterCols(matrix_source, ir.FalseIR()),
        ir.MatrixFilterEntries(matrix_source, ir.FalseIR()),
        ir.MatrixChooseCols(matrix_source, [1, 0]),
        ir.MatrixMapCols(matrix_source, ir.MakeStruct([('x', ir.I64(20))]), ['x']),
        ir.MatrixKeyRowsBy(matrix_source, ['row_i64'], False),
        ir.MatrixMapRows(ir.MatrixKeyRowsBy(matrix_source, []),
                         ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapEntries(matrix_source, ir.MakeStruct([('x', ir.I64(20))])),
        ir.MatrixMapGlobals(matrix_source, ir.MakeStruct([('x', ir.I64(20))])),
        ir.TableToMatrixTable(table_source, ['f32', 'i64'], ['m', 'astruct'],
                              ['aset'], ['mset'], 100),
        ir.MatrixCollectColsByKey(matrix_source),
        ir.MatrixExplodeRows(matrix_source, ['row_aset']),
        ir.MatrixExplodeCols(matrix_source, ['col_aset']),
        ir.MatrixAnnotateRowsTable(matrix_source, table_source, '__foo', None),
        ir.MatrixAnnotateColsTable(matrix_source, table_source, '__foo'),
    ]

    for matrix_ir in candidates:
        try:
            Env.hail().expr.ir.IRParser.parse_matrix_ir(str(matrix_ir))
        except Exception as parse_error:
            # Re-raise with the rendered IR so the failing case is identifiable.
            raise ValueError(str(matrix_ir)) from parse_error
def set_global_seed(seed):
    """Sets Hail's global seed to `seed`.

    Parameters
    ----------
    seed : :obj:`int`
        Integer used to seed Hail's random number generator

    Returns
    -------
    ``None``
    """
    # The seed state lives on the shared environment object.
    Env.set_seed(seed)
def enable_pipeline_upload():
    """Upload all subsequent pipelines to the Hail team in order to
    help improve Hail.

    Pipeline upload can also be enabled by setting the environment
    variable `HAIL_ENABLE_PIPELINE_UPLOAD` or the Spark configuration
    property `hail.enablePipelineUpload` to `true`.

    Warning
    -------
    Shares potentially sensitive data with the Hail team.
    """
    java_context = Env.hc()._jhc
    java_context.enablePipelineUpload()
def localize_entries_with_none_entries_changes_no_rows(self):
    """Localizing entries with no entries array field must yield the same rows
    as the matrix table's own rows table.
    """
    # NOTE(review): the name has no ``test_`` prefix, so default test
    # discovery would likely not collect it -- confirm whether that is intended.
    mt = hl.utils.range_matrix_table(10, 10)
    mt = mt.select_entries(x=mt.row_idx * mt.col_idx)

    columns_field = Env.get_uid()
    localized = mt.localize_entries(entries_array_field_name=None,
                                    columns_array_field_name=columns_field)

    expected_rows = mt.rows().collect()
    assert expected_rows == localized.collect()
def multitrait_inf(mt, h2=None, rg=None, cov_matrix=None, seed=None):
    """Generates correlated betas for multi-trait infinitesimal simulations for
    any number of phenotypes.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        MatrixTable for simulated phenotype.
    h2 : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Desired SNP-based heritability (:math:`h^2`) of simulated traits.
        If `h2` is ``None``, :math:`h^2` is based on diagonal of `cov_matrix`.
    rg : :obj:`float` or :obj:`int` or :obj:`list`, optional
        Desired genetic correlation (:math:`r_g`) between simulated traits.
        If simulating more than two correlated traits, `rg` should be a list
        of :math:`rg` values corresponding to the upper right triangle of the
        covariance matrix. If `rg` is ``None`` and `cov_matrix` is ``None``,
        :math:`r_g` is assumed to be 0 between traits. If `rg` and
        `cov_matrix` are both not None, :math:`r_g` values from `cov_matrix`
        take precedence.
    cov_matrix : :class:`numpy.ndarray`, optional
        Covariance matrix for traits, **unscaled by** :math:`M`, the number of
        SNPs. Overrides `h2` and `rg` even when `h2` or `rg` are not ``None``.
    seed : :obj:`int`, optional
        Seed for random number generator. If `seed` is ``None``, `seed` is set
        randomly.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated SNP effects as a row field of arrays.

    Raises
    ------
    AssertionError
        If `h2` and `cov_matrix` are both ``None``, if an `h2` value lies
        outside [0, 1], or if an `rg` value lies outside [-1, 1].
    """
    # "temporary id" -- random string to identify temporary intermediate
    # fields generated by this method
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5))

    h2 = h2 if isinstance(h2, list) else [h2]
    rg = rg if isinstance(rg, list) else [rg]

    # `h2=None` arrives here as `[None]`. Test for it by identity on the single
    # element: comparing against a fresh `[None]` literal with `is` is always
    # unequal, and range-checking a None element raises TypeError.
    h2_given = not (len(h2) == 1 and h2[0] is None)
    rg_given = not (len(rg) == 1 and rg[0] is None)

    assert h2_given or cov_matrix is not None, 'h2 and cov_matrix cannot both be None'
    if h2_given:
        assert all(0 <= x <= 1 for x in h2), 'h2 values must be between 0 and 1'

    seed = seed if seed is not None else int(str(Env.next_seed())[:8])
    M = mt.count_rows()

    # `is not None`, not `!= None`: the latter is elementwise on an ndarray and
    # cannot be used as a condition.
    if cov_matrix is not None:
        n_phens = cov_matrix.shape[0]
    else:
        n_phens = len(h2)
        if not rg_given:
            print(f'Assuming rg=0 for all {n_phens} traits')
            # one rg value per unordered pair of traits
            rg = [0] * int((n_phens**2 - n_phens) / 2)
        assert all(-1 <= x <= 1 for x in rg), 'rg values must be between -1 and 1'
        cov_matrix = create_cov_matrix(h2, rg)

    # Scale by the number of SNPs.
    cov_matrix = (1 / M) * cov_matrix

    # seed random state for replicability
    randstate = np.random.RandomState(int(seed))
    betas = randstate.multivariate_normal(mean=np.zeros(n_phens),
                                          cov=cov_matrix,
                                          size=[M])

    # Build an M-row table keyed by row index and attach each row's vector of
    # betas, then join it onto the matrix table through a temporary row index.
    df = pd.DataFrame([0] * M, columns=['beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])

    mt = mt.add_row_index(name='row_idx' + tid)
    mt = mt.annotate_rows(beta=tb[mt['row_idx' + tid]]['beta'])
    mt = _clean_fields(mt, tid)
    return mt
def vep(dataset: Union[Table, MatrixTable], config, block_size=1000, name='vep', csq=False):
    """Annotate variants with VEP.

    .. include:: ../_templates/req_tvariant.rst

    :func:`.vep` runs `Variant Effect Predictor
    <http://www.ensembl.org/info/docs/tools/vep/index.html>`__ on the
    current dataset and adds the result as a row field.

    Examples
    --------

    Add VEP annotations to the dataset:

    >>> result = hl.vep(dataset, "data/vep-configuration.json") # doctest: +SKIP

    Notes
    -----

    **Configuration**

    :func:`.vep` needs a configuration file to tell it how to run VEP.
    The format of the configuration file is JSON, and :func:`.vep`
    expects a JSON object with three fields:

    - `command` (array of string) -- The VEP command line to run.  The string literal `__OUTPUT_FORMAT_FLAG__` is replaced with `--json` or `--vcf` depending on `csq`.
    - `env` (object) -- A map of environment variables to values to add to the environment when invoking the command.  The value of each object member must be a string.
    - `vep_json_schema` (string): The type of the VEP JSON schema (as produced by the VEP when invoked with the `--json` option).  Note: This is the old-style 'parseable' Hail type syntax.  This will change.

    Here is an example configuration file for invoking VEP release 85
    installed in `/vep` with the Loftee plugin:

    .. code-block:: text

        {
            "command": [
                "/vep",
                "--format", "vcf",
                "__OUTPUT_FORMAT_FLAG__",
                "--everything",
                "--allele_number",
                "--no_stats",
                "--cache", "--offline",
                "--minimal",
                "--assembly", "GRCh37",
                "--plugin", "LoF,human_ancestor_fa:/root/.vep/loftee_data/human_ancestor.fa.gz,filter_position:0.05,min_intron_size:15,conservation_file:/root/.vep/loftee_data/phylocsf_gerp.sql,gerp_file:/root/.vep/loftee_data/GERP_scores.final.sorted.txt.gz",
                "-o", "STDOUT"
            ],
            "env": {
                "PERL5LIB": "/vep_data/loftee"
            },
            "vep_json_schema": "Struct{assembly_name:String,allele_string:String,ancestral:String,colocated_variants:Array[Struct{aa_allele:String,aa_maf:Float64,afr_allele:String,afr_maf:Float64,allele_string:String,amr_allele:String,amr_maf:Float64,clin_sig:Array[String],end:Int32,eas_allele:String,eas_maf:Float64,ea_allele:String,ea_maf:Float64,eur_allele:String,eur_maf:Float64,exac_adj_allele:String,exac_adj_maf:Float64,exac_allele:String,exac_afr_allele:String,exac_afr_maf:Float64,exac_amr_allele:String,exac_amr_maf:Float64,exac_eas_allele:String,exac_eas_maf:Float64,exac_fin_allele:String,exac_fin_maf:Float64,exac_maf:Float64,exac_nfe_allele:String,exac_nfe_maf:Float64,exac_oth_allele:String,exac_oth_maf:Float64,exac_sas_allele:String,exac_sas_maf:Float64,id:String,minor_allele:String,minor_allele_freq:Float64,phenotype_or_disease:Int32,pubmed:Array[Int32],sas_allele:String,sas_maf:Float64,somatic:Int32,start:Int32,strand:Int32}],context:String,end:Int32,id:String,input:String,intergenic_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],impact:String,minimised:Int32,variant_allele:String}],most_severe_consequence:String,motif_feature_consequences:Array[Struct{allele_num:Int32,consequence_terms:Array[String],high_inf_pos:String,impact:String,minimised:Int32,motif_feature_id:String,motif_name:String,motif_pos:Int32,motif_score_change:Float64,strand:Int32,variant_allele:String}],regulatory_feature_consequences:Array[Struct{allele_num:Int32,biotype:String,consequence_terms:Array[String],impact:String,minimised:Int32,regulatory_feature_id:String,variant_allele:String}],seq_region_name:String,start:Int32,strand:Int32,transcript_consequences:Array[Struct{allele_num:Int32,amino_acids:String,biotype:String,canonical:Int32,ccds:String,cdna_start:Int32,cdna_end:Int32,cds_end:Int32,cds_start:Int32,codons:String,consequence_terms:Array[String],distance:Int32,domains:Array[Struct{db:String,name:String}],exon:String,gene_id:String,gene_pheno:Int32,gene_symbol:String,gene_symbol_source:String,hgnc_id:String,hgvsc:String,hgvsp:String,hgvs_offset:Int32,impact:String,intron:String,lof:String,lof_flags:String,lof_filter:String,lof_info:String,minimised:Int32,polyphen_prediction:String,polyphen_score:Float64,protein_end:Int32,protein_start:Int32,protein_id:String,sift_prediction:String,sift_score:Float64,strand:Int32,swissprot:String,transcript_id:String,trembl:String,uniparc:String,variant_allele:String}],variant_class:String}"
        }

    **Annotations**

    A new row field is added in the location specified by `name` with type given
    by the `vep_json_schema` (if `csq` is ``False``) or :py:data:`.tstr` (if
    `csq` is ``True``).

    If csq is ``True``, then the CSQ header string is also added as a global
    field with name ``name + '_csq_header'``.

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to VEP configuration file.
    block_size : :obj:`int`
        Number of rows to process per VEP invocation.
    name : :obj:`str`
        Name for resulting row field.
    csq : :obj:`bool`
        If ``True``, annotates with the VCF CSQ field as a :py:data:`.tstr`.
        If ``False``, annotates as the `vep_json_schema`.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing VEP annotations.
    """
    # VEP only needs the variant keys, so every other field is dropped first.
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'vep')
        keys_only = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'vep')
        keys_only = dataset.select()

    vep_table = Table(Env.hail().methods.VEP.apply(keys_only._jt, config, csq, block_size))

    if csq:
        # Carry the CSQ header over as a global field next to the annotation.
        header_field = name + '_csq_header'
        dataset = dataset.annotate_globals(
            **{header_field: vep_table.index_globals()['vep_csq_header']})

    # Join the annotations back onto the dataset by its key.
    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(**{name: vep_table[dataset.row_key].vep})
    return dataset.annotate(**{name: vep_table[dataset.key].vep})
def __init__(self, path, buffer_size):
    """Open `path` for reading through the JVM-side file utilities.

    Parameters
    ----------
    path : :obj:`str`
        Path of the file to read.
    buffer_size : :obj:`int`
        Buffer size handed to the JVM reader.
    """
    java_utils = Env.jutils()
    java_context = Env.hc()._jhc
    # The Java file handle is stored before the base class is initialized.
    self._jfile = java_utils.readFile(path, java_context, buffer_size)
    super(HadoopReader, self).__init__()
def nirvana(dataset: Union[MatrixTable, Table], config, block_size=500000, name='nirvana'):
    """Annotate variants using `Nirvana <https://github.com/Illumina/Nirvana>`_.

    .. include:: ../_templates/experimental.rst

    .. include:: ../_templates/req_tvariant.rst

    :func:`.nirvana` runs `Nirvana
    <https://github.com/Illumina/Nirvana>`_ on the current dataset and adds a
    new row field in the location specified by `name`.

    Examples
    --------

    Add Nirvana annotations to the dataset:

    >>> result = hl.nirvana(dataset, "data/nirvana.properties") # doctest: +SKIP

    **Configuration**

    :func:`.nirvana` requires a configuration file. The format is a
    `.properties file <https://en.wikipedia.org/wiki/.properties>`__, where each
    line defines a property as a key-value pair of the form ``key = value``.
    :func:`.nirvana` supports the following properties:

    - **hail.nirvana.dotnet** -- Location of dotnet. Optional, default: dotnet.
    - **hail.nirvana.path** -- Value of the PATH environment variable when
      invoking Nirvana. Optional, by default PATH is not set.
    - **hail.nirvana.location** -- Location of Nirvana.dll. Required.
    - **hail.nirvana.reference** -- Location of reference genome. Required.
    - **hail.nirvana.cache** -- Location of cache. Required.
    - **hail.nirvana.supplementaryAnnotationDirectory** -- Location of
      Supplementary Database. Optional, no supplementary database by default.

    Here is an example ``nirvana.properties`` configuration file:

    .. code-block:: text

        hail.nirvana.location = /path/to/dotnet/netcoreapp2.0/Nirvana.dll
        hail.nirvana.reference = /path/to/nirvana/References/Homo_sapiens.GRCh37.Nirvana.dat
        hail.nirvana.cache = /path/to/nirvana/Cache/GRCh37/Ensembl
        hail.nirvana.supplementaryAnnotationDirectory = /path/to/nirvana/SupplementaryDatabase/GRCh37

    **Annotations**

    A new row field is added in the location specified by `name` with the
    following schema:

    .. code-block:: text

        struct {
            chromosome: str,
            refAllele: str,
            position: int32,
            altAlleles: array<str>,
            cytogeneticBand: str,
            quality: float64,
            filters: array<str>,
            jointSomaticNormalQuality: int32,
            copyNumber: int32,
            strandBias: float64,
            recalibratedQuality: float64,
            variants: array<struct {
                altAllele: str,
                refAllele: str,
                chromosome: str,
                begin: int32,
                end: int32,
                phylopScore: float64,
                isReferenceMinor: bool,
                variantType: str,
                vid: str,
                hgvsg: str,
                isRecomposedVariant: bool,
                isDecomposedVariant: bool,
                regulatoryRegions: array<struct {
                    id: str,
                    type: str,
                    consequence: set<str>
                }>,
                clinvar: array<struct {
                    id: str,
                    reviewStatus: str,
                    isAlleleSpecific: bool,
                    alleleOrigins: array<str>,
                    refAllele: str,
                    altAllele: str,
                    phenotypes: array<str>,
                    medGenIds: array<str>,
                    omimIds: array<str>,
                    orphanetIds: array<str>,
                    significance: str,
                    lastUpdatedDate: str,
                    pubMedIds: array<str>
                }>,
                cosmic: array<struct {
                    id: str,
                    isAlleleSpecific: bool,
                    refAllele: str,
                    altAllele: str,
                    gene: str,
                    sampleCount: int32,
                    studies: array<struct {
                        id: int32,
                        histology: str,
                        primarySite: str
                    }>
                }>,
                dbsnp: struct {
                    ids: array<str>
                },
                globalAllele: struct {
                    globalMinorAllele: str,
                    globalMinorAlleleFrequency: float64
                },
                gnomad: struct {
                    coverage: str,
                    allAf: float64, allAc: int32, allAn: int32, allHc: int32,
                    afrAf: float64, afrAc: int32, afrAn: int32, afrHc: int32,
                    amrAf: float64, amrAc: int32, amrAn: int32, amrHc: int32,
                    easAf: float64, easAc: int32, easAn: int32, easHc: int32,
                    finAf: float64, finAc: int32, finAn: int32, finHc: int32,
                    nfeAf: float64, nfeAc: int32, nfeAn: int32, nfeHc: int32,
                    othAf: float64, othAc: int32, othAn: int32, othHc: int32,
                    asjAf: float64, asjAc: int32, asjAn: int32, asjHc: int32,
                    failedFilter: bool
                },
                gnomadExome: struct {
                    coverage: str,
                    allAf: float64, allAc: int32, allAn: int32, allHc: int32,
                    afrAf: float64, afrAc: int32, afrAn: int32, afrHc: int32,
                    amrAf: float64, amrAc: int32, amrAn: int32, amrHc: int32,
                    easAf: float64, easAc: int32, easAn: int32, easHc: int32,
                    finAf: float64, finAc: int32, finAn: int32, finHc: int32,
                    nfeAf: float64, nfeAc: int32, nfeAn: int32, nfeHc: int32,
                    othAf: float64, othAc: int32, othAn: int32, othHc: int32,
                    asjAf: float64, asjAc: int32, asjAn: int32, asjHc: int32,
                    sasAf: float64, sasAc: int32, sasAn: int32, sasHc: int32,
                    failedFilter: bool
                },
                topmed: struct {
                    failedFilter: bool,
                    allAc: int32,
                    allAn: int32,
                    allAf: float64,
                    allHc: int32
                },
                oneKg: struct {
                    ancestralAllele: str,
                    allAf: float64, allAc: int32, allAn: int32,
                    afrAf: float64, afrAc: int32, afrAn: int32,
                    amrAf: float64, amrAc: int32, amrAn: int32,
                    easAf: float64, easAc: int32, easAn: int32,
                    eurAf: float64, eurAc: int32, eurAn: int32,
                    sasAf: float64, sasAc: int32, sasAn: int32
                },
                mitomap: array<struct {
                    refAllele: str,
                    altAllele: str,
                    diseases: array<str>,
                    hasHomoplasmy: bool,
                    hasHeteroplasmy: bool,
                    status: str,
                    clinicalSignificance: str,
                    scorePercentile: float64,
                    isAlleleSpecific: bool,
                    chromosome: str,
                    begin: int32,
                    end: int32,
                    variantType: str
                }>,
                transcripts: struct {
                    refSeq: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>,
                    ensembl: array<struct {
                        transcript: str,
                        bioType: str,
                        aminoAcids: str,
                        cdnaPos: str,
                        codons: str,
                        cdsPos: str,
                        exons: str,
                        introns: str,
                        geneId: str,
                        hgnc: str,
                        consequence: array<str>,
                        hgvsc: str,
                        hgvsp: str,
                        isCanonical: bool,
                        polyPhenScore: float64,
                        polyPhenPrediction: str,
                        proteinId: str,
                        proteinPos: str,
                        siftScore: float64,
                        siftPrediction: str
                    }>
                },
                overlappingGenes: array<str>
            }>,
            genes: array<struct {
                name: str,
                omim: array<struct {
                    mimNumber: int32,
                    hgnc: str,
                    description: str,
                    phenotypes: array<struct {
                        mimNumber: int32,
                        phenotype: str,
                        mapping: str,
                        inheritance: array<str>,
                        comments: str
                    }>
                }>,
                exac: struct {
                    pLi: float64,
                    pRec: float64,
                    pNull: float64
                }
            }>
        }

    Parameters
    ----------
    dataset : :class:`.MatrixTable` or :class:`.Table`
        Dataset.
    config : :obj:`str`
        Path to Nirvana configuration file.
    block_size : :obj:`int`
        Number of rows to process per Nirvana invocation.
    name : :obj:`str`
        Name for resulting row field.

    Returns
    -------
    :class:`.MatrixTable` or :class:`.Table`
        Dataset with new row-indexed field `name` containing Nirvana annotations.
    """
    # Nirvana only needs the variant keys, so every other field is dropped first.
    if isinstance(dataset, MatrixTable):
        require_row_key_variant(dataset, 'nirvana')
        keys_only = dataset.select_rows().rows()
    else:
        require_table_key_variant(dataset, 'nirvana')
        keys_only = dataset.select()

    nirvana_table = Table(Env.hail().methods.Nirvana.apply(keys_only._jt, config, block_size))

    # Join the annotations back onto the dataset by its key.
    if isinstance(dataset, MatrixTable):
        return dataset.annotate_rows(**{name: nirvana_table[dataset.row_key].nirvana})
    return dataset.annotate(**{name: nirvana_table[dataset.key].nirvana})
def stop():
    """Stop the currently running Hail session."""
    # Nothing to do when no context has been created.
    if not Env._hc:
        return
    Env.hc().stop()
def parse(self, code, ref_map={}, ir_map={}):
    """Parse `code` as a value IR on the JVM.

    `ref_map` maps reference names to Hail types; each type is passed to the
    parser as its parsable string. `ir_map` is forwarded unchanged.
    """
    parse_value_ir = Env.hail().expr.ir.IRParser.parse_value_ir

    parsable_refs = {}
    for ref_name, ref_type in ref_map.items():
        parsable_refs[ref_name] = ref_type._parsable_string()

    return parse_value_ir(code, parsable_refs, ir_map)
def loop(f: Callable, typ, *args):
    r"""Define and call a tail-recursive function with given arguments.

    Notes
    -----
    The argument `f` must be a function where the first argument defines the
    recursive call, and the remaining arguments are the arguments to the
    recursive function, e.g. to define the recursive function

    .. math::

        f(x, y) = \begin{cases}
        y & \textrm{if } x \equiv 0 \\
        f(x - 1, y + x) & \textrm{otherwise}
        \end{cases}

    we would write:

    >>> f = lambda recur, x, y: hl.if_else(x == 0, y, recur(x - 1, y + x))

    Full recursion is not supported, and any non-tail-recursive methods will
    throw an error when called.

    This means that the result of any recursive call within the function must
    also be the result of the entire function, without modification. Let's
    consider two different recursive definitions for the triangle function
    :math:`f(x) = 0 + 1 + \dots + x`:

    >>> def triangle1(x):
    ...     if x == 1:
    ...         return x
    ...     return x + triangle1(x - 1)

    >>> def triangle2(x, total):
    ...     if x == 0:
    ...         return total
    ...     return triangle2(x - 1, total + x)

    The first function definition, `triangle1`, will call itself and then add
    x. This is an example of a non-tail recursive function, since
    `triangle1(9)` needs to modify the result of the inner recursive call to
    `triangle1(8)` by adding 9 to the result.

    The second function is tail recursive: the result of `triangle2(9, 0)` is
    the same as the result of the inner recursive call, `triangle2(8, 9)`.

    Example
    -------
    To find the sum of all the numbers from n=1...10:

    >>> triangle_f = lambda f, x, total: hl.if_else(x == 0, total, f(x - 1, total + x))
    >>> x = hl.experimental.loop(triangle_f, hl.tint32, 10, 0)
    >>> hl.eval(x)
    55

    Let's say we want to find the root of a polynomial equation:

    >>> def polynomial(x):
    ...     return 5 * x**3 - 2 * x - 1

    We'll use `Newton's method <https://en.wikipedia.org/wiki/Newton%27s_method>`__
    to find it, so we'll also define the derivative:

    >>> def derivative(x):
    ...     return 15 * x**2 - 2

    and starting at :math:`x_0 = 0`, we'll compute the next step
    :math:`x_{i+1} = x_i - \frac{f(x_i)}{f'(x_i)}` until the difference
    between :math:`x_{i}` and :math:`x_{i+1}` falls below our convergence
    threshold:

    >>> threshold = 0.005
    >>> def find_root(f, guess, error):
    ...     converged = hl.is_defined(error) & (error < threshold)
    ...     new_guess = guess - (polynomial(guess) / derivative(guess))
    ...     new_error = hl.abs(new_guess - guess)
    ...     return hl.if_else(converged, guess, f(new_guess, new_error))
    >>> x = hl.experimental.loop(find_root, hl.tfloat, 0.0, hl.missing(hl.tfloat))
    >>> hl.eval(x)
    0.8052291984599675

    Warning
    -------
    Using arguments of a type other than numeric types and booleans can cause
    memory issues if you expect the recursive call to happen many times.

    Parameters
    ----------
    f : function (marker, \*args) -> :class:`.Expression`
        Function of one callable marker, denoting where the recursive call (or
        calls) is located, and many `args`, the loop variables.
    typ : :class:`str` or :class:`.HailType`
        Type the loop returns.
    args : variable-length args of :class:`.Expression`
        Expressions to initialize the loop values.

    Returns
    -------
    :class:`.Expression`
        Result of the loop with `args` as initial loop values.

    Raises
    ------
    TypeError
        If a recursive call has the wrong number or types of arguments, if
        the type of the loop body differs from `typ`, or if a recursive call
        appears outside of tail position.
    """
    loop_name = Env.get_uid()

    def contains_recursive_call(non_recursive):
        """Return True if this IR node, or any descendant, recurs into this loop."""
        if isinstance(non_recursive, ir.Recur) and non_recursive.name == loop_name:
            return True
        # Generator (not a list) so the search stops at the first hit.
        return any(contains_recursive_call(c) for c in non_recursive.children)

    def check_tail_recursive(loop_ir):
        """Raise TypeError if a recursive call sits outside of tail position."""
        if isinstance(loop_ir, ir.If):
            # Only the two branches of a conditional are tail positions.
            if contains_recursive_call(loop_ir.cond):
                raise TypeError(
                    "branch condition can't contain recursive call!")
            check_tail_recursive(loop_ir.cnsq)
            check_tail_recursive(loop_ir.altr)
        elif isinstance(loop_ir, ir.Let):
            # The bound value is consumed by the body, so only the body is a
            # tail position.
            if contains_recursive_call(loop_ir.value):
                raise TypeError(
                    "bound value used in other expression can't contain recursive call!"
                )
            check_tail_recursive(loop_ir.body)
        elif isinstance(loop_ir, ir.TailLoop):
            if any(contains_recursive_call(x) for _, x in loop_ir.params):
                raise TypeError(
                    "parameters passed to inner loop can't contain recursive call!"
                )
        elif not isinstance(loop_ir, ir.Recur) and contains_recursive_call(loop_ir):
            raise TypeError(
                "found recursive expression outside of tail position!")

    @typecheck(recur_exprs=expr_any)
    def make_loop(*recur_exprs):
        """Build the ``Recur`` expression standing in for the recursive call."""
        if len(recur_exprs) != len(args):
            raise TypeError(
                'Recursive call in loop has wrong number of arguments')
        # Collect every mismatching argument so one error reports them all.
        err = None
        for i, (rexpr, expr) in enumerate(zip(recur_exprs, args)):
            if rexpr.dtype != expr.dtype:
                if err is None:
                    err = 'Type error in recursive call,'
                err += f'\n at argument index {i}, loop arg type: {expr.dtype}, '
                err += f'recur arg type: {rexpr.dtype}'
        if err is not None:
            raise TypeError(err)
        irs = [expr._ir for expr in recur_exprs]
        indices, aggregations = unify_all(*recur_exprs)
        return construct_expr(ir.Recur(loop_name, irs, typ), typ, indices,
                              aggregations)

    # Bind each initial argument to a fresh variable that `f` sees as its
    # loop variable.
    uid_irs = []
    loop_vars = []
    for expr in args:
        uid = Env.get_uid()
        loop_vars.append(
            construct_variable(uid, expr._type, expr._indices,
                               expr._aggregations))
        uid_irs.append((uid, expr._ir))

    loop_f = to_expr(f(make_loop, *loop_vars))
    if loop_f.dtype != typ:
        raise TypeError(
            f"requested type {typ} does not match inferred type {loop_f.dtype}"
        )
    check_tail_recursive(loop_f._ir)
    indices, aggregations = unify_all(*args, loop_f)

    return construct_expr(ir.TailLoop(loop_name, loop_f._ir, uid_irs),
                          loop_f.dtype, indices, aggregations)
def __init__(self, path, exclusive=False):
    """Open `path` for writing through the JVM utilities.

    Parameters
    ----------
    path
        Destination handed to the JVM ``writeFile`` utility.
    exclusive : :obj:`bool`
        Forwarded to ``writeFile`` unchanged.
    """
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    self._jfile = jvm_utils.writeFile(path, jvm_context, exclusive)
    super(HadoopWriter, self).__init__()
def copy(self, src: str, dest: str):
    """Copy `src` to `dest` using the JVM ``copyFile`` utility.

    Parameters
    ----------
    src : :obj:`str`
        Source file URI.
    dest : :obj:`str`
        Destination file URI.
    """
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    jvm_utils.copyFile(src, dest, jvm_context)
def parse(self, code, ref_map=None, ir_map=None):
    """Parse a block matrix IR string with the JVM ``IRParser``.

    Parameters
    ----------
    code
        IR text handed to ``IRParser.parse_blockmatrix_ir``.
    ref_map : :obj:`dict`, optional
        Passed through to the parser unchanged. Defaults to an empty mapping.
    ir_map : :obj:`dict`, optional
        Passed through to the parser unchanged. Defaults to an empty mapping.

    Returns
    -------
    The object returned by ``IRParser.parse_blockmatrix_ir``.
    """
    # ``None`` sentinels instead of ``{}`` defaults: a literal dict default is
    # a single object shared by every call of the method.
    ref_map = {} if ref_map is None else ref_map
    ir_map = {} if ir_map is None else ir_map
    return Env.hail().expr.ir.IRParser.parse_blockmatrix_ir(
        code, ref_map, ir_map)
def parse(self, code, ref_map=None, ir_map=None):
    """Parse a table IR string with the JVM ``IRParser``.

    Parameters
    ----------
    code
        IR text handed to ``IRParser.parse_table_ir``.
    ref_map : :obj:`dict`, optional
        Passed through to the parser unchanged. Defaults to an empty mapping.
    ir_map : :obj:`dict`, optional
        Passed through to the parser unchanged. Defaults to an empty mapping.

    Returns
    -------
    The object returned by ``IRParser.parse_table_ir``.
    """
    # ``None`` sentinels instead of ``{}`` defaults: a literal dict default is
    # a single object shared by every call of the method.
    ref_map = {} if ref_map is None else ref_map
    ir_map = {} if ir_map is None else ir_map
    return Env.hail().expr.ir.IRParser.parse_table_ir(
        code, ref_map, ir_map)
def fit_alternatives(self, pa_t_path, a_t_path=None, partition_size=None):
    r"""Fit and test alternative model for each augmented design matrix in parallel.

    Notes
    -----
    The alternative model is fit using REML constrained to the value of
    :math:`\gamma` set by :meth:`fit`.

    The likelihood ratio test of fixed effect parameter :math:`\beta_\star`
    uses (non-restricted) maximum likelihood:

    .. math::

      \chi^2 = 2 \log\left(\frac{
      \max_{\beta_\star, \beta, \sigma^2}\mathrm{N}
      (y \, | \, x_\star \beta_\star + X \beta; \sigma^2(K + \gamma^{-1}I))}
      {\max_{\beta, \sigma^2} \mathrm{N}
      (y \, | \, x_\star \cdot 0 + X \beta; \sigma^2(K + \gamma^{-1}I))}
      \right)

    The p-value is given by the tail probability under a chi-squared
    distribution with one degree of freedom.

    The resulting table has the following fields:

    .. list-table::
      :header-rows: 1

      * - Field
        - Type
        - Value
      * - `idx`
        - int64
        - Index of augmented design matrix.
      * - `beta`
        - float64
        - :math:`\beta_\star`
      * - `sigma_sq`
        - float64
        - :math:`\sigma^2`
      * - `chi_sq`
        - float64
        - :math:`\chi^2`
      * - `p_value`
        - float64
        - p-value

    :math:`(P_r A)^T` and :math:`A^T` (if given) must have the same number of
    rows (augmentations). These rows are grouped into partitions for parallel
    processing. The number of partitions equals the ceiling of
    ``n_rows / partition_size``, and should be at least the number of cores to
    make use of all cores. By default, there is one partition per row of
    blocks in :math:`(P_r A)^T`. Setting the partition size to an exact
    (rather than approximate) divisor or multiple of the block size reduces
    superfluous shuffling of data.

    The number of columns in each block matrix must be less than
    :math:`2^{31}`.

    Warning
    -------
    The block matrices must be stored in row-major format, as results from
    :meth:`.BlockMatrix.write` with ``force_row_major=True`` and from
    :meth:`.BlockMatrix.write_from_entry_expr`. Otherwise, this method will
    produce an error message.

    Parameters
    ----------
    pa_t_path: :obj:`str`
        Path to block matrix :math:`(P_r A)^T` with shape :math:`(m, r)`.
        Each row is a projected augmentation :math:`P_r x_\star` of
        :math:`P_r X`.
    a_t_path: :obj:`str`, optional
        Path to block matrix :math:`A^T` with shape :math:`(m, n)`.
        Each row is an augmentation :math:`x_\star` of :math:`X`.
        Include for low-rank inference.
    partition_size: :obj:`int`, optional
        Number of rows to process per partition.
        Default given by block size of :math:`(P_r A)^T`.

    Returns
    -------
    :class:`.Table`
        Table of results for each augmented design matrix.

    Raises
    ------
    ValueError
        If `a_t_path` is missing for a low-rank model, is given for a
        full-rank model, or if `partition_size` is not positive.
    """
    from hail.table import Table

    self._check_dof(self.f + 1)

    # `a_t_path` must be supplied exactly when the model is low-rank.
    if self.low_rank:
        if a_t_path is None:
            raise ValueError('model is low-rank so a_t is required.')
    elif a_t_path is not None:
        raise ValueError('model is full-rank so a_t must not be set.')

    if self._scala_model is None:
        self._set_scala_model()

    if partition_size is None:
        # Default to the block size recorded in the block matrix metadata.
        metadata = Env.hail().linalg.BlockMatrix.readMetadata(
            Env.hc()._jhc, pa_t_path)
        partition_size = metadata.blockSize()
    elif partition_size <= 0:
        raise ValueError(
            f'partition_size must be positive, found {partition_size}')

    def read_row_matrix(path):
        # Both matrices are read with the same partition size.
        return Env.hail().linalg.RowMatrix.readBlockMatrix(
            Env.hc()._jhc, path, jsome(partition_size))

    jpa_t = read_row_matrix(pa_t_path)
    maybe_ja_t = jnone() if a_t_path is None else jsome(read_row_matrix(a_t_path))

    fitted = self._scala_model.fit(jpa_t, maybe_ja_t)
    return Table._from_java(fitted)
def _get_flags(*flags):
    """Return a dict mapping each name in `flags` to its current value.

    Values are read from ``Env.backend()._jhc.flags()``.
    """
    values = {}
    for flag in flags:
        values[flag] = Env.backend()._jhc.flags().get(flag)
    return values
def current_backend():
    """Return the ``_backend`` attribute of the current Hail context."""
    hail_context = Env.hc()
    return hail_context._backend
def uri_path(uri):
    """Return the result of the JVM ``uriPath`` utility applied to `uri`.

    NOTE(review): presumably this is the path component of the URI; confirm
    against the JVM implementation.
    """
    jvm_utils = Env.jutils()
    return jvm_utils.uriPath(uri)
def exists(self, path: str) -> bool:
    """Return the result of the JVM ``exists`` utility for `path`."""
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    return jvm_utils.exists(path, jvm_context)
def new_temp_file(suffix=None, prefix=None, n_char=10):
    """Ask the Hail context for a temporary file name.

    Parameters
    ----------
    suffix : optional
        Wrapped with ``joption`` and passed as the third argument.
    prefix : optional
        Wrapped with ``joption`` and passed as the second argument.
    n_char : :obj:`int`
        Passed as the first argument of ``getTemporaryFile``.

    Returns
    -------
    The value returned by the JVM ``getTemporaryFile`` method.
    """
    jvm_context = Env.hc()._jhc
    maybe_prefix = joption(prefix)
    maybe_suffix = joption(suffix)
    return jvm_context.getTemporaryFile(n_char, maybe_prefix, maybe_suffix)
def require_biallelic(dataset, method) -> MatrixTable:
    """Return `dataset` wrapped by the JVM ``VerifyBiallelic`` method.

    The row key is first checked with ``require_row_key_variant``.

    Parameters
    ----------
    dataset : :class:`.MatrixTable`
        Dataset to verify.
    method : :obj:`str`
        Name of the calling method, forwarded to both checks.

    Returns
    -------
    :class:`.MatrixTable`
    """
    require_row_key_variant(dataset, method)
    verified = Env.hail().methods.VerifyBiallelic.apply(dataset._jvds, method)
    return MatrixTable(verified)
def to_hql(self):
    """Render this node as ``("<escaped regex>" ~ <string hql>)``."""
    template = '("{regex}" ~ {string})'
    escaped_regex = Env.jutils().escapePyString(self.regex)
    string_hql = self.string.to_hql()
    return template.format(regex=escaped_regex, string=string_hql)
def test_type_jvm_roundtrip(self):
    """Each test type must equal itself after a trip through the JVM parser."""
    for hail_type in self.types_to_test():
        parsable = hail_type._parsable_string()
        jvm_type = Env.hail().expr.ir.IRParser.parseType(parsable)
        round_tripped = dtype(jvm_type.toString())
        self.assertEqual(hail_type, round_tripped)
def filter_intervals(ds, intervals, keep=True) -> MatrixTable:
    """Filter rows with a list of intervals.

    Examples
    --------

    Filter to loci falling within one interval:

    >>> ds_result = hl.filter_intervals(dataset, [hl.parse_locus_interval('17:38449840-38530994')])

    Remove all loci within list of intervals:

    >>> intervals = [hl.parse_locus_interval(x) for x in ['1:50M-75M', '2:START-400000', '3-22']]
    >>> ds_result = hl.filter_intervals(dataset, intervals, keep=False)

    Notes
    -----
    Based on the ``keep`` argument, this method will either restrict to points
    in the supplied interval ranges, or remove all rows in those ranges.

    When ``keep=True``, partitions that don't overlap any supplied interval
    will not be loaded at all. This enables :func:`.filter_intervals` to be
    used for reasonably low-latency queries of small ranges of the dataset,
    even on large datasets.

    Parameters
    ----------
    ds : :class:`.MatrixTable`
        Dataset.
    intervals : :class:`.ArrayExpression` of type :py:data:`.tinterval`
        Intervals to filter on. If there is only one row partition key, the
        point type of the interval can be the type of the first partition key.
        Otherwise, the interval point type must be a :class:`.Struct` matching
        the row partition key schema.
    keep : :obj:`bool`
        If ``True``, keep only rows that fall within any interval in
        `intervals`. If ``False``, keep only rows that fall outside all
        intervals in `intervals`.

    Returns
    -------
    :class:`.MatrixTable`

    Raises
    ------
    TypeError
        If the interval point type does not match the row partition key type,
        or if `intervals` contains a missing value.
    """
    n_pk = len(ds.partition_key)
    pk_type = ds.partition_key.dtype
    point_type = intervals.dtype.element_type.point_type

    if point_type == pk_type:
        needs_wrapper = False
    elif n_pk == 1 and point_type == ds.partition_key[0].dtype:
        # A bare point of the single partition key's type is accepted; each
        # endpoint is wrapped in a one-field struct below.
        needs_wrapper = True
    else:
        raise TypeError(
            "The point type does not match the row partition key type of the dataset ('{}', '{}')"
            .format(repr(point_type), repr(pk_type)))

    def wrap_input(interval):
        """Validate one interval and wrap its endpoints in structs if needed."""
        if interval is None:
            raise TypeError(
                "'filter_intervals' does not allow missing values in 'intervals'."
            )
        elif needs_wrapper:
            return Interval(Struct(foo=interval.start),
                            Struct(foo=interval.end),
                            interval.includes_start,
                            interval.includes_end)
        else:
            return interval

    j_intervals = [wrap_input(x)._jrep for x in intervals.value]
    jmt = Env.hail().methods.FilterIntervals.apply(ds._jvds, j_intervals, keep)
    return MatrixTable(jmt)
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix
    and stores the results as a new column-indexed struct field in the matrix,
    named based on the `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls not missing or filtered.
      Equivalent to `n_called` divided by :meth:`.count_rows`.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of `n_het` and `n_hom_var`.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.

    Raises
    ------
    ValueError
        If `mt` has no entry field `GT` of type ``call``.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    # Copy the shared allele-type list and append two SNP sub-categories so
    # that transitions and transversions get their own integer codes.
    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        # Replace the generic SNP code with the Transition or Transversion
        # code; every other allele type code is passed through unchanged.
        return hl.bind(
            lambda at: hl.cond(
                at == allele_ints['SNP'],
                hl.cond(hl.is_transition(ref, alt),
                        allele_ints['Transition'],
                        allele_ints['Transversion']),
                at),
            _num_allele_type(ref, alt))

    # Temporary row fields under unique names; both are dropped before
    # returning.
    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(
        **{
            variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
            variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
        })

    # `bound_exprs` holds the aggregations the final struct is derived from;
    # `gq_dp_exprs` holds the optional DP/GQ statistics, copied in as-is.
    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        # NOTE: `name` here shadows the outer `name` parameter, only inside
        # this helper.
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'sample_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))

    # Refers to a value named 'n_rows' directly in the IR.
    # NOTE(review): presumably bound to the total row count when column
    # annotations are evaluated -- confirm against the backend.
    n_rows_ref = hl.expr.construct_expr(
        hl.ir.Ref('n_rows'), hl.tint64, mt._col_indices,
        hl.utils.LinkedList(hl.expr.Aggregation))
    # Entries not seen by the aggregator are counted as filtered.
    bound_exprs['n_filtered'] = n_rows_ref - hl.agg.count()
    bound_exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    bound_exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    # Counts called alleles whose row-level allele count is exactly 1.
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.sum(
            hl.range(0, mt['GT'].ploidy).map(
                lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        # Index 0 is the reference allele and has no allele type; the
        # per-row type array starts at the first alternate allele.
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1],
                       hl.null(hl.tint32))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    zero = hl.int64(0)

    # Two nested bindings: `x` exposes the raw aggregations, `s` the derived
    # counts from which the three ratios are computed.
    result_struct = hl.rbind(hl.struct(**bound_exprs),
                             lambda x: hl.rbind(
                                 hl.struct(**{
                                     **gq_dp_exprs,
                                     'call_rate': hl.float64(x.n_called) / (x.n_called + x.n_not_called + x.n_filtered),
                                     'n_called': x.n_called,
                                     'n_not_called': x.n_not_called,
                                     'n_filtered': x.n_filtered,
                                     'n_hom_ref': x.n_hom_ref,
                                     'n_het': x.n_het,
                                     'n_hom_var': x.n_called - x.n_hom_ref - x.n_het,
                                     'n_non_ref': x.n_called - x.n_hom_ref,
                                     'n_singleton': x.n_singleton,
                                     'n_snp': x.allele_type_counts.get(allele_ints["Transition"], zero) + \
                                              x.allele_type_counts.get(allele_ints["Transversion"], zero),
                                     'n_insertion': x.allele_type_counts.get(allele_ints["Insertion"], zero),
                                     'n_deletion': x.allele_type_counts.get(allele_ints["Deletion"], zero),
                                     'n_transition': x.allele_type_counts.get(allele_ints["Transition"], zero),
                                     'n_transversion': x.allele_type_counts.get(allele_ints["Transversion"], zero),
                                     'n_star': x.allele_type_counts.get(allele_ints["Star"], zero)
                                 }),
                                 lambda s: s.annotate(
                                     r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                                     r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                                     r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion)
                                 )))

    mt = mt.annotate_cols(**{name: result_struct})
    mt = mt.drop(variant_ac, variant_atypes)

    return mt
def is_file(self, path: str) -> bool:
    """Return the result of the JVM ``isFile`` utility for `path`."""
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    return jvm_utils.isFile(path, jvm_context)
def test_parses(self):
    """Every value IR from ``value_irs`` must be accepted by the JVM parser."""
    for value_ir in self.value_irs():
        rendered = str(value_ir)
        Env.hail().expr.Parser.parse_value_ir(rendered)
def __init__(self, value: Any, dtype: 'hail.HailType'):
    """Store a value with its type and assign it a fresh unique id.

    Parameters
    ----------
    value
        Stored as ``self.value``.
    dtype : :class:`.HailType`
        Stored as ``self.dtype``.
    """
    super(Broadcast, self).__init__()
    self.value, self.dtype = value, dtype
    # The id comes from the environment's uid generator.
    self.uid = Env.get_uid()
def is_dir(self, path: str) -> bool:
    """Return the result of the JVM ``isDir`` utility for `path`."""
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    return jvm_utils.isDir(path, jvm_context)
def stat(self, path: str) -> Dict:
    """Return the JVM ``stat`` result for `path`, decoded from JSON."""
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    raw_json = jvm_utils.stat(path, jvm_context)
    return json.loads(raw_json)
def ls(self, path: str) -> List[Dict]:
    """Return the JVM ``ls`` listing for `path`, decoded from JSON."""
    jvm_utils = Env.jutils()
    jvm_context = Env.hc()._jhc
    listing_json = jvm_utils.ls(path, jvm_context)
    return json.loads(listing_json)
def _compute_type(self):
    """Ask the backend for this node's table type and cache it in ``_type``."""
    backend = Env.backend()
    self._type = backend.table_type(self)
def concordance(left, right) -> Tuple[List[List[int]], Table, Table]:
    """Calculate call concordance with another dataset.

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    .. include:: ../_templates/req_unphased_diploid_gt.rst

    Examples
    --------

    Compute concordance between two datasets and output the global concordance
    statistics and two tables with concordance computed per column key and per
    row key:

    >>> global_conc, cols_conc, rows_conc = hl.concordance(dataset, dataset2)

    Notes
    -----

    This method computes the genotype call concordance (from the entry
    field **GT**) between two biallelic variant datasets. It requires
    unique sample IDs and performs an inner join on samples (only
    samples in both datasets will be considered). In addition, all genotype
    calls must be **diploid** and **unphased**.

    It performs an ordered zip join of the variants. That means the
    variants of each dataset are sorted, with duplicate variants
    appearing in some random relative order, and then zipped together.
    When a variant appears a different number of times between the two
    datasets, the dataset with the fewer number of instances is padded
    with "no data". For example, if a variant is only in one dataset,
    then each genotype is treated as "no data" in the other.

    This method returns a tuple of three objects: a nested list of
    list of int with global concordance summary statistics, a table
    with concordance statistics per column key, and a table with
    concordance statistics per row key.

    **Using the global summary result**

    The global summary is a list of list of int (conceptually a 5 by 5 matrix),
    where the indices have special meaning:

    0. No Data (missing variant)
    1. No Call (missing genotype call)
    2. Hom Ref
    3. Heterozygous
    4. Hom Var

    The first index is the state in the left dataset and the second index is
    the state in the right dataset. Typical uses of the summary list are shown
    below.

    >>> summary, samples, variants = hl.concordance(dataset, dataset2)
    >>> left_homref_right_homvar = summary[2][4]
    >>> left_het_right_missing = summary[3][1]
    >>> left_het_right_something_else = sum(summary[3][:]) - summary[3][3]
    >>> total_concordant = summary[2][2] + summary[3][3] + summary[4][4]
    >>> total_discordant = sum([sum(s[2:]) for s in summary[2:]]) - total_concordant

    **Using the table results**

    Table 1: Concordance statistics by column

    This table contains the column key field of `left`, and the following
    fields:

    - `n_discordant` (:py:data:`.tint64`) -- Count of discordant calls (see
      below for full definition).
    - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`)
      -- Array of concordance per state on left and right, matching the
      structure of the global summary defined above.

    Table 2: Concordance statistics by row

    This table contains the row key fields of `left`, and the following
    fields:

    - `n_discordant` (:py:data:`.tfloat64`) -- Count of discordant calls (see
      below for full definition).
    - `concordance` (:class:`.tarray` of :class:`.tarray` of :py:data:`.tint64`)
      -- Array of concordance per state on left and right, matching the
      structure of the global summary defined above.

    In these tables, the column **n_discordant** is provided as a convenience,
    because this is often one of the most useful concordance statistics. This
    value is the number of genotypes which were called (homozygous reference,
    heterozygous, or homozygous variant) in both datasets, but where the call
    did not match between the two.

    The column `concordance` matches the structure of the global summary,
    which is detailed above. Once again, the first index into this array is the
    state on the left, and the second index is the state on the right. For
    example, ``concordance[1][4]`` is the number of "no call" genotypes on the
    left that were called homozygous variant on the right.

    Parameters
    ----------
    left : :class:`.MatrixTable`
        First dataset to compare.
    right : :class:`.MatrixTable`
        Second dataset to compare.

    Returns
    -------
    (list of list of int, :class:`.Table`, :class:`.Table`)
        The global concordance statistics, a table with concordance statistics
        per column key, and a table with concordance statistics per row key.
    """

    def genotypes_only(mt):
        # Drop every row, column, and global field and keep only the `GT`
        # entry field.
        stripped = mt.select_rows()
        stripped = stripped.select_cols()
        stripped = stripped.select_globals()
        return stripped.select_entries('GT')

    require_col_key_str(left, 'concordance, left')
    require_col_key_str(right, 'concordance, right')

    left = genotypes_only(left)
    right = genotypes_only(right)

    left = require_biallelic(left, "concordance, left")
    right = require_biallelic(right, "concordance, right")

    # The JVM method returns a 3-tuple: global matrix, column table, row table.
    j_result = Env.hail().methods.CalculateConcordance.apply(left._jvds, right._jvds)
    j_global_conc = j_result._1()
    col_conc = Table(j_result._2())
    row_conc = Table(j_result._3())

    # Copy the 5 x 5 JVM matrix into nested Python lists, indexed
    # [left state][right state].
    n_states = 5
    global_conc = []
    for left_state in range(n_states):
        counts = []
        for right_state in range(n_states):
            counts.append(j_global_conc.apply(left_state).apply(right_state))
        global_conc.append(counts)

    return global_conc, col_conc, row_conc