def compute_same_hap_log_like(n, p, q, x):
    # NOTE: `gt_counts`, `e` and `distance` are closed-over variables from the
    # enclosing scope; `n` is unused here.
    res = (
        hl.cond(
            q > 0,
            hl.fold(
                lambda i, j: i + j[0] * j[1],
                0.0,
                hl.zip(gt_counts, [
                    hl.log10(x) * 2,
                    hl.log10(2 * x * e),
                    hl.log10(e) * 2,
                    hl.log10(2 * x * p),
                    hl.log10(2 * (p * e + x * q)),
                    hl.log10(2 * q * e),
                    hl.log10(p) * 2,
                    hl.log10(2 * p * q),
                    hl.log10(q) * 2
                ])),
            -1e31  # Very large negative value if no q is present
        ))

    # If desired, add distance posterior based on value derived from regression
    if distance is not None:
        res = res + hl.max(-6, hl.log10(0.97 - 0.03 * hl.log(distance + 1)))

    return res
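# The hl.fold/hl.zip pattern above computes a dot product between observed
# genotype counts and per-class log10 probabilities. A minimal, self-contained
# sketch of that pattern (toy numbers, not real genotype data):
import hail as hl

counts = hl.literal([1, 2, 3])
log_probs = hl.literal([-0.1, -0.2, -0.3])
dot = hl.fold(lambda acc, pair: acc + pair[0] * pair[1], 0.0,
              hl.zip(counts, log_probs))
print(hl.eval(dot))  # -1.4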
def vep_protein_domain_ann_expr(
        s: hl.expr.StringExpression) -> hl.expr.DictExpression:
    """
    Parse and annotate protein domain(s) from VEP annotation.
    Expected StringExpression as input (e.g. 'Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262')
    It will generate a dict<k,v> where keys (k) represent the source/database and values (v) the annotated domain_id.

    :param s: hl.expr.StringExpression
    :return: hl.expr.DictExpression
    """
    a1 = s.split(delim="&")

    # keep only well-annotated domain(s) (i.e. <source:domain_id>)
    a2 = a1.map(lambda x: x.split(delim=":"))
    a2 = a2.filter(lambda x: x.length() == 2)

    d = (
        hl.case()
        .when(hl.len(a2) > 0,
              hl.dict(
                  hl.zip(
                      a2.map(lambda x: x[0]),  # TODO: Optimize by scanning the array just once.
                      a2.map(lambda x: x[1]))))
        .or_missing())

    return d
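# A quick eval of the parser above on the example string from the docstring
# (a minimal sketch, assuming vep_protein_domain_ann_expr is in scope):
s = hl.literal('Pfam:PF13853&Prints:PR00237&PROSITE_profiles:PS50262')
print(hl.eval(vep_protein_domain_ann_expr(s)))
# maps 'Pfam' -> 'PF13853', 'Prints' -> 'PR00237', 'PROSITE_profiles' -> 'PS50262'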
def add_popmax_expr(freq: hl.expr.ArrayExpression,
                    freq_meta: hl.expr.ArrayExpression,
                    populations: Set[str]) -> hl.expr.StructExpression:
    """
    Calculates popmax: the population among `populations` with the highest AF.

    :param ArrayExpression freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param ArrayExpression freq_meta: ArrayExpression of meta dictionaries corresponding to freq
    :param set of str populations: Set of populations over which to calculate popmax
    :return: Struct with the popmax frequency data and population, or missing if no population qualifies
    :rtype: StructExpression
    """
    pops_to_use = hl.literal(populations)
    freq = hl.map(lambda x: x[0].annotate(meta=x[1]), hl.zip(freq, freq_meta))
    freq_filtered = hl.filter(
        lambda f: (f.meta.size() == 2) & (f.meta.get('group') == 'adj') &
        pops_to_use.contains(f.meta.get('pop')) & (f.AC > 0),
        freq)
    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(
        hl.len(sorted_freqs) > 0,
        hl.struct(AC=sorted_freqs[0].AC,
                  AF=sorted_freqs[0].AF,
                  AN=sorted_freqs[0].AN,
                  homozygote_count=sorted_freqs[0].homozygote_count,
                  pop=sorted_freqs[0].meta['pop']))
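# A toy evaluation of add_popmax_expr (a minimal sketch; the field and meta
# layout mirrors the gnomAD-style freq/freq_meta arrays the function assumes):
freq = hl.literal([
    hl.Struct(AC=10, AF=0.10, AN=100, homozygote_count=1),
    hl.Struct(AC=2, AF=0.20, AN=10, homozygote_count=0),
])
freq_meta = hl.literal([
    {'group': 'adj', 'pop': 'nfe'},
    {'group': 'adj', 'pop': 'afr'},
])
print(hl.eval(add_popmax_expr(freq, freq_meta, {'nfe', 'afr'})))
# Struct(AC=2, AF=0.2, AN=10, homozygote_count=0, pop='afr')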
def to_dense_mt(vds: 'VariantDataset') -> 'MatrixTable':
    """Creates a single, dense :class:`.MatrixTable` from the split
    :class:`.VariantDataset` representation.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset in dense MatrixTable representation.
    """
    ref = vds.reference_data
    ref = ref.drop(*(x for x in ('alleles', 'rsid') if x in ref.row))
    var = vds.variant_data
    refl = ref.localize_entries('_ref_entries')
    varl = var.localize_entries('_var_entries', '_var_cols')
    varl = varl.annotate(_variant_defined=True)
    joined = refl.join(varl.key_by('locus'), how='outer')
    dr = joined.annotate(dense_ref=hl.or_missing(
        joined._variant_defined,
        hl.scan._densify(hl.len(joined._var_cols), joined._ref_entries)))
    dr = dr.filter(dr._variant_defined)

    def coalesce_join(ref, var):
        call_field = 'GT' if 'GT' in var else 'LGT'
        assert call_field in var, var.dtype

        shared_fields = [call_field] + list(
            f for f in ref.dtype if f in var.dtype)
        shared_field_set = set(shared_fields)
        var_fields = [f for f in var.dtype if f not in shared_field_set]

        return hl.if_else(
            hl.is_defined(var),
            var.select(*shared_fields, *var_fields),
            ref.annotate(**{call_field: hl.call(0, 0)}).select(
                *shared_fields,
                **{f: hl.null(var[f].dtype) for f in var_fields}))

    # A reference block covers the variant's locus only while its (inclusive)
    # END is at or past the position, hence `>=` rather than `>`.
    dr = dr.annotate(
        _dense=hl.zip(dr._var_entries, dr.dense_ref).map(
            lambda tuple: coalesce_join(
                hl.or_missing(tuple[1].END >= dr.locus.position, tuple[1]),
                tuple[0])),
    )

    dr = dr._key_by_assert_sorted('locus', 'alleles')
    dr = dr.drop('_var_entries', '_ref_entries', 'dense_ref',
                 '_variant_defined', 'ref_allele')
    return dr._unlocalize_entries('_dense', '_var_cols', list(var.col_key))
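# A minimal usage sketch via the public API (the VDS path is hypothetical):
vds = hl.vds.read_vds('gs://my-bucket/dataset.vds')
dense_mt = hl.vds.to_dense_mt(vds)
dense_mt.describe()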
def post_process_gene_map_ht(gene_ht):
    groups = [
        'pLoF', 'missense|LC', 'pLoF|missense|LC', 'synonymous', 'missense'
    ]
    variant_groups = hl.map(
        lambda group: group.split('\\|').flatmap(
            lambda csq: gene_ht.variants.get(csq)),
        groups)
    gene_ht = gene_ht.transmute(variant_groups=hl.zip(
        groups, variant_groups)).explode('variant_groups')
    gene_ht = gene_ht.transmute(annotation=gene_ht.variant_groups[0],
                                variants=hl.sorted(gene_ht.variant_groups[1]))
    gene_ht = gene_ht.key_by(start=gene_ht.interval.start)
    return gene_ht.filter(hl.len(gene_ht.variants) > 0)
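# StringExpression.split takes a regex, so the literal '|' must be escaped;
# a quick check of the pattern used above:
print(hl.eval(hl.literal('pLoF|missense|LC').split('\\|')))
# ['pLoF', 'missense', 'LC']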
def test(self):
    schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32,
                        e=hl.tstr, f=hl.tarray(hl.tint32),
                        g=hl.tarray(
                            hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                        h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                        i=hl.tbool,
                        j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

    rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
             'e': "hello", 'f': [1, 2, 3],
             'g': [hl.Struct(x=1, y=5, z='banana')],
             'h': hl.Struct(a=5, b=3, c='winter'),
             'i': True,
             'j': hl.Struct(x=3, y=2, z='summer')}]

    kt = hl.Table.parallelize(rows, schema)

    result = convert_struct_to_dict(kt.annotate(
        chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
        ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
        dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
        dpois=hl.dpois(4, kt.a),
        drop=kt.h.drop('b', 'c'),
        exp=hl.exp(kt.c),
        fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
        hwe=hl.hardy_weinberg_p(1, 2, 1),
        index=hl.index(kt.g, 'z'),
        is_defined=hl.is_defined(kt.i),
        is_missing=hl.is_missing(kt.i),
        is_nan=hl.is_nan(hl.float64(kt.a)),
        json=hl.json(kt.g),
        log=hl.log(kt.a, kt.b),
        log10=hl.log10(kt.c),
        or_else=hl.or_else(kt.a, 5),
        or_missing=hl.or_missing(kt.i, kt.j),
        pchisqtail=hl.pchisqtail(kt.a, kt.b),
        pcoin=hl.rand_bool(0.5),
        pnorm=hl.pnorm(0.2),
        pow=2.0 ** kt.b,
        ppois=hl.ppois(kt.a, kt.b),
        qchisqtail=hl.qchisqtail(kt.a, kt.b),
        range=hl.range(0, 5, kt.b),
        rnorm=hl.rand_norm(0.0, kt.b),
        rpois=hl.rand_pois(kt.a),
        runif=hl.rand_unif(kt.b, kt.a),
        select=kt.h.select('c', 'b'),
        sqrt=hl.sqrt(kt.a),
        to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
        where=hl.cond(kt.i, 5, 10)
    ).take(1)[0])
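# The `dict=` expression above pairs keys with values via hl.zip; the same
# pattern works on plain Python lists, which Hail coerces to literals:
print(hl.eval(hl.dict(hl.zip(['a', 'b'], [1, 2]))))
# maps 'a' -> 1, 'b' -> 2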
def compute_chet_log_like(n, p, q, x):
    # NOTE: `gt_counts`, `e` and `distance` are closed-over variables from the
    # enclosing scope; `n` is unused here.
    res = (
        hl.cond(
            (p > 0) & (q > 0),
            hl.fold(
                lambda i, j: i + j[0] * j[1],
                0.0,
                hl.zip(gt_counts, [
                    hl.log10(x) * 2,
                    hl.log10(2 * x * q),
                    hl.log10(q) * 2,
                    hl.log10(2 * x * p),
                    hl.log10(2 * (p * q + x * e)),
                    hl.log10(2 * q * e),
                    hl.log10(p) * 2,
                    hl.log10(2 * p * e),
                    hl.log10(e) * 2
                ])),
            -1e31  # Very large negative value if p or q is absent
        ))

    # If desired, add distance posterior based on value derived from regression.
    # `distance + 1` keeps the log argument positive at distance == 0 and makes
    # this term the exact complement of the same-haplotype term above.
    if distance is not None:
        res = res + hl.max(-6, hl.log10(0.03 + 0.03 * hl.log(distance + 1)))

    return res
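# With `distance + 1` in both distance posteriors, the same-haplotype and
# compound-het terms are exact complements; a quick numeric check (d = 100):
d = 100
same_hap = hl.eval(0.97 - 0.03 * hl.log(d + 1))
chet = hl.eval(0.03 + 0.03 * hl.log(d + 1))
print(same_hap + chet)  # 1.0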
def combine_phenotypes_with_name(mt: hl.MatrixTable,
                                 column_field,
                                 entry_field,
                                 dict_of_columns,
                                 new_col_name='grouping',
                                 new_entry_name='new_entry',
                                 grouping_function=hl.agg.any):
    """
    Group by non-unique fields and apply `grouping_function` in order to
    combine entries in a MatrixTable.

    Example:

        mt = hl.balding_nichols_model(1, 4, 10)
        mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
        dict_of_columns = {'pheno01': [0, 1], 'pheno03': [0, 3]}
        entry_field = mt.pheno
        column_field = mt.sample_idx

    :param MatrixTable mt: Input MatrixTable
    :param Expression column_field: Column-indexed Expression to group by
    :param Expression entry_field: Entry-indexed Expression to which to apply `grouping_function`
    :param dict of any -> list dict_of_columns: Entries in the lists should be the same type as `column_field`
    :param str new_col_name: Name for new column key (default 'grouping')
    :param str new_entry_name: Name for new entry expression (default 'new_entry')
    :param function grouping_function: Aggregator function to apply to `entry_field` (default hl.agg.any)
    :return: Re-grouped MatrixTable
    :rtype: MatrixTable
    """
    dict_of_columns = hl.literal(dict_of_columns)
    mt = mt._annotate_all(col_exprs={'_col_expr': column_field},
                          entry_exprs={'_entry_expr': entry_field})
    mt = mt.annotate_cols(
        **{
            new_col_name:
            hl.zip(dict_of_columns.keys(), dict_of_columns.values()).filter(
                lambda x: x[1].contains(mt._col_expr)).map(lambda x: x[0])
        })
    mt = mt.explode_cols(new_col_name)
    return mt.group_cols_by(new_col_name).aggregate(
        **{new_entry_name: grouping_function(mt._entry_expr)})
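# Running the docstring example end-to-end (a minimal sketch):
mt = hl.balding_nichols_model(1, 4, 10)
mt = mt.annotate_entries(pheno=hl.rand_bool(0.5))
grouped = combine_phenotypes_with_name(
    mt,
    column_field=mt.sample_idx,
    entry_field=mt.pheno,
    dict_of_columns={'pheno01': [0, 1], 'pheno03': [0, 3]})
grouped.describe()  # columns keyed by 'grouping', entries named 'new_entry'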
def merge_arrays(r_array, v_array):
    # NOTE: `merged_schema` (field name -> type) is a closed-over variable from
    # the enclosing scope.

    def rewrite_ref(r):
        ref_block_selector = {}
        for k, t in merged_schema.items():
            if k == 'LA':
                ref_block_selector[k] = hl.literal([0])
            elif k in ('LGT', 'GT'):
                ref_block_selector[k] = hl.call(0, 0)
            else:
                ref_block_selector[k] = r[k] if k in r else hl.missing(t)
        return r.select(**ref_block_selector)

    def rewrite_var(v):
        return v.select(**{
            k: v[k] if k in v else hl.missing(t)
            for k, t in merged_schema.items()
        })

    return hl.case() \
        .when(hl.is_missing(r_array), v_array.map(rewrite_var)) \
        .when(hl.is_missing(v_array), r_array.map(rewrite_ref)) \
        .default(hl.zip(r_array, v_array).map(
            lambda t: hl.coalesce(rewrite_var(t[1]), rewrite_ref(t[0]))))
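# The `.default(...)` branch relies on hl.coalesce returning its first
# non-missing argument; a quick illustration:
print(hl.eval(hl.coalesce(hl.missing(hl.tint32), 5)))  # 5
print(hl.eval(hl.coalesce(3, 5)))  # 3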
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # getting annotated VEP field names from VCF-header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate fields in the info field (use allele index)
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1] for field in INFO_FIELDS}))

    # parse/annotate the CSQ field in a different structure
    tb_csq = mt.rows()
    tb_csq = tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field])

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # Each transcript is represented as a dict<k,v>, where the keys are the field names extracted
    # from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]')))))

    # Keep transcript(s) matching the allele index (only relevant if variants were split with split_multi_hts).
    # It requires the flag "ALLELE_NUM" to be annotated by VEP.
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all(x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']):
        tb_csq = tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(
                lambda x: hl.int(x["ALLELE_NUM"]) == tb_csq.a_index),
            tb_csq.csq_raw))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_raw')

    # Expand selected transcript (dict) annotations, adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the most severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0]))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS'])))

    # drop redundant/temp fields
    tb_csq = tb_csq.drop('csq_raw', 'tx').repartition(500)

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = tb_csq.checkpoint(output=output_path, overwrite=args.overwrite)

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        tb_csq.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()
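# The CSQ-to-dict conversion above, demonstrated on a single toy CSQ entry
# (these field names are hypothetical; split('[|]') is a regex matching '|'):
fields = ['Allele', 'Consequence', 'SYMBOL']
csq = hl.literal('A|missense_variant|BRCA1')
print(hl.eval(hl.dict(hl.zip(fields, csq.split('[|]')))))
# maps 'Allele' -> 'A', 'Consequence' -> 'missense_variant', 'SYMBOL' -> 'BRCA1'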
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(
                dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [
                new_fields[i] if isinstance(new_fields[i], Expression)
                else hl.literal(new_fields[i], dtype[i])
                for i in range(len(new_fields))
            ]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)
    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression)
                else hl.literal(element, dtype.element_type)
                for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeArray([e._ir for e in exprs], None)
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert len(elements) > 0
            exprs = [
                element if isinstance(element, Expression)
                else hl.literal(element, dtype.element_type)
                for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(
                ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [
                elements[i] if isinstance(elements[i], Expression)
                else hl.literal(elements[i], dtype.types[i])
                for i in range(len(elements))
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Use `to_expr` to convert the keys and values to literals
            # separately; a common mode is statically-known keys with
            # Expression values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
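# The tdict branch above mirrors this public-API pattern: keys and values are
# converted to parallel arrays, then zipped back into a dict expression.
ks = hl.literal(['x', 'y'])
vs = hl.array([hl.int32(1), hl.int32(2)])
print(hl.eval(hl.dict(hl.zip(ks, vs))))
# maps 'x' -> 1, 'y' -> 2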
def _blanczos_pca(entry_expr, k=10, compute_loadings=False, q_iterations=2,
                  oversampling_param=2, block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                           k=2)

    Warning
    -------
    This method does **not** automatically mean-center or normalize each column.
    If desired, such transformations should be incorporated in `entry_expr`.

    Hail will return an error if `entry_expr` evaluates to missing, nan, or
    infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating
    `entry_expr` on each entry of the matrix table, or equivalently on the rows
    of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and
    :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values
    :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first
    :math:`k` singular vectors and values, yielding the best rank :math:`k`
    approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations
    :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`,
    :math:`k \times k` and :math:`m \times k` respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the
    Gramian :math:`MM^T` are the squares of the singular values
    :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the
    respective PCs.

    Hail only computes the loadings if ``compute_loadings`` is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing the
    principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    entry_expr : :class:`.Expression`
        Numeric expression for matrix entries.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a small value satisfying ``0 <= oversampling_param <= k``.
    block_size : :obj:`int`
        Number of rows per block in the internal tall-skinny matrix
        representation.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings
        (or ``None`` if ``compute_loadings`` is ``False``).
""" check_entry_indexed('mt_to_table_of_ndarray/entry_expr', entry_expr) mt = matrix_table_source('pca/entry_expr', entry_expr) A, ht = mt_to_table_of_ndarray(entry_expr, block_size, return_checkpointed_table_also=True) A = A.persist() # Set Parameters q = q_iterations L = k + oversampling_param n = A.take(1)[0].ndarray.shape[1] # Generate random matrix G G = hl.nd.zeros((n, L)).map(lambda n: hl.rand_norm(0, 1)) def hailBlanczos(A, G, k, q): h_list = [] G_i = hl.nd.qr(G)[0] for j in range(0, q): info(f"blanczos_pca: Beginning iteration {j + 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i) result = temp.aggregate(hl.struct( Hi_chunks=hl.agg.collect(temp.H_i), G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)), _localize=False)._persist() localized_H_i = hl.nd.vstack(result.Hi_chunks) h_list.append(localized_H_i) G_i = hl.nd.qr(result.G_i)[0] info(f"blanczos_pca: Beginning iteration {q+ 1}/{q+1}") temp = A.annotate(H_i=A.ndarray @ G_i) result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist() info("blanczos_pca: Iterations complete. Computing local QR") localized_H_i = hl.nd.vstack(result) h_list.append(localized_H_i) H = hl.nd.hstack(h_list) Q = hl.nd.qr(H)[0]._persist() A = A.annotate(part_size=A.ndarray.shape[0]) A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size))) A = A.annotate_globals(Qt=Q.T) T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray) arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False) info("blanczos_pca: QR Complete. Computing local SVD") U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist() V = Q @ U truncV = V[:, :k] truncS = S[:k] truncW = W[:k, :] return truncV, truncS, truncW U, S, V = hailBlanczos(A, G, k, q) scores = V.transpose() * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=list(mt.col_key)) lt = ht.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) if compute_loadings: return eigens, st, lt else: return eigens, st, None
def main(args):
    # Init Hail with hg38 genome build as default
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=vcf_path, force_bgz=args.force_bgz)

    # getting annotated VEP field names from VCF-header
    vep_fields = get_vep_fields(vcf_path=vcf_path,
                                vep_csq_field=args.csq_field)

    if args.exclude_multi_allelic:
        # TODO: This option should skip the split_multi step...
        # Filter out multi-allelic variants. Keep only bi-allelic
        mt = filter_biallelic(mt)

    # split multi-allelic variants
    mt = hl.split_multi_hts(mt)

    # flatten nested structure (e.g. 'info') and get a HailTable with all row fields
    tb_csq = mt.rows().flatten().key_by('locus', 'alleles')

    # rename info[CSQ] field to 'csq_array'.
    # Simpler field names are easier to parse later...
    tb_csq = tb_csq.rename({'info.' + args.csq_field: 'csq_array'})

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # Each transcript is represented as a dict<k,v>, where the keys are the field names extracted
    # from the VCF header and the values are the currently annotated values in the CSQ field.
    tb_csq = tb_csq.annotate(csq_array=tb_csq.csq_array.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]')))))

    # Keep transcript(s) matching the allele index.
    # It requires the flag "ALLELE_NUM" to be annotated by VEP.
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    tb_csq = tb_csq.annotate(csq_array=hl.cond(
        tb_csq.was_split,
        tb_csq.csq_array.filter(
            lambda x: hl.int(x["ALLELE_NUM"]) == tb_csq.a_index),
        tb_csq.csq_array))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_array')

    # Expand selected transcript (dict) annotations, adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx')

    # Parse the "Consequence" field. Keep only the most severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = tb_csq.transmute(Consequence=tb_csq.Consequence.split('&')[0])

    # print fields overview
    tb_csq.describe()

    # drop unnecessary fields
    tb_csq = tb_csq.drop('csq_array', 'tx')

    # write table as HailTable to disk
    tb_csq.write(output=args.tb_output_path)

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        tb_csq.export(args.tb_output_path + '.tsv.bgz')

    # Stop Hail
    hl.stop()
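# A hypothetical command line for this script, using only the argparse
# attributes referenced above (the script name is an assumption):
#
#   python vep_annotate.py \
#       --vcf_vep_path data/my_cohort.vep.vcf.bgz \
#       --csq_field CSQ \
#       --tb_output_path output/my_cohort_vep.ht \
#       --default_ref_genome GRCh38 \
#       --force_bgz \
#       --write_to_file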
def _blanczos_pca(A, k=10, compute_loadings=False, q_iterations=2,
                  oversampling_param=2, block_size=128):
    r"""Run randomized principal component analysis approximation (PCA)
    on numeric columns derived from a matrix table.

    Implements the Blanczos algorithm found by Rokhlin, Szlam, and Tygert.

    Examples
    --------

    For a matrix table with variant rows, sample columns, and genotype entries,
    compute the top 2 PC sample scores and eigenvalues of the matrix of 0s and
    1s encoding missingness of genotype calls.

    >>> eigenvalues, scores, _ = hl._blanczos_pca(hl.int(hl.is_defined(dataset.GT)),
    ...                                           k=2)

    Warning
    -------
    This method does **not** automatically mean-center or normalize each column.
    If desired, such transformations should be incorporated in the entry
    expression.

    Hail will return an error if the entry expression evaluates to missing,
    nan, or infinity on any entry.

    Notes
    -----

    PCA is run on the columns of the numeric matrix obtained by evaluating the
    entry expression on each entry of the matrix table, or equivalently on the
    rows of the **transposed** numeric matrix :math:`M` referenced below.

    PCA computes the SVD

    .. math::

      M = USV^T

    where columns of :math:`U` are left singular vectors (orthonormal in
    :math:`\mathbb{R}^n`), columns of :math:`V` are right singular vectors
    (orthonormal in :math:`\mathbb{R}^m`), and
    :math:`S=\mathrm{diag}(s_1, s_2, \ldots)` with ordered singular values
    :math:`s_1 \ge s_2 \ge \cdots \ge 0`. Typically one computes only the first
    :math:`k` singular vectors and values, yielding the best rank :math:`k`
    approximation :math:`U_k S_k V_k^T` of :math:`M`; the truncations
    :math:`U_k`, :math:`S_k` and :math:`V_k` are :math:`n \times k`,
    :math:`k \times k` and :math:`m \times k` respectively.

    From the perspective of the rows of :math:`M` as samples (data points),
    :math:`V_k` contains the loadings for the first :math:`k` PCs while
    :math:`MV_k = U_k S_k` contains the first :math:`k` PC scores of each
    sample. The loadings represent a new basis of features while the scores
    represent the projected data on those features. The eigenvalues of the
    Gramian :math:`MM^T` are the squares of the singular values
    :math:`s_1^2, s_2^2, \ldots`, which represent the variances carried by the
    respective PCs.

    Hail only computes the loadings if ``compute_loadings`` is ``True``.

    Scores are stored in a :class:`.Table` with the column key of the matrix
    table as key and a field `scores` of type ``array<float64>`` containing the
    principal component scores.

    Loadings are stored in a :class:`.Table` with the row key of the matrix
    table as key and a field `loadings` of type ``array<float64>`` containing
    the principal component loadings.

    The eigenvalues are returned in descending order, with scores and loadings
    given the corresponding array order.

    Parameters
    ----------
    A : :class:`.TallSkinnyMatrix` or :class:`.Expression`
        Tall-skinny matrix, or numeric entry expression from which one is
        built.
    k : :obj:`int`
        Number of principal components.
    compute_loadings : :obj:`bool`
        If ``True``, compute row loadings.
    q_iterations : :obj:`int`
        Number of rounds of power iteration to amplify singular values.
    oversampling_param : :obj:`int`
        Amount of oversampling to use when approximating the singular values.
        Usually a small value satisfying ``0 <= oversampling_param <= k``.
    block_size : :obj:`int`
        Number of rows per block in the internal tall-skinny matrix
        representation.

    Returns
    -------
    (:obj:`list` of :obj:`float`, :class:`.Table`, :class:`.Table`)
        List of eigenvalues, table with column scores, table with row loadings
        (or ``None`` if ``compute_loadings`` is ``False``).
""" if not isinstance(A, TallSkinnyMatrix): check_entry_indexed('_blanczos_pca/entry_expr', A) A = _make_tsm(A, block_size) U, S, V = _reduced_svd(A, k, compute_loadings, q_iterations, k + oversampling_param) scores = V * S eigens = hl.eval(S * S) info("blanczos_pca: SVD Complete. Computing conversion to PCs.") hail_array_scores = scores._data_array() cols_and_scores = hl.zip( A.source_table.index_globals().cols, hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1])) st = hl.Table.parallelize(cols_and_scores, key=A.col_key) if compute_loadings: lt = A.source_table.select() lt = lt.annotate_globals(U=U) idx_name = '_tmp_pca_loading_index' lt = lt.add_index(idx_name) lt = lt.annotate( loadings=lt.U[lt[idx_name], :]._data_array()).select_globals() lt = lt.drop(lt[idx_name]) return eigens, st, lt else: return eigens, st, None
def _pca_and_moments(A, k=10, num_moments=5, compute_loadings=False,
                     q_iterations=2, oversampling_param=2, block_size=128,
                     moment_samples=100):
    if not isinstance(A, TallSkinnyMatrix):
        check_entry_indexed('_spectral_moments/entry_expr', A)
        A = _make_tsm_from_call(A, block_size)

    # Set Parameters
    q = q_iterations
    L = k + oversampling_param
    n = A.ncols

    # Generate random matrix G
    G = hl.nd.zeros((n, L)).map(lambda x: hl.rand_norm(0, 1))
    G = hl.nd.qr(G)[0]._persist()

    fact = _krylov_factorization(A, G, q, compute_loadings)
    info("_reduced_svd: Computing local SVD")
    U, S, V = fact.reduced_svd(k)

    p = min(num_moments // 2, 10)

    # Generate random matrix G2 for moment estimation
    G2 = hl.nd.zeros((n, moment_samples)).map(
        lambda x: hl.if_else(hl.rand_bool(0.5), -1, 1))
    # Project out components in subspace fact.V, which we can compute exactly
    G2 = G2 - fact.V @ (fact.V.T @ G2)
    Q1, R1 = hl.nd.qr(G2)._persist()
    fact2 = _krylov_factorization(A, Q1, p, compute_U=False)
    moments_and_stdevs = fact2.spectral_moments(num_moments, R1)
    # Add back exact moments
    moments = moments_and_stdevs.moments + hl.nd.array([
        fact.S.map(lambda x: x**(2 * i)).sum()
        for i in range(1, num_moments + 1)
    ])
    moments_and_stdevs = hl.eval(
        hl.struct(moments=moments, stdevs=moments_and_stdevs.stdevs))
    moments = moments_and_stdevs.moments
    stdevs = moments_and_stdevs.stdevs

    scores = V * S
    eigens = hl.eval(S * S)
    info("blanczos_pca: SVD Complete. Computing conversion to PCs.")

    hail_array_scores = scores._data_array()
    cols_and_scores = hl.zip(
        A.source_table.index_globals().cols,
        hail_array_scores).map(lambda tup: tup[0].annotate(scores=tup[1]))
    st = hl.Table.parallelize(cols_and_scores, key=A.col_key)

    if compute_loadings:
        lt = A.source_table.select()
        lt = lt.annotate_globals(U=U)
        idx_name = '_tmp_pca_loading_index'
        lt = lt.add_index(idx_name)
        lt = lt.annotate(
            loadings=lt.U[lt[idx_name], :]._data_array()).select_globals()
        lt = lt.drop(lt[idx_name])
    else:
        lt = None

    return eigens, st, lt, moments, stdevs
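# A minimal usage sketch, assuming _pca_and_moments is importable from this
# module (it is passed a call expression, matching _make_tsm_from_call above):
mt = hl.balding_nichols_model(n_populations=3, n_samples=100, n_variants=500)
eigens, scores_ht, loadings_ht, moments, stdevs = _pca_and_moments(
    mt.GT, k=4, num_moments=5)
print(moments, stdevs)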
def test_to_dense_mt():
    vds = hl.vds.read_vds(
        os.path.join(resource('vds'), '1kg_2samples_starts.vds'))
    vds = hl.vds.filter_chromosomes(vds, keep='chr22')
    dense = hl.vds.to_dense_mt(vds).select_entries('LGT', 'LA', 'GQ', 'DP')

    assert dense.rows().select()._same(vds.variant_data.rows().select()), \
        "rows differ between variant data and dense mt"

    assert dense.filter_entries(hl.is_defined(dense.LA))._same(
        vds.variant_data.select_entries('LGT', 'LA', 'GQ', 'DP')), \
        "cannot recover variant data"

    as_dict = dense.aggregate_entries(
        hl.dict(
            hl.zip(hl.agg.collect((hl.str(dense.locus), dense.s)),
                   hl.agg.collect(dense.entry))))

    assert as_dict.get(('chr22:10514784', 'NA12891')) is None
    assert as_dict.get(('chr22:10514784', 'NA12878')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=23, DP=4)

    assert as_dict.get(('chr22:10516150', 'NA12891')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=64, DP=4)
    assert as_dict.get(('chr22:10516150', 'NA12878')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=99, DP=10)

    assert as_dict.get(('chr22:10519088', 'NA12891')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=99, DP=21)
    assert as_dict.get(('chr22:10519088', 'NA12878')) is None

    assert as_dict.get(('chr22:10562435', 'NA12891')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=99, DP=15)
    assert as_dict.get(('chr22:10562435', 'NA12878')) == hl.Struct(
        LGT=hl.Call([0, 0]), LA=None, GQ=21, DP=9)

    assert as_dict.get(('chr22:10562436', 'NA12891')) == hl.Struct(
        LGT=hl.Call([0, 1]), LA=[0, 1], GQ=99, DP=15)
    assert as_dict.get(('chr22:10562436', 'NA12878')) == hl.Struct(
        LGT=hl.Call([0, 0]), LA=None, GQ=21, DP=9)