def get_test_genotypes_bm(chrom, genotype_bm_path):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    #
    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    # if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
    #     samples = mt.s.take(10)
    #     mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
    #     mt = mt.key_cols_by(userId=hl.int32(mt.s))
    #     mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    # else:
    #     samples_ht = hl.read_table(genotype_samples_ht_path)
    controls = hl.read_table(f'{scratch_dir}/genotype_samples_n10.ht')
    cases = hl.read_table(f'{scratch_dir}/genotype_samples_n10_cases.ht')
    samples_ht = cases.union(controls)
    mt = mt.filter_cols(hl.is_defined(samples_ht[hl.int32(mt.s)]))
    mt = mt.key_cols_by(userId=hl.int32(mt.s))
    print(mt.count())
    mt = mt.select_cols().select_rows()
    mt = mt.repartition(1000)
    BlockMatrix.write_from_entry_expr(mt.dosage, genotype_bm_path, overwrite=True)
def get_test_genotypes_mt(chrom, genotype_samples_ht_path, genotype_mt_path, cases_only):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    #
    if chrom == 'all':
        mt = get_filtered_mt_with_x()
    else:
        mt = get_filtered_mt(chrom=chrom, entry_fields=('dosage', ))
    if cases_only:  # was `status == 'cases'`; `status` is undefined in this scope
        t2d_ht = hl.read_table(f'gs://ukbb-diverse-temp-30day/nb-scratch/t2d.ht/')
        t2d_ht = t2d_ht.filter(t2d_ht.both_sexes == 1)
        t2d_ht = t2d_ht.key_by('userId')
        mt = mt.filter_cols(hl.is_defined(t2d_ht[hl.int32(mt.s)]))
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    if not hl.hadoop_is_file(f'{genotype_samples_ht_path}/_SUCCESS'):
        samples = mt.s.take(10)
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
        mt.cols().add_index().write(genotype_samples_ht_path, overwrite=True)
    else:
        samples_ht = hl.read_table(genotype_samples_ht_path)
        samples = samples_ht.s.collect()
        mt = mt.filter_cols(hl.literal(samples).contains(mt.s))
        mt = mt.key_cols_by(userId=hl.int32(mt.s))
    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()
    mt = mt.repartition(10)
    mt.write(genotype_mt_path)
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({k}), inclusive")
    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)
    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)
    return _agg_func('LinearRegression', _to_agg(y), t, [k, k0],
                     seq_op_args=[lambda y: y, x])
def default_compute_info(mt: hl.MatrixTable,
                         site_annotations: bool = False,
                         n_partitions: int = 5000) -> hl.Table:
    """
    Computes a HT with the typical GATK allele-specific (AS) info fields
    as well as ACs and lowqual fields.

    Note that this table doesn't split multi-allelic sites.

    :param mt: Input MatrixTable. Note that this table should be filtered to non-ref sites.
    :param site_annotations: Whether to also generate site-level info fields. Default is False.
    :param n_partitions: Number of desired partitions for output Table. Default is 5000.
    :return: Table with info fields
    :rtype: Table
    """
    # Move gvcf info entries out from nested struct
    mt = mt.transmute_entries(**mt.gvcf_info)

    # Compute AS info expr
    info_expr = get_as_info_expr(mt)

    if site_annotations:
        info_expr = info_expr.annotate(**get_site_info_expr(mt))

    # Add AC and AC_raw:
    # First compute ACs for each non-ref allele, grouped by adj
    grp_ac_expr = hl.agg.array_agg(
        lambda ai: hl.agg.filter(
            mt.LA.contains(ai),
            hl.agg.group_by(
                get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD),
                hl.agg.sum(
                    mt.LGT.one_hot_alleles(mt.LA.map(lambda x: hl.str(x)))[
                        mt.LA.index(ai)]),
            ),
        ),
        hl.range(1, hl.len(mt.alleles)),
    )

    # Then, for each non-ref allele, compute
    # AC as the adj group
    # AC_raw as the sum of adj and non-adj groups
    info_expr = info_expr.annotate(
        AC_raw=grp_ac_expr.map(
            lambda i: hl.int32(i.get(True, 0) + i.get(False, 0))),
        AC=grp_ac_expr.map(lambda i: hl.int32(i.get(True, 0))),
    )

    info_ht = mt.select_rows(info=info_expr).rows()

    # Add AS lowqual flag
    info_ht = info_ht.annotate(AS_lowqual=get_lowqual_expr(
        info_ht.alleles, info_ht.info.AS_QUALapprox))

    if site_annotations:
        # Add lowqual flag
        info_ht = info_ht.annotate(
            lowqual=get_lowqual_expr(info_ht.alleles, info_ht.info.QUALapprox))

    return info_ht.naive_coalesce(n_partitions)
def get_cpx_interval(x):
    # an example format of CPX_INTERVALS is "DUP_chr1:1499897-1499974"
    type_chr = x.split('_chr')
    chr_pos = type_chr[1].split(':')
    pos = chr_pos[1].split('-')
    return hl.struct(type=type_chr[0],
                     chrom=chr_pos[0],
                     start=hl.int32(pos[0]),
                     end=hl.int32(pos[1]))
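# Usage sketch for get_cpx_interval (assumes an initialized Hail session):
# parsing the documented "TYPE_chrCHROM:START-END" layout yields a struct
# with typed fields.
cpx = hl.eval(get_cpx_interval(hl.literal('DUP_chr1:1499897-1499974')))
# cpx == Struct(type='DUP', chrom='1', start=1499897, end=1499974)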
def ascertainment_bias(mt, y, P):
    """Adds ascertainment bias to a binary phenotype so that it has a
    sample prevalence of `P` = cases/(cases+controls).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype to be used.
    y : :class:`.Expression`
        Column field of binary phenotype.
    P : :obj:`int` or :obj:`float`
        Desired "sample prevalence" of phenotype.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype with prevalence of approx. P
    """
    assert P >= 0 and P <= 1, 'P must be in [0,1]'
    tid = ''.join(
        random.choices(string.ascii_uppercase + string.ascii_lowercase, k=5)
    )  # "temporary id" -- random string to identify temporary intermediate fields generated by this method
    mt = mt.annotate_cols(y_w_asc_bias=y)
    y_stats = mt.aggregate_cols(hl.agg.stats(mt.y_w_asc_bias))
    K = y_stats.mean
    n = y_stats.n
    assert abs(P - K) < 1, 'Specified sample prevalence is incompatible with population prevalence.'
    if P < K:
        p = (1 - K) * P / (K * (1 - P))
        con = mt.filter_cols(mt.y_w_asc_bias == 0)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1).add_col_index(
            name='col_idx_' + tid)
        keep = round(p * n * K) * [1] + round((1 - p) * n * K) * [0]
        cas = cas.annotate_cols(
            **{'keep_' + tid: hl.literal(keep)[hl.int32(cas['col_idx_' + tid])]})
        cas = cas.filter_cols(cas['keep_' + tid] == 1)
        cas = _clean_fields(cas, tid)
        mt = cas.union_cols(con)
    elif P > K:
        p = K * (1 - P) / ((1 - K) * P)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1)
        con = mt.filter_cols(mt.y_w_asc_bias == 0).add_col_index(
            name='col_idx_' + tid)
        keep = round(p * n * (1 - K)) * [1] + round((1 - p) * n * (1 - K)) * [0]
        con = con.annotate_cols(
            **{'keep_' + tid: hl.literal(keep)[hl.int32(con['col_idx_' + tid])]})
        con = con.filter_cols(con['keep_' + tid] == 1)
        con = _clean_fields(con, tid)
        mt = con.union_cols(cas)
    return mt
def ascertainment_bias(mt, y, P):
    r"""Adds ascertainment bias to a binary phenotype to give it a sample
    prevalence of `P` = cases/(cases+controls).

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype to be used.
    y : :class:`.Expression`
        Column field of binary phenotype.
    P : :obj:`int` or :obj:`float`
        Desired "sample prevalence" of phenotype.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` containing binary phenotype with prevalence of approx. P
    """
    assert P >= 0 and P <= 1, 'P must be in [0,1]'
    uid = Env.get_uid(base=100)
    mt = mt.annotate_cols(y_w_asc_bias=y)
    y_stats = mt.aggregate_cols(hl.agg.stats(mt.y_w_asc_bias))
    K = y_stats.mean
    n = y_stats.n
    assert abs(P - K) < 1, 'Specified sample prevalence is incompatible with population prevalence.'
    if P < K:
        p = (1 - K) * P / (K * (1 - P))
        con = mt.filter_cols(mt.y_w_asc_bias == 0)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1).add_col_index(
            name='col_idx_' + uid)
        keep = round(p * n * K) * [1] + round((1 - p) * n * K) * [0]
        cas = cas.annotate_cols(
            **{'keep_' + uid: hl.literal(keep)[hl.int32(cas['col_idx_' + uid])]})
        cas = cas.filter_cols(cas['keep_' + uid] == 1)
        cas = _clean_fields(cas, uid)
        mt = cas.union_cols(con)
    elif P > K:
        p = K * (1 - P) / ((1 - K) * P)
        cas = mt.filter_cols(mt.y_w_asc_bias == 1)
        con = mt.filter_cols(mt.y_w_asc_bias == 0).add_col_index(
            name='col_idx_' + uid)
        keep = round(p * n * (1 - K)) * [1] + round((1 - p) * n * (1 - K)) * [0]
        con = con.annotate_cols(
            **{'keep_' + uid: hl.literal(keep)[hl.int32(con['col_idx_' + uid])]})
        con = con.filter_cols(con['keep_' + uid] == 1)
        con = _clean_fields(con, uid)
        mt = con.union_cols(cas)
    return mt
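# Usage sketch for ascertainment_bias (hedged: assumes the module helper
# `_clean_fields` is in scope, and the column field `y_binary` below stands
# in for a real binary phenotype):
mt = hl.balding_nichols_model(n_populations=1, n_samples=1000, n_variants=100)
mt = mt.annotate_cols(y_binary=hl.int(hl.rand_bool(0.1)))  # ~10% prevalence
mt_asc = ascertainment_bias(mt, mt.y_binary, P=0.5)  # downsample controls to ~50% cases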
def generate_random_gen():
    mt = hl.utils.range_matrix_table(30, 10)
    mt = (mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1),
                           alleles=['A', 'G'])
          .key_rows_by('locus', 'alleles'))
    mt = (mt.annotate_cols(s=hl.str(mt.col_idx)).key_cols_by('s'))
    # using totally random values leads to rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    mt = mt.transmute_entries(GP=hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b]) / 255.0)
    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))
    hl.export_gen(mt, 'random', precision=4)
def full(shape, value, dtype=None):
    """Creates a hail :class:`.NDArrayNumericExpression` full of the specified value.

    Examples
    --------

    Create a 5 by 7 NDArray of type :py:data:`.tfloat64` 9s.

    >>> hl.nd.full((5, 7), 9)

    It is possible to specify a type other than :py:data:`.tfloat64` with the `dtype` argument.

    >>> hl.nd.full((5, 7), 9, dtype=hl.tint32)

    Parameters
    ----------
    shape : `tuple` or :class:`.TupleExpression`
        Desired shape.
    value : :class:`.Expression` or python value
        Value to fill ndarray with.
    dtype : :class:`.HailType`
        Desired hail type.

    Returns
    -------
    :class:`.NDArrayNumericExpression`
        An ndarray of the specified shape filled with the specified value.
    """
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return arange(hl.int32(shape_product)).map(
        lambda x: cast_expr(value, dtype)).reshape(shape)
def full(shape, value):
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return array(hl.range(
        hl.int32(shape_product)).map(lambda x: value)).reshape(shape)
def full(shape, value, dtype=None):
    if isinstance(shape, Int64Expression):
        shape_product = shape
    else:
        shape_product = reduce(lambda a, b: a * b, shape)
    return arange(hl.int32(shape_product)).map(
        lambda x: cast_expr(value, dtype)).reshape(shape)
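# Quick check of the dtype-aware `full` variant above (a sketch; assumes the
# module helpers `arange` and `cast_expr` are in scope and Hail is initialized):
result = hl.eval(full((2, 3), 9, dtype=hl.tint32))
# result is a (2, 3) numpy array of int32 nines, matching np.full((2, 3), 9)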
def make_corr_betas(mt, h2=None, rg=None, cov_array=None, seed=None):
    '''Make correlated betas for multi-trait simulations'''
    seed = seed if seed is not None else int.from_bytes(os.urandom(4), byteorder="big")
    M = mt.count_rows()
    if cov_array is not None:
        n_phens = cov_array.shape[0]
    else:
        n_phens = len(h2)
    if rg is None and cov_array is None:
        print(f'Assuming rg=0 for all {n_phens} traits')
        rg = [0] * int((n_phens**2 - n_phens) / 2)
    if cov_array is None:
        cov_array = create_cov_array(h2, rg)
    cov_array = (1 / M) * cov_array
    randstate = np.random.RandomState(int(seed))  # seed random state for replicability
    betas = randstate.multivariate_normal(mean=np.zeros(n_phens),
                                          cov=cov_array,
                                          size=[M, ])
    df = pd.DataFrame([0] * M, columns=['__beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(__beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])
    mt = mt.add_row_index()
    mt = mt.annotate_rows(__beta=tb[mt.row_idx]['__beta'])
    return mt, betas
def test_import_bgen_variant_filtering(self):
    desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]
    actual = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                            contig_recoding={'01': '1'},
                            reference_genome=None,
                            n_partitions=10,
                            _row_fields=['file_row_idx'],
                            _variants_per_file={
                                resource('example.8bits.bgen'):
                                desired_variant_indexes
                            })
    # doing the expected import_bgen second catches the case where the
    # hadoop configuration is polluted with old data from the
    # _variants_per_file
    everything = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                contig_recoding={'01': '1'},
                                reference_genome=None,
                                _row_fields=['file_row_idx'])
    self.assertEqual(everything.count(), (199, 500))

    expected = everything.filter_rows(
        hl.set(desired_variant_indexes).contains(
            hl.int32(everything.file_row_idx)))

    self.assertTrue(expected._same(actual))
    self.assertEqual(
        (hl.str(actual.locus.contig) + ":" +
         hl.str(actual.locus.position)).collect(),
        ['1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
         '1:13000', '1:15000', '1:19000', '1:100001'])
def multitrait_ss(mt, h2, pi, rg=0, seed=None):
    """Generates spike & slab betas for simulation of two correlated phenotypes.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        :class:`.MatrixTable` for simulated phenotype.
    h2 : :obj:`list`
        Desired SNP-based heritability of simulated traits.
    pi : :obj:`list`
        List of proportions of SNPs: :math:`p_{TT}`, :math:`p_{TF}`, :math:`p_{FT}`.
        :math:`p_{TT}` is the proportion of SNPs that are causal for both traits,
        :math:`p_{TF}` is the proportion of SNPs that are causal for trait 1 but not trait 2,
        :math:`p_{FT}` is the proportion of SNPs that are causal for trait 2 but not trait 1.
        :math:`p_{FF}` is the remaining proportion of SNPs, those causal for neither trait.
    rg : :obj:`float` or :obj:`int`
        Genetic correlation between traits.
    seed : :obj:`int`, optional
        Seed for random number generator. If `seed` is ``None``, `seed` is set randomly.

    Warning
    -------
    May give inaccurate results if chosen parameters make the covariance
    matrix not positive semi-definite.

    Returns
    -------
    :class:`.MatrixTable`
        :class:`.MatrixTable` with simulated SNP effects as a row field of arrays.
    """
    seed = seed if seed is not None else int(str(Env.next_seed())[:8])
    ptt, ptf, pft, pff = pi[0], pi[1], pi[2], 1 - sum(pi)
    cov_matrix = np.asarray([[1 / (ptt + ptf), rg / ptt],
                             [rg / ptt, 1 / (ptt + pft)]])
    M = mt.count_rows()  # was count_cols(); betas are per-SNP, i.e. one per row
    randstate = np.random.RandomState(int(seed))  # seed random state for replicability
    beta = randstate.multivariate_normal(mean=np.zeros(2),
                                         cov=cov_matrix,
                                         size=[int(M), ])
    zeros = np.zeros(shape=int(M)).T
    beta_matrix = np.stack(
        (np.asarray([zeros, zeros]).T, np.asarray([zeros, beta[:, 1]]).T,
         np.asarray([beta[:, 0], zeros]).T, beta),
        axis=1)
    # draw from the seeded random state (was np.random.choice, which ignored the seed)
    idx = randstate.choice([0, 1, 2, 3], p=[pff, pft, ptf, ptt], size=int(M))
    betas = beta_matrix[range(int(M)), idx, :]
    betas[:, 0] *= (h2[0] / M)**(1 / 2)
    betas[:, 1] *= (h2[1] / M)**(1 / 2)
    df = pd.DataFrame([0] * M, columns=['beta'])
    tb = hl.Table.from_pandas(df)
    tb = tb.add_index().key_by('idx')
    tb = tb.annotate(beta=hl.literal(betas.tolist())[hl.int32(tb.idx)])
    mt = mt.add_row_index()
    mt = mt.annotate_rows(beta=tb[mt.row_idx]['beta'])
    return mt
def eye(N, M=None, dtype=hl.tfloat64):
    """
    Construct a 2-D :class:`.NDArrayExpression` with ones on the *main*
    diagonal and zeros elsewhere.

    Parameters
    ----------
    N : :class:`.NumericExpression` or Python number
        Number of rows in the output.
    M : :class:`.NumericExpression` or Python number, optional
        Number of columns in the output. If None, defaults to `N`.
    dtype : numeric :class:`.HailType`, optional
        Element type of the returned array. Defaults to :py:data:`.tfloat64`.

    Returns
    -------
    I : :class:`.NDArrayExpression` representing a Hail ndarray of shape (N, M)
        An ndarray whose elements are equal to one on the main diagonal,
        zeros elsewhere.

    See Also
    --------
    :func:`.identity`
    :func:`.diagonal`

    Examples
    --------
    >>> hl.eval(hl.nd.eye(3))
    array([[1., 0., 0.],
           [0., 1., 0.],
           [0., 0., 1.]])
    >>> hl.eval(hl.nd.eye(2, 5, dtype=hl.tint32))
    array([[1, 0, 0, 0, 0],
           [0, 1, 0, 0, 0]], dtype=int32)
    """
    n_row = hl.int32(N)
    if M is None:
        n_col = n_row
    else:
        n_col = hl.int32(M)

    return hl.nd.array(hl.range(0, n_row * n_col).map(
        lambda i: hl.if_else((i // n_col) == (i % n_col),
                             hl.literal(1, dtype),
                             hl.literal(0, dtype))
    )).reshape((n_row, n_col))
def _promote_scalar(self, typ):
    if typ == tint32:
        return hail.int32(self)
    elif typ == tint64:
        return hail.int64(self)
    elif typ == tfloat32:
        return hail.float32(self)
    else:
        assert typ == tfloat64
        return hail.float64(self)
def compute_fisher_exact(tb: hl.Table,
                         n_cases_col: str,
                         n_control_col: str,
                         total_cases_col: str,
                         total_controls_col: str,
                         correct_total_counts: bool,
                         root_col_name: str,
                         extra_fields: dict) -> hl.Table:
    """
    Perform a two-sided Fisher exact test and add extra annotations (if any).

    :param tb: Hail Table
    :param n_cases_col: field name with number of (affected) cases
    :param n_control_col: field name with number of (affected) controls
    :param total_cases_col: field name with total number of cases
    :param total_controls_col: field name with total number of controls
    :param correct_total_counts: should the total counts (cases/controls) be corrected to avoid double counting?
    :param root_col_name: field to be annotated with test results
    :param extra_fields: extra fields (must be a dict) to be annotated
    :return: Hail Table with Fisher exact test results.
    """
    # compute fisher exact
    if correct_total_counts:
        fet = hl.fisher_exact_test(c1=hl.int32(tb[n_cases_col]),
                                   c2=hl.int32(tb[n_control_col]),
                                   c3=hl.int32(tb[total_cases_col]) - hl.int32(tb[n_cases_col]),
                                   c4=hl.int32(tb[total_controls_col]) - hl.int32(tb[n_control_col]))
    else:
        fet = hl.fisher_exact_test(c1=hl.int32(tb[n_cases_col]),
                                   c2=hl.int32(tb[n_control_col]),
                                   c3=hl.int32(tb[total_cases_col]),
                                   c4=hl.int32(tb[total_controls_col]))

    tb = (tb
          .annotate(**{root_col_name: fet})
          .flatten()
          )

    if len(extra_fields) == 0:
        return tb
    else:
        return tb.annotate(**extra_fields)
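# Usage sketch for compute_fisher_exact on a toy one-row table; the field
# names here are illustrative, not fixed by the function:
ht = hl.Table.parallelize(
    [{'n_cases': 12, 'n_controls': 3, 'total_cases': 100, 'total_controls': 100}],
    hl.tstruct(n_cases=hl.tint32, n_controls=hl.tint32,
               total_cases=hl.tint32, total_controls=hl.tint32))
ht = compute_fisher_exact(ht, 'n_cases', 'n_controls', 'total_cases',
                          'total_controls', correct_total_counts=True,
                          root_col_name='fet', extra_fields={})
# After flattening, the results land in fields such as `fet.p_value` and
# `fet.odds_ratio`.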
def test_ndarray_full():
    assert_ndarrays_eq(
        (hl.nd.zeros(4), np.zeros(4)),
        (hl.nd.zeros((3, 4, 5)), np.zeros((3, 4, 5))),
        (hl.nd.ones(6), np.ones(6)),
        (hl.nd.ones((6, 6, 6)), np.ones((6, 6, 6))),
        (hl.nd.full(7, 9), np.full(7, 9)),
        (hl.nd.full((3, 4, 5), 9), np.full((3, 4, 5), 9)))

    # these were written as `assert expr, msg`, which always passes;
    # they are meant to be equality checks
    assert hl.eval(hl.nd.zeros((5, 5), dtype=hl.tfloat32)).dtype == np.float32
    assert hl.eval(hl.nd.ones(3, dtype=hl.tint64)).dtype == np.int64
    assert hl.eval(hl.nd.full((5, 6, 7), hl.int32(3), dtype=hl.tfloat64)).dtype == np.float64
def parse_as_ranksum(string, has_non_ref):
    typ = hl.ttuple(hl.tfloat64, hl.tint32)
    items = string.split(r'\|')
    items = hl.cond(has_non_ref, items[:-1], items)
    return items.map(lambda s: hl.cond(
        (hl.len(s) == 0) | (s == '.'),
        hl.null(typ),
        hl.rbind(s.split(','), lambda ss: hl.cond(
            hl.len(ss) != 2,  # bad field, possibly 'NaN', just set it null
            hl.null(typ),
            hl.tuple([hl.float64(ss[0]), hl.int32(ss[1])])))))
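# Expected behavior of parse_as_ranksum (a sketch): each pipe-delimited
# item is a "sum,count" pair, and malformed items become missing.
res = hl.eval(parse_as_ranksum(hl.literal('-0.2,45|0.1,33'), hl.literal(False)))
# res == [(-0.2, 45), (0.1, 33)]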
def make_random_function(self, mt):
    from functools import reduce
    # check that row key of annotations matches row key of mt
    mt = mt.add_row_index()
    rows = [rf for rf in self.a_ht.row]
    self.a_ht = self.a_ht.annotate(__a__=reduce(
        self.f, map(lambda x: self.a_ht[rows[x]], range(len(rows)))))
    std = self.a_ht.aggregate(hl.agg.stats(self.a_ht.__a__)).stdev
    self.a_ht = self.a_ht.annotate(__a__=self.a_ht.__a__ * hl.sqrt(self.h2 / std))
    return mt.annotate_rows(beta=hl.literal(
        self.a_ht.__a__.take(mt.count_rows()))[hl.int32(mt.row_idx)])
def _linreg(y, x, nested_dim):
    k = len(x)
    k0 = nested_dim
    if k0 < 0 or k0 > k:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({k}), inclusive")
    t = hl.tstruct(beta=hl.tarray(hl.tfloat64),
                   standard_error=hl.tarray(hl.tfloat64),
                   t_stat=hl.tarray(hl.tfloat64),
                   p_value=hl.tarray(hl.tfloat64),
                   multiple_standard_error=hl.tfloat64,
                   multiple_r_squared=hl.tfloat64,
                   adjusted_r_squared=hl.tfloat64,
                   f_stat=hl.tfloat64,
                   multiple_p_value=hl.tfloat64,
                   n=hl.tint64)
    x = hl.array(x)
    k = hl.int32(k)
    k0 = hl.int32(k0)
    return _agg_func('LinearRegression', [y, x], t, [k, k0])
def diagonal(nd):
    """Gets the diagonal of a 2 dimensional NDArray.

    Examples
    --------
    >>> hl.eval(hl.nd.diagonal(hl.nd.array([[1, 2], [3, 4]])))
    array([1, 4], dtype=int32)

    :param nd: A 2 dimensional NDArray, shape(M, N).
    :return: A 1 dimension NDArray of length min(M, N), containing the diagonal of `nd`.
    """
    assert nd.ndim == 2, "diagonal requires 2 dimensional ndarray"
    shape_min = hl.min(nd.shape[0], nd.shape[1])
    return hl.nd.array(hl.range(hl.int32(shape_min)).map(lambda i: nd[i, i]))
def get_phased_gnomad_ht(ht: hl.Table,
                         em: bool = True,
                         lr: bool = True,
                         shr: bool = True) -> hl.Table:
    expr_fun = []

    if em:
        expr_fun.append(get_em_expressions)

    if lr:
        expr_fun.append(get_lr_expressions)

    if shr:
        expr_fun.append(get_single_het_expressions)

    if not expr_fun:
        raise Exception("No expressions to annotate")

    # Support for both exploded or dict versions of gt_counts
    # dict
    if isinstance(ht.gt_counts, hl.expr.DictExpression):
        ht = ht.select(
            phase_info=ht.gt_counts.map_values(lambda pop_count: hl.bind(
                lambda x: hl.struct(
                    gt_counts=x,
                    **{k: v for f in expr_fun for k, v in f(x).items()}),
                hl.struct(raw=pop_count.raw.map(lambda y: hl.int32(y)),
                          adj=pop_count.adj.map(lambda z: hl.int32(z))))))
    # exploded
    else:
        ht = ht.annotate(
            **{k: v for f in expr_fun for k, v in f(ht.gt_counts).items()})

    return ht
def _get(self, var_df, samples, field):
    # array with samples in HAIL
    if isinstance(samples, str):
        samples = [samples]

    # create table with vars in HAIL
    ht = hl.Table.from_pandas(var_df)

    # create table with samples
    df = pd.DataFrame({'s': samples})
    ht_samples = hl.Table.from_pandas(df)
    ht = ht.join(ht_samples)
    ht = ht.annotate(pos=hl.int32(ht.pos))
    ht = ht.add_index()
    ht = ht.key_by(locus=hl.struct(contig=ht.chrom, position=ht.pos),
                   alleles=hl.array([ht.ref, ht.alt]),
                   s=ht.s)

    # all variants per sample
    res_table = None
    ht_paths = self._get_Tables_paths(samples)

    # iterate through ht_vcfs with samples
    for ht_path in ht_paths:
        ht_vcf = hl.read_table(ht_path)
        ht_n = ht.join(ht_vcf, how='left')
        if res_table is None:
            res_table = ht_n
            res_table = res_table.checkpoint('db/checkpoint/ht1.ht', overwrite=True)
        else:
            res_table = res_table.union(ht_n)

    # all variants per sample
    res_table = res_table.annotate(GT=hl.coalesce(res_table.GT, 0))
    res_table = res_table.annotate(DP=hl.coalesce(res_table.DP, 0))
    res_table = res_table.annotate(GQ=hl.coalesce(res_table.GQ, 0))
    res_table = res_table.order_by(res_table.idx)
    res_table = res_table.checkpoint('db/checkpoint/ht2.ht', overwrite=True)

    return np.column_stack([
        np.array(res_table.filter(
            res_table.s == sample)[field].collect()).reshape(-1, 1)
        for sample in samples
    ])
def get_phen_files(nsamples, min_id, max_id, parsplit, paridx):
    nsamples = str(int(nsamples / 1000))
    print(f'\r#########\nGetting phen files for {nsamples}k samples\n#########')
    mt0 = hl.read_matrix_table('gs://nbaya/ldscsim/hm3.50_sim_h2_0.08.mt/')
    ht0 = mt0.select_cols(mt0.nonsim_phen).cols()
    ht1 = ht0.rename({'s': 'IID', 'nonsim_phen': 'y'})
    ht1 = ht1.annotate(FID='0')
    ht1 = ht1.key_by(ht1.FID)
    ht1 = ht1.select(ht1.IID, ht1.y)
    ht1 = ht1.key_by(ht1.IID)
    ids = hl.import_table(f'gs://nbaya/split/gcta/gcta_{nsamples}k.grm.id',
                          no_header=True)  # GRM ids
    ids = ids.rename({'f0': 'FID', 'f1': 'IID'})
    ids = set(ids.IID.take(ids.count()))
    ht2 = ht1.filter(hl.literal(ids).contains(ht1['IID']))
    n = ht2.count()
    # replicate "IDs", which were used as seeds to generate the random split
    rep_ids = range(min_id + paridx - 1, max_id + 1, parsplit)
    for rep_id in rep_ids:
        try:
            is_complete = subprocess.check_output([
                'gsutil', 'ls',
                f'gs://nbaya/split/gcta/gcta_{nsamples}k.s{rep_id}.phen'
            ]) is not None
        except subprocess.CalledProcessError:  # gsutil ls fails if the file doesn't exist
            is_complete = False
        if not is_complete:
            start = datetime.now()
            pi = [1] * int(n / 2) + [0] * int(n / 2)
            randstate = np.random.RandomState(rep_id)
            randstate.shuffle(pi)
            ht = ht2.add_index()
            ht = ht.annotate(label=hl.literal(pi)[hl.int32(ht.idx)])
            ht = ht.annotate(y1=hl.cond(ht.label == 1, ht.y, hl.null('float')))
            ht = ht.annotate(y2=hl.cond(ht.label == 0, ht.y, hl.null('float')))
            ht = ht.drop(ht.idx, ht.label, ht.y)
            ht = ht.order_by(ht.y1)
            ht.show()
            ht.export(f'gs://nbaya/split/gcta/gcta_{nsamples}k.s{rep_id}.phen')
            runtime = datetime.now() - start
            print(f'######\nRuntime for generating phenfile of rep {rep_id}: '
                  f'{round((runtime.total_seconds())/60, 4)} min')
        else:
            print(f'###### Already completed phenfile for replicate #{rep_id}')
def unify_saige_ht_variant_schema(ht):
    shared = ('markerID', 'AC', 'AF', 'N', 'BETA', 'SE', 'Tstat', 'varT', 'varTstar')
    new_floats = ('AF.Cases', 'AF.Controls')
    new_ints = ('N.Cases', 'N.Controls')
    shared_end = ('Pvalue', 'gene', 'annotation')
    if 'AF.Cases' not in list(ht.row):
        ht = ht.select(*shared,
                       **{field: hl.null(hl.tfloat64) for field in new_floats},
                       **{field: hl.null(hl.tint32) for field in new_ints},
                       **{field: ht[field] for field in shared_end})
    else:
        ht = ht.select(*shared, *new_floats, *new_ints, *shared_end)
    return ht.annotate(SE=hl.float64(ht.SE), AC=hl.int32(ht.AC))
def get_coverage_expr(mt):
    cov_arrays = hl.literal({
        x: [1, 1, 1, 1, 1, 1, 1, 1, 0] if x >= 50
        else [1, 1, 1, 1, 1, 1, 1, 0, 0] if x >= 30
        else ([1] * (i + 2)) + ([0] * (7 - i))
        for i, x in enumerate(range(5, 100, 5))
    })

    return hl.bind(
        lambda array_expr: hl.struct(
            **{
                f'over_{x}': hl.int32(array_expr[i])
                for i, x in enumerate([1, 5, 10, 15, 20, 25, 30, 50, 100])
            }),
        hl.agg.array_sum(
            hl.case()
            .when(mt.x >= 100, [1, 1, 1, 1, 1, 1, 1, 1, 1])
            .when(mt.x >= 5, cov_arrays[mt.x - (mt.x % 5)])
            .when(mt.x >= 1, [1, 0, 0, 0, 0, 0, 0, 0, 0])
            .default([0, 0, 0, 0, 0, 0, 0, 0, 0])))
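# Plain-Python sanity check of the binned indicator arrays built by
# get_coverage_expr: for a DP floored to a multiple of 5, the array marks
# which of the thresholds [1, 5, 10, 15, 20, 25, 30, 50, 100] are reached.
cov_arrays = {
    x: [1, 1, 1, 1, 1, 1, 1, 1, 0] if x >= 50
    else [1, 1, 1, 1, 1, 1, 1, 0, 0] if x >= 30
    else ([1] * (i + 2)) + ([0] * (7 - i))
    for i, x in enumerate(range(5, 100, 5))
}
assert cov_arrays[20] == [1, 1, 1, 1, 1, 0, 0, 0, 0]  # DP 20 reaches 1..20
assert cov_arrays[55] == [1, 1, 1, 1, 1, 1, 1, 1, 0]  # DP 55 reaches 1..50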
def hailBlanczos(A, G, k, q):
    # Randomized (Blanczos) SVD: q power iterations accumulate sketches
    # H_i = A @ G_i; the QR factor Q of the stacked sketches captures the
    # top singular subspace of A.
    h_list = []
    G_i = hl.nd.qr(G)[0]

    for j in range(0, q):
        info(f"blanczos_pca: Beginning iteration {j + 1}/{q + 1}")
        temp = A.annotate(H_i=A.ndarray @ G_i)
        temp = temp.annotate(G_i_intermediate=temp.ndarray.T @ temp.H_i)
        result = temp.aggregate(hl.struct(
            Hi_chunks=hl.agg.collect(temp.H_i),
            G_i=hl.agg.ndarray_sum(temp.G_i_intermediate)),
            _localize=False)._persist()
        localized_H_i = hl.nd.vstack(result.Hi_chunks)
        h_list.append(localized_H_i)
        G_i = hl.nd.qr(result.G_i)[0]

    info(f"blanczos_pca: Beginning iteration {q + 1}/{q + 1}")
    temp = A.annotate(H_i=A.ndarray @ G_i)
    result = temp.aggregate(hl.agg.collect(temp.H_i), _localize=False)._persist()

    info("blanczos_pca: Iterations complete. Computing local QR")
    localized_H_i = hl.nd.vstack(result)
    h_list.append(localized_H_i)
    H = hl.nd.hstack(h_list)
    Q = hl.nd.qr(H)[0]._persist()

    # Project each partition's block of rows onto Q and sum to get T = Q.T @ A
    A = A.annotate(part_size=A.ndarray.shape[0])
    A = A.annotate(rows_preceeding=hl.int32(hl.scan.sum(A.part_size)))
    A = A.annotate_globals(Qt=Q.T)
    T = A.annotate(ndarray=A.Qt[:, A.rows_preceeding:A.rows_preceeding + A.part_size] @ A.ndarray)
    arr_T = T.aggregate(hl.agg.ndarray_sum(T.ndarray), _localize=False)

    info("blanczos_pca: QR Complete. Computing local SVD")
    U, S, W = hl.nd.svd(arr_T, full_matrices=False)._persist()

    V = Q @ U
    truncV = V[:, :k]
    truncS = S[:k]
    truncW = W[:k, :]
    return truncV, truncS, truncW
def count(expr=None):
    """Count the number of records.

    Examples
    --------
    Group by the `SEX` field and count the number of rows in each category:

    .. doctest::

        >>> (table1.group_by(table1.SEX)
        ...        .aggregate(n=agg.count())
        ...        .show())
        +-----+-------+
        | SEX |     n |
        +-----+-------+
        | str | int64 |
        +-----+-------+
        | M   |     2 |
        | F   |     2 |
        +-----+-------+

    Notes
    -----
    If `expr` is not provided, then this method will count the number of
    records aggregated. If `expr` is provided, then the result should make
    use of :meth:`filter` or :meth:`explode` so that the number of records
    aggregated changes.

    Parameters
    ----------
    expr : :class:`.Expression`, or :obj:`None`
        Expression to count.

    Returns
    -------
    :class:`.Expression` of type :py:data:`.tint64`
        Total number of records.
    """
    if expr is not None:
        return _agg_func('count', expr, tint64)
    else:
        return _agg_func('count', _to_agg(hl.int32(0)), tint64)
def fet_expr(het_count_exp: hl.expr.Int64Expression,
             hom_count_expr: hl.expr.Int64Expression):
    # note: relies on `mt.is_case` from the enclosing scope; x[0] holds
    # [ref/ref, het, hom] genotype-class counts in controls, x[1] in cases
    return hl.bind(
        lambda x: hl.struct(
            counts=x,
            dominant=hl.fisher_exact_test(x[0][0], x[0][1] + x[0][2],
                                          x[1][0], x[1][1] + x[1][2]),
            recessive=hl.fisher_exact_test(x[0][0] + x[0][1], x[0][2],
                                           x[1][0] + x[1][1], x[1][2])),
        hl.bind(
            lambda x: [
                [
                    hl.int32(hl.cond(x.contains(False), x[False].get(0, 0), 0)),
                    hl.int32(hl.cond(x.contains(False), x[False].get(1, 0), 0)),
                    hl.int32(hl.cond(x.contains(False), x[False].get(2, 0), 0))
                ],
                [
                    hl.int32(hl.cond(x.contains(True), x[True].get(0, 0), 0)),
                    hl.int32(hl.cond(x.contains(True), x[True].get(1, 0), 0)),
                    hl.int32(hl.cond(x.contains(True), x[True].get(2, 0), 0))
                ],
            ],
            hl.agg.group_by(
                mt.is_case,
                hl.agg.counter(hl.min(2, het_count_exp + 2 * hom_count_expr)))))
def compute_coverage_stats(
    mt: hl.MatrixTable,
    reference_ht: hl.Table,
    coverage_over_x_bins: List[int] = [1, 5, 10, 15, 20, 25, 30, 50, 100],
) -> hl.Table:
    """
    Computes the following coverage statistics for every base of the `reference_ht` provided:
        - mean
        - median
        - total DP
        - fraction of samples with coverage above X, for each x in `coverage_over_x_bins`

    The `reference_ht` is a table that contains a row for each locus coverage should be
    computed on. It needs to be keyed with the same keys as `mt`, typically either
    `locus` or `locus, alleles`. The `reference_ht` can e.g. be created using
    `get_reference_ht`.

    :param mt: Input sparse MT
    :param reference_ht: Input reference HT
    :param coverage_over_x_bins: List of boundaries for computing samples over X
    :return: Table with per-base coverage stats
    """
    n_samples = mt.count_cols()
    print(f"Computing coverage stats on {n_samples} samples.")

    # Create an outer join with the reference Table
    mt = mt.select_entries("END", "DP").select_cols().select_rows()
    col_key_fields = list(mt.col_key)
    t = mt._localize_entries("__entries", "__cols")
    t = t.join(reference_ht.key_by(*mt.row_key).select(_in_ref=True), how="outer")
    t = t.annotate(
        __entries=hl.or_else(
            t.__entries,
            hl.range(n_samples).map(lambda x: hl.null(t.__entries.dtype.element_type)),
        )
    )
    mt = t._unlocalize_entries("__entries", "__cols", col_key_fields)

    # Densify
    mt = hl.experimental.densify(mt)

    # Filter rows where the reference is missing
    mt = mt.filter_rows(mt._in_ref)

    # Unfilter entries so that entries with no ref block overlap aren't null
    mt = mt.unfilter_entries()

    # Compute coverage stats
    coverage_over_x_bins = sorted(coverage_over_x_bins)
    max_coverage_bin = coverage_over_x_bins[-1]
    hl_coverage_over_x_bins = hl.array(coverage_over_x_bins)

    # This expression creates a counter DP -> number of samples for DP between 0 and max_coverage_bin
    coverage_counter_expr = hl.agg.counter(
        hl.min(max_coverage_bin, hl.or_else(mt.DP, 0))
    )

    # This expression aggregates the DP counter in reverse order of the coverage_over_x_bins
    # and computes the cumulative sum over them.
    # It needs to be in reverse order because we want the sum over samples covered by > X.
    count_array_expr = hl.cumulative_sum(
        hl.array(
            # The coverage was already floored to the max_coverage_bin, so no more aggregation is needed for the max bin
            [hl.int32(coverage_counter_expr.get(max_coverage_bin, 0))]
        ).extend(
            # For each of the other bins, coverage needs to be summed between the boundaries
            hl.range(hl.len(hl_coverage_over_x_bins) - 1, 0, step=-1).map(
                lambda i: hl.sum(
                    hl.range(
                        hl_coverage_over_x_bins[i - 1], hl_coverage_over_x_bins[i]
                    ).map(lambda j: hl.int32(coverage_counter_expr.get(j, 0)))
                )
            )
        )
    )
    mean_expr = hl.agg.mean(hl.or_else(mt.DP, 0))

    # Annotate rows now
    return mt.select_rows(
        mean=hl.cond(hl.is_nan(mean_expr), 0, mean_expr),
        median_approx=hl.or_else(hl.agg.approx_median(hl.or_else(mt.DP, 0)), 0),
        total_DP=hl.agg.sum(mt.DP),
        **{
            f"over_{x}": count_array_expr[i] / n_samples
            for i, x in zip(
                # Reverse the bin index as count_array_expr has the reverse order
                range(len(coverage_over_x_bins) - 1, -1, -1),
                coverage_over_x_bins,
            )
        },
    ).rows()
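# Plain-Python analogue (a sketch) of the reverse cumulative sum used in
# compute_coverage_stats above, with a toy DP counter:
import itertools

bins = [1, 5, 10, 15, 20, 25, 30, 50, 100]
counter = {0: 2, 3: 1, 12: 4, 100: 3}  # DP -> n_samples, DP floored at 100
counts = [counter.get(bins[-1], 0)]  # samples already floored to the max bin
for i in range(len(bins) - 1, 0, -1):
    counts.append(sum(counter.get(j, 0) for j in range(bins[i - 1], bins[i])))
cumulative = list(itertools.accumulate(counts))
# cumulative[k] counts samples with DP >= bins[len(bins) - 1 - k], e.g.
# cumulative[-1] == 8 samples with DP >= 1 (the two DP-0 samples excluded)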
def test_locus_windows(self):
    def assert_eq(a, b):
        self.assertTrue(np.array_equal(a, np.array(b)))

    centimorgans = hl.literal([0.1, 1.0, 1.0, 1.5, 1.9])

    mt = hl.balding_nichols_model(1, 5, 5).add_row_index()
    mt = mt.annotate_rows(cm=centimorgans[hl.int32(mt.row_idx)]).cache()

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 2)
    assert_eq(starts, [0, 0, 0, 1, 2])
    assert_eq(stops, [3, 4, 5, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(mt.locus, 0.5, coord_expr=mt.cm)
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    starts, stops = hl.linalg.utils.locus_windows(
        mt.locus, 1.0, coord_expr=2 * centimorgans[hl.int32(mt.row_idx)])
    assert_eq(starts, [0, 1, 1, 1, 3])
    assert_eq(stops, [1, 4, 4, 5, 5])

    rows = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
            {'locus': hl.Locus('1', 2), 'cm': 3.0},
            {'locus': hl.Locus('1', 4), 'cm': 4.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('2', 1), 'cm': 2.0},
            {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    ht = hl.Table.parallelize(rows,
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1)
    assert_eq(starts, [0, 0, 2, 3, 3, 5])
    assert_eq(stops, [2, 2, 3, 5, 5, 6])

    starts, stops = hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    assert_eq(starts, [0, 1, 1, 3, 3, 5])
    assert_eq(stops, [1, 3, 3, 5, 5, 6])

    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.order_by(ht.cm).locus, 1.0)
    self.assertTrue('ascending order' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0,
                                      coord_expr=hl.utils.range_table(1).idx)
    self.assertTrue('different source' in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(hl.locus('1', 1), 1.0)
    self.assertTrue("no source" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=0.0)
    self.assertTrue("no source" in str(cm.exception))

    ht = ht.annotate_globals(x=hl.locus('1', 1), y=1.0)

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.x, 1.0)
    self.assertTrue("row-indexed" in str(cm.exception))

    with self.assertRaises(ExpressionException) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, ht.y)
    self.assertTrue("row-indexed" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.null(hl.tlocus()), 'cm': 1.0}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'locus_expr'" in str(cm.exception))

    ht = hl.Table.parallelize([{'locus': hl.Locus('1', 1), 'cm': hl.null(hl.tfloat64)}],
                              hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
                              key=['locus'])
    with self.assertRaises(ValueError) as cm:
        hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    self.assertTrue("missing value for 'coord_expr'" in str(cm.exception))