def gather(ht, key, value, *fields) -> Table:
    """Stack several fields of a table into key-value rows.

    :func:`.gather` follows the ``gather()`` function of R's ``tidyr``
    package: it converts "wide" format data into "long" format data. Every
    input row is exploded into one output row per gathered field.

    Parameters
    ----------
    ht : :class:`.Table`
        A Hail table.
    key : :obj:`str`
        Name of the output field that receives the gathered field's name.
    value : :obj:`str`
        Name of the output field that receives the gathered field's value.
    fields : variable-length args of :obj:`str`
        Names of the fields of ``ht`` to gather.

    Returns
    -------
    :class:`.Table`
        Table in which the original ``fields`` are replaced by the ``key``
        and ``value`` fields.
    """
    # Each pair is [field name, field value]; both live in one Hail array.
    # NOTE(review): presumably this requires the gathered fields to be
    # strings, like the names -- confirm against callers.
    name_value_pairs = [hl.array([name, ht[name]]) for name in fields]

    gathered = ht.annotate(_col_val=hl.array(name_value_pairs))
    gathered = gathered.drop(*fields)
    gathered = gathered.explode(gathered['_col_val'])
    gathered = gathered.annotate(**{
        key: gathered['_col_val'][0],
        value: gathered['_col_val'][1],
    })
    gathered = gathered.drop('_col_val')

    # Write the result to a temporary file and hand back the re-read table.
    tmp_path = new_temp_file()
    gathered.write(tmp_path)
    return hl.read_table(tmp_path)
def test_concatenate():
    """Check hl.nd.concatenate against np.concatenate for several input forms.

    Covers Python lists, Python tuples and Hail arrays of ndarrays, both with
    an explicit ``axis=1`` and with the default axis.
    """

    def check(expected, operand, **kwargs):
        # Evaluate the Hail concatenation and compare it with the numpy result.
        actual = hl.eval(hl.nd.concatenate(operand, **kwargs))
        assert np.array_equal(expected, actual)

    # Float matrices with different column counts, joined along axis 1.
    left = np.array([[1., 2.], [3., 4.]])
    right = np.array([[5.], [6.]])
    expected = np.concatenate([left, right], axis=1)
    check(expected, [left, right], axis=1)
    check(expected, hl.array([left, right]), axis=1)

    # Integer column vectors joined along the default axis, from a list.
    left = np.array([[1], [3]])
    right = np.array([[5], [6]])
    as_list = [left, right]
    as_hail_array = hl.array(as_list)
    expected = np.concatenate(as_list)
    check(expected, as_list)
    check(expected, as_hail_array)

    # Same inputs supplied as a tuple.
    as_tuple = (left, right)
    as_hail_array = hl.array([left, right])
    expected = np.concatenate(as_tuple)
    check(expected, as_tuple)
    check(expected, as_hail_array)
def merge_alleles(alleles):
    """Merge per-dataset allele arrays into one global allele list.

    ``alleles`` is an array expression whose elements are allele arrays with
    the reference allele at index 0. The longest reference allele (missing
    references count as the empty string) becomes the merged reference.

    Returns a struct expression with two fields:

    - ``globl``: the merged reference followed by the distinct non-reference
      alleles of all inputs.
    - ``local``: for each input, its alleles re-expressed against the merged
      reference.
    """
    from hail.expr.functions import _num_allele_type, _allele_ints

    def longer_of(s, t):
        # Keep the longer of two strings (the first one wins ties with '').
        return hl.cond(hl.len(s) > hl.len(t), s, t)

    def build_with_reference(ref):
        def extend_alt(r, a):
            # Alleles of these types get the tail of the merged reference
            # that lies beyond the local reference `r` appended.
            def by_allele_type(at):
                needs_padding = ((_allele_ints['SNP'] == at)
                                 | (_allele_ints['Insertion'] == at)
                                 | (_allele_ints['Deletion'] == at)
                                 | (_allele_ints['MNP'] == at)
                                 | (_allele_ints['Complex'] == at))
                return hl.cond(needs_padding, a + ref[hl.len(r):], a)

            return hl.rbind(_num_allele_type(r, a), by_allele_type)

        def relocalize(al):
            # Replace the local reference with the merged one, then rewrite
            # every alternate allele relative to it.
            def with_local_ref(r):
                rewritten_alts = al[1:].map(lambda a: extend_alt(r, a))
                return hl.array([ref]).extend(rewritten_alts)

            return hl.rbind(al[0], with_local_ref)

        def assemble(lal):
            distinct_alts = hl.array(hl.set(hl.flatten(lal)).remove(ref))
            return hl.struct(
                globl=hl.array([ref]).extend(distinct_alts),
                local=lal)

        return hl.rbind(alleles.map(relocalize), assemble)

    longest_ref = alleles.map(lambda a: hl.or_else(a[0], '')).fold(longer_of, '')
    return hl.rbind(longest_ref, build_with_reference)
def test_to_table_on_various_fields(self):
    """Check collect()/take() ordering for fields of a re-keyed matrix table.

    Fields read directly from the matrix table are expected in the original
    column order (and rows in row-key order); fields read through cols(),
    entries() and rows() are expected in sorted key order.
    """
    sample_ids = ['Bob', 'Alice', 'David', 'Carol']
    entries = [1, 0, 3, 2]
    rows = ['1:3:A:G', '1:2:A:G', '1:0:A:G']

    mt = hl.utils.range_matrix_table(3, 4)
    mt = mt.annotate_cols(s=hl.array(sample_ids)[mt.col_idx]).key_cols_by('s')
    mt = mt.annotate_entries(e=hl.array(entries)[mt.col_idx])
    mt = mt.annotate_rows(r=hl.array(rows)[mt.row_idx]).key_rows_by('r')

    rows_sorted = sorted(rows)
    ids_sorted = sorted(sample_ids)
    entries_sorted = sorted(entries)

    # (expression, expected collect() result, whether take(1) is also checked)
    checks = [
        (mt.s, sample_ids, True),
        (mt.e, entries * 3, True),
        (mt.row_idx, [2, 1, 0], False),
        (mt.r, rows_sorted, True),
        (mt.cols().s, ids_sorted, True),
        (mt.entries().e, entries_sorted * 3, True),
        (mt.rows().row_idx, [2, 1, 0], False),
        (mt.rows().r, rows_sorted, True),
    ]
    for field, expected, check_take in checks:
        self.assertEqual(field.collect(), expected)
        if check_take:
            self.assertEqual(field.take(1), [expected[0]])
def combine(ts):
    """Combine the per-dataset records collected in ``ts.data`` into one row.

    Row annotations are merged (alleles via ``merge_alleles``, first defined
    rsid, union of filters, summed info fields), the ``__entries`` arrays of
    all datasets are concatenated with their allele indices renumbered into
    the merged allele list, and the ``__cols`` globals are flattened. The
    helper fields ``data`` and ``g`` are dropped from the result.
    """
    # pylint: disable=protected-access

    def summed_info(field):
        # Sum one scalar info field across all datasets.
        return hl.sum(ts.data.map(lambda d: d.info[field]))

    def summed_sb(idx):
        # Sum one component of the SB array across all datasets.
        return hl.sum(ts.data.map(lambda d: d.info.SB[idx]))

    scalar_fields = ('DP', 'MQ_DP', 'QUALapprox', 'RAW_MQ', 'VarDP')
    merged = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        info=hl.struct(
            **{name: summed_info(name) for name in scalar_fields},
            SB=hl.array([summed_sb(idx) for idx in range(4)])))

    def entries_for(i, combined_allele_index):
        dataset = merged.data[i]
        # A dataset without entries contributes one missing entry per column.
        placeholders = hl.range(0, hl.len(merged.g[i].__cols)).map(
            lambda _: hl.null(dataset.__entries.dtype.element_type))
        # Position k holds the merged index of the dataset's k-th allele.
        old_to_new = hl.range(0, hl.len(dataset.alleles)).map(
            lambda j: combined_allele_index[dataset.alleles[j]])
        renumbered = hl.bind(
            lambda mapping: dataset.__entries.map(
                lambda e: renumber_entry(e, mapping)),
            old_to_new)
        return hl.cond(hl.is_missing(dataset.__entries), placeholders, renumbered)

    # Merged allele -> index in the merged allele list (0-based).
    allele_to_index = hl.dict(
        hl.range(0, hl.len(merged.alleles)).map(
            lambda j: hl.tuple([merged.alleles[j], j])))

    with_entries = merged.annotate(
        __entries=hl.bind(
            lambda combined_allele_index: hl.range(0, hl.len(merged.data)).flatmap(
                lambda i: entries_for(i, combined_allele_index)),
            allele_to_index))

    with_cols = with_entries.annotate_globals(
        __cols=hl.flatten(with_entries.g.map(lambda g: g.__cols)))
    return with_cols.drop('data', 'g')
def phase_haploid_proband_x_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

    The proband's single allele is looked up among the mother's two alleles;
    the result is missing when neither maternal allele matches it. The father
    call is only reported when it is haploid.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    maternal_alleles = hl.zip_with_index(
        hl.array([mother_call[0], mother_call[1]]))
    # First (index, allele) pair of the mother matching the proband's allele.
    transmitted_allele = maternal_alleles.find(
        lambda m: m[1] == proband_call[0])

    phased_proband = hl.call(proband_call[0], phased=True)
    phased_father = hl.or_missing(
        father_call.is_haploid(),
        hl.call(father_call[0], phased=True))
    phased_mother = phase_parent_call(mother_call, transmitted_allele[0])

    return hl.or_missing(
        hl.is_defined(transmitted_allele),
        hl.array([phased_proband, phased_father, phased_mother]))
def test_table_filter_intervals(self):
    """Check hl.filter_intervals on a table for several interval input forms.

    Intervals are supplied as a Python list of expressions, a Hail array of
    expressions, a Hail array mixing an evaluated interval with an
    expression, and a Python list of evaluated intervals.
    """
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20).rows()

    def kept_rows(intervals):
        return hl.filter_intervals(ds, intervals).count()

    single = [hl.parse_locus_interval('20:10639222-10644705')]
    self.assertEqual(kept_rows(single), 3)

    lower = '20:10639222-10644700'
    upper = '20:10644700-10644705'

    python_list = [
        hl.parse_locus_interval(lower),
        hl.parse_locus_interval(upper),
    ]
    self.assertEqual(kept_rows(python_list), 3)

    hail_array = hl.array([
        hl.parse_locus_interval(lower),
        hl.parse_locus_interval(upper),
    ])
    self.assertEqual(kept_rows(hail_array), 3)

    mixed_array = hl.array([
        hl.eval(hl.parse_locus_interval(lower)),
        hl.parse_locus_interval(upper),
    ])
    self.assertEqual(kept_rows(mixed_array), 3)

    evaluated = [
        hl.eval(hl.parse_locus_interval('[20:10019093-10026348]')),
        hl.eval(hl.parse_locus_interval('[20:17705793-17716416]')),
    ]
    self.assertEqual(kept_rows(evaluated), 4)
def test_agg_cols_group_by(self):
    """Check agg.group_by over columns, alone and combined with filter/explode."""
    t = hl.utils.range_matrix_table(1, 10)

    def shifted_indices_then_zero():
        # Set of col_idx + 1 as an array, with a trailing 0 appended.
        return hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)

    by_parity = agg.group_by(t.col_idx % 2, shifted_indices_then_zero())

    filtered_by_mod3 = agg.group_by(
        t.col_idx % 3,
        agg.filter(t.col_idx > 7, shifted_indices_then_zero()))

    exploded_by_mod3 = agg.group_by(
        t.col_idx % 3,
        agg.explode(
            lambda elt: agg.collect(elt + 1).append(0),
            hl.cond(t.col_idx > 7,
                    [t.col_idx, t.col_idx + 1],
                    hl.empty_array(hl.tint32))))

    cases = [
        (by_parity, {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]}),
        (filtered_by_mod3, {0: [10, 0], 1: [0], 2: [9, 0]}),
        (exploded_by_mod3, {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]}),
    ]
    for aggregation, expected in cases:
        first_row = t.select_rows(result=aggregation).result.collect()[0]
        self.assertEqual(first_row, expected)
def import_cadd_table(path: str, genome_version: str, partitions) -> hl.Table:
    """Import a headerless, bgzipped CADD score file as a keyed Hail table.

    :param path: location of the CADD file
    :param genome_version: "37" or "38"; for "38" contigs get a "chr" prefix
    :param partitions: passed to ``import_table`` as ``min_partitions``
    :return: table keyed by ``locus`` and ``alleles``
    :raises ValueError: if ``genome_version`` is neither "37" nor "38"
    """
    if genome_version not in ("37", "38"):
        raise ValueError(f"Invalid genome version: {genome_version}")

    is_grch38 = genome_version == "38"

    renames = {
        'f0': 'chrom',
        'f1': 'pos',
        'f2': 'ref',
        'f3': 'alt',
        'f4': 'RawScore',
        'f5': 'PHRED',
    }
    field_types = {
        'f0': hl.tstr,
        'f1': hl.tint,
        'f4': hl.tfloat32,
        'f5': hl.tfloat32,
    }

    cadd_ht = import_table(path, force_bgz=True, comment="#", no_header=True,
                           types=field_types, min_partitions=partitions)
    cadd_ht = cadd_ht.rename(renames)

    if is_grch38:
        chrom = hl.format("chr%s", cadd_ht.chrom)
    else:
        chrom = cadd_ht.chrom
    reference = hl.get_reference(f"GRCh{genome_version}")
    cadd_ht = cadd_ht.transmute(
        locus=hl.locus(chrom, cadd_ht.pos, reference_genome=reference),
        alleles=hl.array([cadd_ht.ref, cadd_ht.alt]))

    # Start from an empty table, then union in one contig batch at a time.
    cadd_union_ht = cadd_ht.head(0)
    contig_batches = (range(1, 10), list(range(10, 23)) + ["X", "Y", "MT"])
    for batch in contig_batches:
        if is_grch38:
            batch = ["chr%s" % contig for contig in batch]
        contig_names = hl.array([str(contig) for contig in batch])
        subset = cadd_ht.filter(contig_names.contains(cadd_ht.locus.contig))
        cadd_union_ht = cadd_union_ht.union(subset)

    cadd_union_ht = cadd_union_ht.key_by("locus", "alleles")
    cadd_union_ht.describe()
    return cadd_union_ht
def combine(ts):
    """Combine the per-dataset records collected in ``ts.data`` into one row.

    Alleles are merged with ``merge_alleles``, the first defined rsid is
    kept and the info fields are summed. The ``__entries`` arrays of all
    datasets are concatenated after renumbering their allele indices; the
    renumbering table has a leading 0 and the merged alleles are indexed
    from 1. The ``__cols`` globals are flattened and the helper fields
    ``data`` and ``g`` are dropped.
    """
    # pylint: disable=protected-access

    def info_total(field):
        # Sum one scalar info field across all datasets.
        return hl.sum(ts.data.map(lambda d: d.info[field]))

    def sb_total(idx):
        # Sum one component of SB_TABLE across all datasets.
        return hl.sum(ts.data.map(lambda d: d.info.SB_TABLE[idx]))

    summed_fields = ('MQ_DP', 'QUALapprox', 'RAW_MQ', 'VarDP')
    merged = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        info=hl.struct(
            **{name: info_total(name) for name in summed_fields},
            SB_TABLE=hl.array([sb_total(idx) for idx in range(4)])))

    def entries_for(i, combined_allele_index):
        record = merged.data[i]
        # A dataset without entries contributes one missing entry per column.
        missing_entries = hl.range(0, hl.len(merged.g[i].__cols)).map(
            lambda _: hl.null(record.__entries.dtype.element_type))
        # Leading 0, then the merged (1-based) index of each local allele.
        old_to_new = hl.array([0]).extend(
            hl.range(0, hl.len(record.alleles)).map(
                lambda j: combined_allele_index[record.alleles[j]]))
        renumbered = hl.bind(
            lambda mapping: record.__entries.map(
                lambda e: renumber_entry(e, mapping)),
            old_to_new)
        return hl.cond(hl.is_missing(record.__entries), missing_entries, renumbered)

    # Merged allele -> 1-based position in the merged allele list.
    allele_to_index = hl.dict(
        hl.range(1, hl.len(merged.alleles) + 1).map(
            lambda j: hl.tuple([merged.alleles[j - 1], j])))

    with_entries = merged.annotate(
        __entries=hl.bind(
            lambda combined_allele_index: hl.range(0, hl.len(merged.data)).flatmap(
                lambda i: entries_for(i, combined_allele_index)),
            allele_to_index))

    with_cols = with_entries.annotate_globals(
        __cols=hl.flatten(with_entries.g.map(lambda g: g.__cols)))
    return with_cols.drop('data', 'g')
def test_ndarray_eval():
    """Check construction and evaluation of Hail ndarrays against numpy.

    Covers Python lists, numpy literals with non-default strides, Hail
    arrays, missing input, and ragged inputs that must be rejected.
    """
    data_list = [[1, 2, 3], [4, 5, 6], [7, 8, 9]]
    ragged_rows = [[4], [1, 2, 3]]
    ragged_nested = [[[1], [2, 3]]]
    ragged_mixed_depth = [[4], [1, 2, 3], 5]

    evaled = hl.eval(hl.nd.array(data_list))
    np_equiv = np.array(data_list, dtype=np.int32)
    assert np.array_equal(evaled, np_equiv)
    assert evaled.strides == np_equiv.strides

    assert hl.eval(hl.nd.array([[], []])).strides == (8, 8)
    assert np.array_equal(hl.eval(hl.nd.array([])), np.array([]))

    zero_array = np.zeros((10, 10), dtype=np.int64)
    evaled_zero_array = hl.eval(hl.literal(zero_array))
    assert np.array_equal(evaled_zero_array, zero_array)
    assert zero_array.dtype == evaled_zero_array.dtype

    # Testing correct interpretation of numpy strides
    np_equiv_fortran_style = np.asfortranarray(np_equiv)
    np_equiv_extra_dimension = np_equiv.reshape((3, 1, 3))
    for strided in (np_equiv_fortran_style, np_equiv_extra_dimension):
        assert np.array_equal(hl.eval(hl.literal(strided)), strided)

    # Testing from hail arrays
    assert np.array_equal(hl.eval(hl.nd.array(hl.range(6))), np.arange(6))
    assert np.array_equal(hl.eval(hl.nd.array(hl.int64(4))), np.array(4))

    # Testing from nested hail arrays
    nested_hail = hl.array([hl.array(row) for row in data_list])
    assert np.array_equal(
        hl.eval(hl.nd.array(nested_hail)),
        np.arange(9).reshape((3, 3)) + 1)

    # Testing missing data
    assert hl.eval(hl.nd.array(hl.null(hl.tarray(hl.tint32)))) is None

    def assert_shape_mismatch(error_type, build):
        # `build` must raise `error_type` with the shape-mismatch message.
        with pytest.raises(error_type) as exc:
            build()
        assert "inner dimensions do not match" in str(exc.value)

    assert_shape_mismatch(
        ValueError, lambda: hl.nd.array(ragged_rows))
    assert_shape_mismatch(
        FatalError, lambda: hl.eval(hl.nd.array(hl.array(ragged_rows))))
    assert_shape_mismatch(
        FatalError, lambda: hl.eval(hl.nd.array(hl.array(ragged_nested))))
    assert_shape_mismatch(
        ValueError, lambda: hl.nd.array(ragged_mixed_depth))
def explode(self, f, array_agg_expr):
    """Build an aggregation over the elements of a collection expression.

    ``f`` is called once with a reference to a single element and must
    return an expression that contains an aggregation. A set-typed
    ``array_agg_expr`` is converted to an array first.

    Raises ExpressionException if ``array_agg_expr`` already contains an
    aggregation, or if the result of ``f`` contains none.
    """

    def aggregation_nodes(expr):
        return expr._ir.search(lambda n: isinstance(n, BaseApplyAggOp))

    if len(aggregation_nodes(array_agg_expr)) != 0:
        raise ExpressionException(
            "'{}.explode' does not support an already-aggregated expression as the argument to 'collection'".format(self.correct_prefix()))
    _check_agg_bindings(array_agg_expr, self._agg_bindings)

    if isinstance(array_agg_expr.dtype, tset):
        array_agg_expr = hl.array(array_agg_expr)

    # Bind a fresh variable for the element while `f` builds its expression.
    uid = Env.get_uid()
    element_ref = construct_expr(
        Ref(uid), array_agg_expr.dtype.element_type, array_agg_expr._indices)
    self._agg_bindings.add(uid)
    aggregated = f(element_ref)
    _check_agg_bindings(aggregated, self._agg_bindings)
    self._agg_bindings.remove(uid)

    if len(aggregation_nodes(aggregated)) == 0:
        raise ExpressionException(
            "'{}.explode' must take mapping that contains aggregation expression.".format(self.correct_prefix()))

    # The unified result is not used; the call is kept from the original.
    unify_all(array_agg_expr, aggregated)

    aggregations = hl.utils.LinkedList(Aggregation)
    if not self._as_scan:
        aggregations = aggregations.push(Aggregation(array_agg_expr, aggregated))

    explode_ir = AggExplode(array_agg_expr._ir, uid, aggregated._ir)
    return construct_expr(
        explode_ir, aggregated.dtype, aggregated._indices, aggregations)
def main(args):
    """Export one VCF per genotyping chip, restricted to that chip's sites.

    Reads the matrix table at ``args.allreads_prefix + '.mt'`` and the chip
    list at ``args.chip_loci``. Each non-blank line of the list is split on
    whitespace into a chip name and the path of a position table. The chip
    positions are lifted over to GRCh38, and the rows of the full dataset
    found at those loci are exported to a per-chip ``.vcf.bgz`` file.
    """
    full_vcf = hl.read_matrix_table(args.allreads_prefix + '.mt')

    # liftover chains
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)

    # Contigs kept from the chip tables; identical for every chip.
    kept_contigs = [str(contig) for contig in range(1, 23)] + ['X', 'Y']

    # The context manager closes the handle even if a chip fails.
    with hl.hadoop_open(args.chip_loci) as chips:
        for line in chips:
            chip = line.strip().split()
            if not chip:
                # A blank (e.g. trailing) line carries no chip definition.
                continue

            # Raw string: the backslashes are regex escapes, not Python ones.
            chip_pos = hl.import_table(chip[1], filter=r'\[Controls\]',
                                       skip_blank_lines=True)
            chip_pos = chip_pos.filter(
                hl.array(kept_contigs).contains(chip_pos.chr))
            # NOTE(review): the locus uses the session's default reference,
            # presumably GRCh37 given the liftover below -- confirm.
            chip_pos = chip_pos.key_by(
                locus=hl.locus(chip_pos.chr, hl.int(chip_pos.pos)))

            # liftover chip position info
            chip_pos = chip_pos.annotate(
                new_locus=hl.liftover(chip_pos.locus, 'GRCh38'))
            chip_pos = chip_pos.filter(hl.is_defined(chip_pos.new_locus))
            chip_pos = chip_pos.key_by(locus=chip_pos.new_locus)

            # filter full vcf to sites in genotype data
            geno_vcf = full_vcf.filter_rows(
                hl.is_defined(chip_pos[full_vcf.locus]))
            hl.export_vcf(
                geno_vcf,
                'gs://neurogap/high_coverage/NeuroGap_30x_' + chip[0] + '.vcf.bgz')
def load_cmg(cmg_csv: str) -> hl.Table:
    """Load a CSV of variant pairs and key it by the lifted-over variants.

    Each row describes two GRCh38 variants (``chrom_N``, ``pos_N``,
    ``ref_N``, ``alt_N`` for N in 1, 2). Both are lifted over with
    ``liftover_expr``, sorted by locus, and used as the table key. A
    ``bad_liftover`` flag is set when a lifted locus is missing or when the
    sequence context at a lifted locus differs from the first base of its
    reference allele.

    :param cmg_csv: path of the comma-delimited input file
    :return: table keyed by ``locus1``, ``alleles1``, ``locus2``, ``alleles2``
    """
    cmg_ht = hl.import_table(cmg_csv, impute=True, delimiter=",", quote='"')

    # Build locusN_b38 / allelesN_b38 for both variants of the pair.
    b38_fields = {}
    for n in ("1", "2"):
        b38_fields[f"locus{n}_b38"] = hl.locus(
            "chr" + hl.str(cmg_ht[f"chrom_{n}"]),
            cmg_ht[f"pos_{n}"],
            reference_genome='GRCh38')
        b38_fields[f"alleles{n}_b38"] = [cmg_ht[f"ref_{n}"], cmg_ht[f"alt_{n}"]]
    cmg_ht = cmg_ht.transmute(**b38_fields)

    liftover_references = get_liftover_genome(
        cmg_ht.rename({'locus1_b38': 'locus'}))
    target = liftover_references[1]

    lifted_pair = hl.array([
        liftover_expr(cmg_ht.locus1_b38, cmg_ht.alleles1_b38, target),
        liftover_expr(cmg_ht.locus2_b38, cmg_ht.alleles2_b38, target),
    ])
    lifted_over_variants = hl.sorted(lifted_pair, lambda x: x.locus)
    first = lifted_over_variants[0]
    second = lifted_over_variants[1]

    cmg_ht = cmg_ht.key_by(
        locus1=first.locus,
        alleles1=first.alleles,
        locus2=second.locus,
        alleles2=second.alleles)

    locus_missing = hl.is_missing(cmg_ht.locus1) | hl.is_missing(cmg_ht.locus2)
    ref_mismatch_1 = cmg_ht.locus1.sequence_context() != cmg_ht.alleles1[0][0]
    ref_mismatch_2 = cmg_ht.locus2.sequence_context() != cmg_ht.alleles2[0][0]
    return cmg_ht.annotate(
        bad_liftover=(locus_missing | ref_mismatch_1 | ref_mismatch_2))
def transform_one(mt: MatrixTable) -> MatrixTable:
    """transforms a gvcf into a form suitable for combining

    Per-row info fields are copied onto the entries, the row info is reduced
    to the fields that get summed (plus an aggregated ``SB_TABLE``), and the
    genotype fields are renamed to their local-allele names.
    """
    copied_info_fields = (
        'BaseQRankSum',
        'ClippingRankSum',
        'MQ',
        'MQRankSum',
        'ReadPosRankSum',
    )
    mt = mt.annotate_entries(
        # local (alt) allele index into global (alt) alleles
        LA=hl.range(0, hl.len(mt.alleles)),
        END=mt.info.END,
        **{name: mt.info[name] for name in copied_info_fields})

    # Sum each of the four SB components over the entries of the row.
    sb_table = hl.array([hl.agg.sum(mt.entry.SB[idx]) for idx in range(4)])
    row_info = mt.info.annotate(SB_TABLE=sb_table).select(
        "MQ_DP",
        "QUALapprox",
        "RAW_MQ",
        "VarDP",
        "SB_TABLE",
    )
    mt = mt.annotate_rows(info=row_info)

    mt = mt.transmute_entries(
        LGT=mt.GT,
        LAD=mt.AD[0:],  # requiredness issues :'(
        LPL=mt.PL[0:],
        LPGT=mt.PGT)
    return mt.drop('SB', 'qual', 'filters')
def annotate_with_genotype_num_alt(mt: hl.MatrixTable) -> hl.MatrixTable:
    """Collect per-sample genotype summaries into a ``genotypes`` row field.

    Each collected struct holds ``num_alt``, ``ab`` (allele balance),
    ``gq``, ``dp`` and ``sample_id``. For rows whose first alternate allele
    is ``<CNV>``, ``num_alt`` and ``ab`` are set to 0.

    Two entry schemas are recognised:

    - ``AD`` present: allele balance is ``AD[1] / sum(AD)``.
    - ``AO`` present: allele balance is ``AO[0] / DP``, or 0.0 when ``DP``
      is 0.

    :param mt: MatrixTable with either an ``AD`` or an ``AO`` entry field
    :return: MatrixTable with the ``genotypes`` row annotation added
    :raises ValueError: if neither ``AD`` nor ``AO`` is an entry field
    """
    entry_fields = set(mt.entry)
    is_cnv = mt.alleles[1] == '<CNV>'
    num_alt = hl.cond(is_cnv, 0, mt.GT.n_alt_alleles())

    if 'AD' in entry_fields:
        # GATK-consistent VCF
        total_depth = hl.float(hl.fold(lambda i, j: i + j, 0, mt.AD))
        allele_balance = hl.cond(
            is_cnv,
            0.0,
            hl.float(hl.array(mt.AD)[1]) / total_depth)
        mt = mt.annotate_rows(genotypes=hl.agg.collect(
            hl.struct(num_alt=num_alt,
                      ab=allele_balance,
                      gq=mt.GQ,
                      sample_id=mt.s,
                      dp=mt.DP)))
    elif 'AO' in entry_fields:
        # Hail expressions must be combined with `|`; Python's `or` tries to
        # take the truth value of an expression, which Hail rejects.
        allele_balance = hl.cond(
            is_cnv | (mt.DP == 0),
            0.0,
            hl.float(mt.AO[0]) / hl.float(mt.DP))
        mt = mt.annotate_rows(genotypes=hl.agg.collect(
            hl.struct(num_alt=num_alt,
                      ab=allele_balance,
                      dp=mt.DP,
                      gq=mt.GQ,
                      sample_id=mt.s)))
    else:
        raise ValueError("unrecognized vcf")
    return mt
def remove_FT_values(
        mt: hl.MatrixTable,
        filters_to_remove: list = [
            'possible_numt', 'mt_many_low_hets', 'FAIL', 'blacklisted_site'
        ]
) -> hl.MatrixTable:
    """Removes the FT filters specified in filters_to_remove

    The defaults are 'possible_numt', 'mt_many_low_hets' and 'FAIL' (filters
    found to have low performance) plus 'blacklisted_site' (which did not
    always behave as expected in early GATK versions and can be replaced
    with the apply_mito_artifact_filter function).

    Entries left without any filter get ``["PASS"]`` as their FT value.

    :param hl.MatrixTable mt: MatrixTable
    :param list filters_to_remove: list of FT filters that should be removed from the entries
    :return: MatrixTable with certain FT filters removed
    :rtype: MatrixTable
    """
    # The default list is only read, never mutated, by this function.
    excluded = hl.set(filters_to_remove)
    remaining = hl.array(mt.FT.difference(excluded))
    mt = mt.annotate_entries(FT=remaining)

    # if no filters exists after removing those specified above, set the FT field to PASS
    nothing_left = hl.len(mt.FT) == 0
    mt = mt.annotate_entries(FT=hl.if_else(nothing_left, ["PASS"], mt.FT))
    return mt
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple
        fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input
        results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element
        type of the array is :py:data:`.ttuple` of :py:data:`.tfloat64`,
        :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstring`
    """
    # Normalize the label to an array of strings (missing when not given).
    if label is None:
        labels = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        labels = hl.array([label])
    else:
        labels = label

    point_type = ttuple(tfloat64, tfloat64, tarray(tstr))
    return _agg_func('downsample',
                     [x, y, labels],
                     tarray(point_type),
                     constructor_args=[n_divisions])
def explode(self, f, array_agg_expr):
    """Aggregate over the individual elements of a collection expression.

    ``f`` receives a reference to one element and must return an expression
    containing an aggregation. Set-typed input is converted to an array.

    Raises ExpressionException when ``array_agg_expr`` is already
    aggregated, or when the expression returned by ``f`` has no aggregation.
    """
    def is_agg_node(n):
        return isinstance(n, BaseApplyAggOp)

    already_aggregated = len(array_agg_expr._ir.search(is_agg_node)) != 0
    if already_aggregated:
        raise ExpressionException(
            "'{}.explode' does not support an already-aggregated expression as the argument to 'collection'"
            .format(self.correct_prefix()))

    if isinstance(array_agg_expr.dtype, tset):
        array_agg_expr = hl.array(array_agg_expr)

    # The element variable is registered only while `f` is being applied.
    binding = Env.get_uid()
    element = construct_expr(Ref(binding),
                             array_agg_expr.dtype.element_type,
                             array_agg_expr._indices)
    self._agg_bindings.add(binding)
    aggregated = f(element)
    self._agg_bindings.remove(binding)

    lacks_aggregation = len(aggregated._ir.search(is_agg_node)) == 0
    if lacks_aggregation:
        raise ExpressionException(
            "'{}.explode' must take mapping that contains aggregation expression."
            .format(self.correct_prefix()))

    # The unified result is not used; the call is kept from the original.
    unify_all(array_agg_expr, aggregated)

    aggregations = hl.utils.LinkedList(Aggregation)
    if not self._as_scan:
        aggregations = aggregations.push(
            Aggregation(array_agg_expr, aggregated))

    return construct_expr(
        AggExplode(array_agg_expr._ir, binding, aggregated._ir),
        aggregated.dtype,
        aggregated._indices,
        aggregations)
def concordance_frequency(full_vcf, concordance_table, output):
    """Export concordance statistics grouped by frequency bin and SNP status.

    Rows of ``full_vcf`` are joined with ``concordance_table`` by key and
    grouped by (a) the first threshold in a descending 0.5..0 grid that
    ``variant_qc.AF[1]`` reaches and (b) whether the variant is a SNP. The
    concordance matrix is aggregated cell by cell and a non-reference
    concordance rate is derived from cells [2..4][2..4]. The result is
    exported to ``output + 'variants.tsv'``.
    """
    variant_rows = full_vcf.rows()
    concordance_qc = variant_rows.annotate(
        concordance=concordance_table[variant_rows.key])

    ## note, this will need to be updated
    freqs = list(np.linspace(0.5, 0, num=91))

    def per_cell(cell_aggregator):
        # Apply an aggregator to every cell of the 2-D concordance matrix.
        return hl.agg.array_agg(
            lambda row: hl.agg.array_agg(cell_aggregator, row),
            concordance_qc.concordance.concordance)

    freq_bin = hl.array(freqs).find(
        lambda x: concordance_qc.variant_qc.AF[1] >= x)
    snp_flag = hl.is_snp(concordance_qc.alleles[0], concordance_qc.alleles[1])

    grouped = concordance_qc.group_by(freq=freq_bin, snp=snp_flag)
    concordance_stats = grouped.aggregate(
        n_variants=hl.agg.count(),
        unique_variants=per_cell(
            lambda element: hl.agg.count_where(element > 0)),
        geno_concordance=per_cell(lambda element: hl.agg.sum(element)))

    counts = concordance_stats.geno_concordance
    concordance_stats = concordance_stats.annotate(
        total_concordant=counts[3][3] + counts[4][4],
        total_discordant=(counts[2][3] + counts[2][4]
                          + counts[3][2] + counts[3][4]
                          + counts[4][2] + counts[4][3]))

    agree = concordance_stats.total_concordant
    disagree = concordance_stats.total_discordant
    concordance_stats = concordance_stats.annotate(
        non_ref_concordance=agree / (agree + disagree))

    concordance_stats.export(output + 'variants.tsv')
def test_import_keyby_count_ldsc_lowered_shuffle(self):
    """Import two tables, join them, re-key by locus/alleles and count rows."""
    # integration test pulled out of test_ld_score_regression to isolate issues with lowered shuffles
    # and RDD serialization, 2021-07-06
    # if this comment no longer reflects the backend system, that's a really good thing
    scores_path = doctest_resource(
        'ld_score_regression.univariate_ld_scores.tsv')
    ht_scores = hl.import_table(
        scores_path, key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    sumstats_path = doctest_resource(
        'ld_score_regression.20160.sumstats.tsv')
    ht_20160 = hl.import_table(
        sumstats_path, key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    # Look up the LD-score row for each summary-statistics SNP.
    joined = ht_scores[ht_20160['SNP']]
    ht_20160 = ht_20160.annotate(
        ld_score=joined['L2'],
        locus=hl.locus(joined['CHR'], joined['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]))
    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])

    assert ht_20160._force_count() == 151
def downsample(x, y, label=None, n_divisions=500) -> ArrayExpression:
    """Downsample (x, y) coordinate datapoints.

    Parameters
    ----------
    x : :class:`.NumericExpression`
        X-values to be downsampled.
    y : :class:`.NumericExpression`
        Y-values to be downsampled.
    label : :class:`.StringExpression` or :class:`.ArrayExpression`
        Additional data for each (x, y) coordinate. Can pass in multiple
        fields in an :class:`.ArrayExpression`.
    n_divisions : :obj:`int`
        Factor by which to downsample (default value = 500). A lower input
        results in fewer output datapoints.

    Returns
    -------
    :class:`.ArrayExpression`
        Expression for downsampled coordinate points (x, y). The element
        type of the array is :py:data:`.ttuple` of :py:data:`.tfloat64`,
        :py:data:`.tfloat64`, and :py:data:`.tarray` of :py:data:`.tstring`
    """
    # A missing label becomes a missing string array; a single string
    # label is wrapped so the aggregator always receives an array.
    if label is None:
        label = hl.null(hl.tarray(hl.tstr))
    elif isinstance(label, StringExpression):
        label = hl.array([label])

    result_type = tarray(ttuple(tfloat64, tfloat64, tarray(tstr)))
    sequence_args = [lambda x: x, y, label]
    return _agg_func('downsample',
                     _to_agg(x),
                     result_type,
                     constructor_args=[n_divisions],
                     seq_op_args=sequence_args)
def explode_duplicate_samples_ht(dups_ht: hl.Table) -> hl.Table:
    """
    Explodes the result of `get_duplicated_samples_ht`, so that each line contains a single sample.

    An additional annotation is added: `dup_filtered`. It is `False` for the
    sample taken from the input key (the one that was kept) and `True` for
    every sample listed in `filtered`.

    Requires a field `filtered` whose element type is the same as the input
    duplicated samples Table key, or as its single key field.

    :param dups_ht: Input HT
    :return: Flattened HT keyed by `s`
    :raises TypeError: if the `filtered` element type matches neither the key nor its single field
    """

    def get_dups_to_keep_expr():
        filtered_type = dups_ht.filtered.dtype.element_type
        if filtered_type == dups_ht.key.dtype:
            return (dups_ht.key, False)
        # `and` short-circuits, so key[0] is only read when the key has
        # exactly one field; other key shapes reach the TypeError below.
        if len(dups_ht.key) == 1 and filtered_type == dups_ht.key[0].dtype:
            return (dups_ht.key[0], False)
        raise TypeError(
            f"Cannot explode table as types of the filtered field ({dups_ht.filtered.dtype}) and the key ({dups_ht.key.dtype}) are incompatible."
        )

    # Kept sample first, followed by every filtered duplicate.
    dups_ht = dups_ht.annotate(
        dups=hl.array([get_dups_to_keep_expr()]).extend(
            dups_ht.filtered.map(lambda x: (x, True))))
    dups_ht = dups_ht.explode("dups")
    dups_ht = dups_ht.key_by()
    return dups_ht.select(
        s=dups_ht.dups[0],
        dup_filtered=dups_ht.dups[1]).key_by("s")
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    """Compute a per-sample score as the sum of beta * dosage and write it out.

    Variants are restricted to those present in the meta-analysis results;
    beta is ``beta_meta`` from the summary statistics for clumped variants
    and 0 otherwise. The column table is written under a scratch directory.

    NOTE(review): neither ``genotype_mt_path`` nor ``prs_mt_path`` is read
    by this function; all input and output locations are hard-coded.
    """
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table(
        'gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/'
    )

    sumstats = hl.import_table(
        'gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
        impute=True)
    sumstats = sumstats.annotate(
        locus=hl.locus(sumstats.chr, sumstats.pos),
        alleles=hl.array([sumstats.ref, sumstats.alt]))
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats.describe()

    # The full genotype matrix table is used here rather than the subset at
    # genotype_mt_path.
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    in_meta_analysis = hl.is_defined(meta_mt.rows()[mt.row_key])
    mt = mt.filter_rows(in_meta_analysis)

    # Keep only the dosage entry field and the row/column keys.
    mt = mt.select_entries('dosage').select_rows().select_cols()

    is_clumped = hl.is_defined(clumped[mt.row_key])
    mt = mt.annotate_rows(
        beta=hl.if_else(is_clumped, sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))

    scores_ht = mt.cols().repartition(1000)
    scores_ht.write(f'{scratch_dir}/prs_all_samples.ht')
def _linreg(y, x, nested_dim): k = len(x) k0 = nested_dim if k0 < 0 or k0 > k: raise ValueError( "linreg: `nested_dim` must be between 0 and the number " f"of covariates ({k}), inclusive") t = hl.tstruct(beta=hl.tarray(hl.tfloat64), standard_error=hl.tarray(hl.tfloat64), t_stat=hl.tarray(hl.tfloat64), p_value=hl.tarray(hl.tfloat64), multiple_standard_error=hl.tfloat64, multiple_r_squared=hl.tfloat64, adjusted_r_squared=hl.tfloat64, f_stat=hl.tfloat64, multiple_p_value=hl.tfloat64, n=hl.tint64) x = hl.array(x) k = hl.int32(k) k0 = hl.int32(k0) return _agg_func('LinearRegression', _to_agg(y), t, [k, k0], seq_op_args=[lambda y: y, x])
def _coerce(self, x: Expression):
    """Coerce the keys and values of a dict expression.

    When the key type needs no conversion only the values are mapped;
    otherwise the dict is rebuilt from coerced (key, value) pairs.
    """
    assert isinstance(x, hl.expr.DictExpression)

    if self.kc._requires_conversion(x.dtype.key_type):
        def coerce_pair(e):
            return (self.kc.coerce(e[0]), self.vc.coerce(e[1]))

        return hl.dict(hl.map(coerce_pair, hl.array(x)))

    # fast path
    return x.map_values(self.vc.coerce)
def phase_diploid_proband(
        locus: hl.expr.LocusExpression,
        alleles: hl.expr.ArrayExpression,
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a diploid proband
    (autosomes, PAR regions of sex chromosomes or non-PAR regions of a female proband)

    The result is missing unless exactly one (father allele, mother allele)
    combination adds up to the proband's one-hot allele vector.

    :param LocusExpression locus: Locus in the trio MatrixTable
    :param ArrayExpression alleles: Alleles in the trio MatrixTable
    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    proband_v = proband_call.one_hot_alleles(alleles)

    # In non-PAR regions of X/Y the father is only usable when haploid.
    in_nonpar = locus.in_x_nonpar() | locus.in_y_nonpar()
    haploid_father_v = hl.or_missing(
        father_call.is_haploid(),
        hl.array([father_call.one_hot_alleles(alleles)]))
    father_v = hl.cond(
        in_nonpar,
        haploid_father_v,
        call_to_one_hot_alleles_array(father_call, alleles))
    mother_v = call_to_one_hot_alleles_array(mother_call, alleles)

    def matching_mother_alleles(f):
        # Mother alleles which, added to father allele `f`, give the proband.
        return (hl.zip_with_index(mother_v)
                .filter(lambda m: m[1] + f[1] == proband_v)
                .map(lambda m: hl.struct(m=m[0], f=f[0])))

    combinations = hl.flatmap(matching_mother_alleles,
                              hl.zip_with_index(father_v))
    is_unambiguous = hl.is_defined(combinations) & (hl.len(combinations) == 1)

    phased_proband = hl.call(
        father_call[combinations[0].f],
        mother_call[combinations[0].m],
        phased=True)
    phased_father = hl.cond(
        father_call.is_haploid(),
        hl.call(father_call[0], phased=True),
        phase_parent_call(father_call, combinations[0].f))
    phased_mother = phase_parent_call(mother_call, combinations[0].m)

    return hl.or_missing(
        is_unambiguous,
        hl.array([phased_proband, phased_father, phased_mother]))
def fix_alleles(alleles):
    """Re-express (ref, alt) pairs against their longest reference allele.

    Returns an array whose first element is the longest ``ref`` among the
    inputs, followed by one rewritten alt per input. SNP, insertion and
    deletion alts get the tail of the longest reference appended; alts of
    any other allele type are kept unchanged.
    """
    def longer_of(s, t):
        return hl.cond(hl.len(s) > hl.len(t), s, t)

    ref = alleles.map(lambda d: d.ref).fold(longer_of, '')

    def rewrite_alt(a):
        ref_tail = ref[hl.len(a.ref):]
        return (hl.switch(hl.allele_type(a.ref, a.alt))
                .when('SNP', a.alt + ref[hl.len(a.alt):])
                .when('Insertion', a.alt + ref_tail)
                .when('Deletion', a.alt + ref_tail)
                .default(a.alt))

    alts = alleles.map(rewrite_alt)
    return hl.array([ref]).extend(alts)
def test_complex_round_trips():
    """Round-trip empty and populated container, struct and locus values."""
    empty_values = (
        hl.struct(),
        hl.empty_array(hl.tint32),
        hl.empty_set(hl.tint32),
        hl.empty_dict(hl.tint32, hl.tint32),
    )
    for value in empty_values:
        assert_round_trip(value)

    populated_values = (
        hl.locus('1', 100),
        hl.struct(x=3),
        hl.set([3, 4, 5, 3]),
        hl.array([3, 4, 5]),
        hl.dict({3: 'a', 4: 'b', 5: 'c'}),
    )
    for value in populated_values:
        assert_round_trip(value)

    # A struct nesting one of each container type.
    nested = hl.struct(
        x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
        y=hl.array([3, 4, 5]),
        z=hl.set([3, 4, 5, 3]))
    assert_round_trip(nested)
def explode_phase_info(ht: hl.Table, remove_all_ref: bool = True) -> hl.Table:
    """Explode ``phase_info`` into one row per (pop, phase_info) pair.

    ``phase_info`` is converted to an array and exploded; element 0 of each
    exploded item becomes ``pop`` and element 1 becomes ``phase_info``.

    :param ht: table with a ``phase_info`` field
    :param remove_all_ref: if True, drop rows where the sum of
        ``phase_info.gt_counts.raw[1:]`` is not greater than 0
    :return: exploded table
    """
    exploded = ht.transmute(phase_info=hl.array(ht.phase_info))
    exploded = exploded.explode('phase_info')
    exploded = exploded.transmute(
        pop=exploded.phase_info[0],
        phase_info=exploded.phase_info[1])

    if not remove_all_ref:
        return exploded

    non_ref_total = hl.sum(exploded.phase_info.gt_counts.raw[1:])
    return exploded.filter(non_ref_total > 0)
def test_agg_cols_group_by(self):
    """Check agg.group_by results over columns for three aggregations."""
    t = hl.utils.range_matrix_table(1, 10)
    above_seven = t.col_idx > 7

    # Plain group_by on parity.
    parity_groups = agg.group_by(
        t.col_idx % 2,
        hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))

    # group_by wrapping a filtered aggregation.
    filtered_groups = agg.group_by(
        t.col_idx % 3,
        agg.filter(
            above_seven,
            hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)))

    # group_by wrapping an exploded aggregation.
    pairs_or_empty = hl.cond(
        above_seven,
        [t.col_idx, t.col_idx + 1],
        hl.empty_array(hl.tint32))
    exploded_groups = agg.group_by(
        t.col_idx % 3,
        agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                    pairs_or_empty))

    aggregations = [parity_groups, filtered_groups, exploded_groups]
    expectations = [
        {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]},
        {0: [10, 0], 1: [0], 2: [9, 0]},
        {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]},
    ]
    for aggregation, expected in zip(aggregations, expectations):
        collected = t.select_rows(result=aggregation).result.collect()
        self.assertEqual(collected[0], expected)
def main(args):
    """Infer sample platforms by clustering callrate principal components.

    Steps: read the input matrix table and the interval table, compute an
    interval-by-sample callrate matrix table, run the platform PCA, keep the
    leading PCs selected by the 99% cumulative-variance rule, cluster the
    samples on those PCs, then write (and optionally export) the result.

    Hail is stopped even when one of the steps raises.
    """
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    try:
        # input MT
        mt = hl.read_matrix_table(args.mt_input_path)

        # filter high-quality genotype
        # mt = filter_genotypes_ab(mt)

        # import capture interval table (intersect)
        intervals = hl.read_table(args.ht_intervals)

        # generate an interval x sample MT by computing per intervals callrate
        mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

        # run pca
        eigenvalues, ht_pca, _ = run_platform_pca(
            callrate_mt=mt_callrate,
            binarization_threshold=args.binarization_threshold)

        # normalize eigenvalues (0-100); the total is computed once rather
        # than once per eigenvalue
        eigenvalue_total = sum(eigenvalues)
        eigenvalues_norm = [x / eigenvalue_total * 100 for x in eigenvalues]

        # compute eigenvalues cumulative sum
        ev_cumsum = hl.array_scan(lambda i, j: i + j, 0,
                                  hl.array(eigenvalues_norm))

        # getting optimal number of PCs (those which explain 99% of the variance)
        # NOTE(review): array_scan also emits the initial 0, so this count
        # includes that leading element -- confirm this is intended.
        n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))
        logger.info(
            f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
        )

        # filter out uninformative PCs
        ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

        # apply unsupervised clustering on PCs to infer samples platform
        # NOTE(review): hdbscan_min_samples is fed from
        # args.hdbscan_min_cluster_size -- verify this is not a typo.
        ht_platform = assign_platform_from_pcs(
            platform_pca_scores_ht=ht_pca,
            pc_scores_ann='scores',
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_cluster_size)

        ht_platform.show()

        # write HT
        ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite)

        # export to file if true
        if args.write_to_file:
            ht_platform.export(f'{args.ht_output_path}.tsv.bgz')
    finally:
        hl.stop()
def project_max_expr(
        project_expr: hl.expr.StringExpression,
        gt_expr: hl.expr.CallExpression,
        alleles_expr: hl.expr.ArrayExpression,
        n_projects: int = 5,
) -> hl.expr.ArrayExpression:
    """
    Build an expression with per-project allele frequency information for the
    `n_projects` projects that have the largest AF at this row.

    The result is an array with one element per non-reference allele. Each
    element is an array of structs with the following fields:

    - AC: int32
    - AF: float64
    - AN: int32
    - homozygote_count: int32
    - project: str

    .. note::

        Only projects with AF > 0 are returned.
        In case of ties, the project ordering is not guaranteed, and at most
        `n_projects` are returned.

    :param project_expr: column expression containing the project
    :param gt_expr: entry expression containing the genotype
    :param alleles_expr: row expression containing the alleles
    :param n_projects: Maximum number of projects to return for each row
    :return: projectmax expression
    """
    n_alleles = hl.len(alleles_expr)

    # Call stats aggregated per project, as an array of (project, call_stats).
    project_cs = hl.array(
        hl.agg.group_by(project_expr, hl.agg.call_stats(gt_expr, alleles_expr)))

    def _top_projects(ai):
        # Projects carrying allele `ai` at all (AF > 0) ...
        carriers = project_cs.filter(lambda x: x[1].AF[ai] > 0)
        # ... sorted by decreasing AF for that allele ...
        ranked = hl.sorted(carriers, lambda x: -x[1].AF[ai])
        # ... truncated to n_projects, with the stats narrowed to allele `ai`
        # and the project name attached.
        return ranked[:n_projects].map(
            lambda x: x[1].annotate(
                AC=x[1].AC[ai],
                AF=x[1].AF[ai],
                AN=x[1].AN,
                homozygote_count=x[1].homozygote_count[ai],
                project=x[0],
            ))

    per_allele = hl.range(1, n_alleles).map(_top_projects)

    # Monomorphic sites (no non-reference allele) yield a missing value.
    return hl.or_missing(n_alleles > 1, per_allele)
def call_to_one_hot_alleles_array(call: hl.expr.CallExpression, alleles: hl.expr.ArrayExpression) -> hl.expr.ArrayExpression:
    """
    Get the one-hot-encoded allele vectors of a genotype call.

    The result is an ordered array: the first vector corresponds to the first
    allele of the call and, for heterozygous calls only, a second vector
    corresponds to the second allele.

    :param CallExpression call: genotype
    :param ArrayExpression alleles: Alleles at the site
    :return: Array of one-hot-encoded alleles
    :rtype: ArrayExpression
    """
    def _one_hot(allele_position):
        # One-hot vector of the single allele at the given position of `call`.
        return hl.call(call[allele_position]).one_hot_alleles(alleles)

    both_alleles = hl.array([
        _one_hot(0),
        _one_hot(1),
    ])
    first_allele_only = hl.array([_one_hot(0)])

    return hl.cond(call.is_het(), both_alleles, first_allele_only)
def test_multi_way_zip_join_globals(self):
    """Globals of the joined tables are gathered, in order, into `__globals`."""
    t1 = hl.utils.range_table(1).annotate_globals(x=hl.null(hl.tint32))
    t2 = hl.utils.range_table(1).annotate_globals(x=5)
    t3 = hl.utils.range_table(1).annotate_globals(x=0)

    expected_globals = [
        hl.struct(x=hl.null(hl.tint32)),
        hl.struct(x=5),
        hl.struct(x=0),
    ]
    expected = hl.struct(__globals=hl.array(expected_globals))

    joined = hl.Table._multi_way_zip_join([t1, t2, t3], '__data', '__globals')

    self.assertEqual(hl.eval(joined.globals), hl.eval(expected))
def test_matrix_filter_intervals(self):
    """`filter_intervals` accepts Python lists, Hail arrays and evaluated intervals."""
    ds = hl.import_vcf(resource('sample.vcf'), min_partitions=20)
    parse = hl.parse_locus_interval

    cases = [
        # A single interval expression in a Python list.
        ([parse('20:10639222-10644705')], 3),
        # Two adjacent interval expressions in a Python list.
        ([parse('20:10639222-10644700'),
          parse('20:10644700-10644705')], 3),
        # The same two intervals as a Hail array expression.
        (hl.array([parse('20:10639222-10644700'),
                   parse('20:10644700-10644705')]), 3),
        # A Hail array mixing an evaluated interval and an expression.
        (hl.array([hl.eval(parse('20:10639222-10644700')),
                   parse('20:10644700-10644705')]), 3),
        # A Python list of evaluated intervals.
        ([hl.eval(parse('[20:10019093-10026348]')),
          hl.eval(parse('[20:17705793-17716416]'))], 4),
    ]

    for intervals, expected_rows in cases:
        self.assertEqual(hl.filter_intervals(ds, intervals).count_rows(), expected_rows)
def create_all_values():
    """Return a struct expression with one field per kind of Hail value.

    Each kind of value appears in a defined form and, for most kinds, also in
    a missing form (the fields whose names start with ``m``).
    """
    fields = {
        # numeric values
        'f32': hl.float32(3.14),
        'i64': hl.int64(-9),
        'm': hl.null(hl.tfloat64),
        # structs
        'astruct': hl.struct(a=hl.null(hl.tint32), b=5.5),
        'mstruct': hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        # sets
        'aset': hl.set(['foo', 'bar', 'baz']),
        'mset': hl.null(hl.tset(hl.tfloat64)),
        # dicts (keyed by arrays of strings, one containing a missing element)
        'd': hl.dict({hl.array(['a', 'b']): 0.5,
                      hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        'md': hl.null(hl.tdict(hl.tint32, hl.tstr)),
        # loci and intervals
        'h38': hl.locus('chr22', 33878978, 'GRCh38'),
        'ml': hl.null(hl.tlocus('GRCh37')),
        'i': hl.interval(hl.locus('1', 999), hl.locus('1', 1001)),
        # calls
        'c': hl.call(0, 1),
        'mc': hl.null(hl.tcall),
        # tuples
        't': hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        'mt': hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool)),
    }
    return hl.struct(**fields)
def create_all_values_datasets():
    """Build a Table and a MatrixTable annotated with the "all values" struct.

    The struct comes from :func:`create_all_values`, so the set of covered
    value kinds is defined in a single place.

    Returns
    -------
    (:class:`.Table`, :class:`.MatrixTable`)
        A 5-row table carrying the values as globals (prefixed ``global_``)
        and as row fields, and a 3 x 2 matrix table carrying them as globals,
        row fields, column fields and entry fields (prefixed ``global_``,
        ``row_``, ``col_`` and ``entry_`` respectively). Both are cached.
    """
    all_values = create_all_values()

    def prefix(s, p):
        # Copy of struct `s` with `p` prepended to every field name.
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table
def quick_summary(mt):
    """Compute aggregate INFO fields that do not require densify.

    Annotates each row with an `info` struct holding the per-row sums of the
    `MQ_DP`, `QUALapprox`, `RAW_MQ` and `VarDP` fields of `gvcf_info`, plus
    `SB_TABLE`, the element-wise sum of the first four `SB` entries.
    """
    summed_fields = ('MQ_DP', 'QUALapprox', 'RAW_MQ', 'VarDP')
    info_fields = {
        name: hl.agg.sum(mt.entry.gvcf_info[name])
        for name in summed_fields
    }
    # SB is summed position by position over its four components.
    info_fields['SB_TABLE'] = hl.array(
        [hl.agg.sum(mt.entry.SB[position]) for position in range(4)])

    return mt.annotate_rows(info=hl.struct(**info_fields))
def phase_haploid_proband_x_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
        mother_call: hl.expr.CallExpression
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the case of a haploid proband in the non-PAR region of X

    The result is missing when neither maternal allele matches the proband's
    allele. The father's phased call is missing unless the father is haploid.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :param CallExpression mother_call: Input mother genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    # (index, allele) pairs for the two maternal alleles; pick the first pair
    # whose allele equals the proband's single allele.
    indexed_mother_alleles = hl.zip_with_index(hl.array([mother_call[0], mother_call[1]]))
    transmitted_allele = indexed_mother_alleles.find(lambda m: m[1] == proband_call[0])

    phased_proband = hl.call(proband_call[0], phased=True)
    phased_father = hl.or_missing(father_call.is_haploid(), hl.call(father_call[0], phased=True))
    # The mother is phased using the index of the transmitted allele.
    phased_mother = phase_parent_call(mother_call, transmitted_allele[0])

    return hl.or_missing(
        hl.is_defined(transmitted_allele),
        hl.array([phased_proband, phased_father, phased_mother])
    )
def test_agg_cols_filter(self):
    """Check `agg.filter` over columns wrapping collect, explode and group_by."""
    t = hl.utils.range_matrix_table(1, 10)

    # Filter wrapped around a plain collect.
    filtered_collect = agg.filter(
        t.col_idx > 7,
        agg.collect(t.col_idx + 1).append(0))

    # Filter wrapped around an explode.
    filtered_explode = agg.filter(
        t.col_idx > 7,
        agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                    [t.col_idx, t.col_idx + 1]))

    # Filter wrapped around a group_by; groups without survivors are absent.
    filtered_group_by = agg.filter(
        t.col_idx > 7,
        agg.group_by(t.col_idx % 3,
                     hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)))

    cases = [
        (filtered_collect, [9, 10, 0]),
        (filtered_explode, [9, 10, 10, 11, 0]),
        (filtered_group_by, {0: [10, 0], 2: [9, 0]}),
    ]
    for aggregation, expected in cases:
        observed = t.select_rows(result=aggregation).result.collect()[0]
        self.assertEqual(observed, expected)
def generate_random_gen():
    """Export a random 30-variant x 10-sample dataset in GEN format as 'random'."""
    mt = hl.utils.range_matrix_table(30, 10)

    # Rows are keyed by a locus on contig 20 and a fixed A/G allele pair.
    mt = mt.annotate_rows(locus=hl.locus('20', mt.row_idx + 1),
                          alleles=['A', 'G'])
    mt = mt.key_rows_by('locus', 'alleles')

    # Columns are keyed by the stringified column index.
    mt = mt.annotate_cols(s=hl.str(mt.col_idx))
    mt = mt.key_cols_by('s')

    # using totally random values leads rounding differences where
    # identical GEN values get rounded differently, leading to
    # differences in the GT call between import_{gen, bgen}
    # Hence integer draws out of 255: `a` first, then `b` from what is left.
    mt = mt.annotate_entries(a=hl.int32(hl.rand_unif(0.0, 255.0)))
    mt = mt.annotate_entries(b=hl.int32(hl.rand_unif(0.0, 255.0 - mt.a)))
    genotype_weights = hl.array([mt.a, mt.b, 255.0 - mt.a - mt.b])
    mt = mt.transmute_entries(GP=genotype_weights / 255.0)

    # 20% missing
    mt = mt.filter_entries(hl.rand_bool(0.8))

    hl.export_gen(mt, 'random', precision=4)
def phase_y_nonpar(
        proband_call: hl.expr.CallExpression,
        father_call: hl.expr.CallExpression,
) -> hl.expr.ArrayExpression:
    """
    Returns phased genotype calls in the non-PAR region of Y (requires both father and proband to be haploid to return phase)

    The result is missing unless both calls are haploid and carry the same
    allele. The mother's slot in the returned array is always a missing call.

    :param CallExpression proband_call: Input proband genotype call
    :param CallExpression father_call: Input father genotype call
    :return: Array containing: phased proband call, phased father call, phased mother call
    :rtype: ArrayExpression
    """
    both_haploid = proband_call.is_haploid() & father_call.is_haploid()
    same_allele = father_call[0] == proband_call[0]

    phased_calls = hl.array([
        hl.call(proband_call[0], phased=True),
        hl.call(father_call[0], phased=True),
        hl.null(hl.tcall)
    ])

    return hl.or_missing(both_haploid & same_allele, phased_calls)
def _linreg(y, x, nested_dim):
    """Build the 'LinearRegression' aggregator call for response `y` on covariates `x`.

    `x` is a Python sequence of covariates; `nested_dim` must lie between 0
    and ``len(x)`` inclusive, otherwise a :class:`ValueError` is raised before
    any expression is built.
    """
    n_covariates = len(x)
    if nested_dim < 0 or nested_dim > n_covariates:
        raise ValueError("linreg: `nested_dim` must be between 0 and the number "
                         f"of covariates ({n_covariates}), inclusive")

    # Result schema: four per-covariate arrays, five scalar summary
    # statistics, and the observation count.
    result_fields = {
        name: hl.tarray(hl.tfloat64)
        for name in ('beta', 'standard_error', 't_stat', 'p_value')
    }
    for name in ('multiple_standard_error', 'multiple_r_squared',
                 'adjusted_r_squared', 'f_stat', 'multiple_p_value'):
        result_fields[name] = hl.tfloat64
    result_fields['n'] = hl.tint64
    result_type = hl.tstruct(**result_fields)

    covariates = hl.array(x)
    constructor_args = [hl.int32(n_covariates), hl.int32(nested_dim)]

    return _agg_func('LinearRegression', [y, covariates], result_type, constructor_args)
def full_outer_join_mt(left: hl.MatrixTable, right: hl.MatrixTable) -> hl.MatrixTable:
    """Performs a full outer join on `left` and `right`.

    Replaces row, column, and entry fields with the following:

     - `left_row` / `right_row`: structs of row fields from left and right.
     - `left_col` / `right_col`: structs of column fields from left and right.
     - `left_entry` / `right_entry`: structs of entry fields from left and right.

    The column key fields of the result are named after the column key of
    `left`.

    Parameters
    ----------
    left : :class:`.MatrixTable`
    right : :class:`.MatrixTable`

    Returns
    -------
    :class:`.MatrixTable`

    Raises
    ------
    ValueError
        If the row key types or the column key types of `left` and `right`
        differ.
    """
    # Keys are compared by type only (field names may differ).
    if [x.dtype for x in left.row_key.values()] != [x.dtype for x in right.row_key.values()]:
        raise ValueError(f"row key types do not match:\n"
                         f" left: {list(left.row_key.values())}\n"
                         f" right: {list(right.row_key.values())}")
    if [x.dtype for x in left.col_key.values()] != [x.dtype for x in right.col_key.values()]:
        raise ValueError(f"column key types do not match:\n"
                         f" left: {list(left.col_key.values())}\n"
                         f" right: {list(right.col_key.values())}")

    # Pack each side's row fields into a single struct, then turn each matrix
    # table into a table with an entries array field and a columns global.
    left = left.select_rows(left_row=left.row)
    left_t = left.localize_entries('left_entries', 'left_cols')
    right = right.select_rows(right_row=right.row)
    right_t = right.localize_entries('right_entries', 'right_cols')

    # Row-wise outer join of the two localized tables.
    ht = left_t.join(right_t, how='outer')

    # For each side: dictionary from column-key tuple to the list of column
    # indices that carry that key (grouped on element 0 of each zipped pair,
    # keeping element 1).
    ht = ht.annotate_globals(
        left_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.left_cols.map(lambda x: hl.tuple([x[f] for f in left.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])),
        right_keys=hl.group_by(
            lambda t: t[0],
            hl.zip_with_index(
                ht.right_cols.map(lambda x: hl.tuple([x[f] for f in right.col_key])), index_first=False)).map_values(
            lambda elts: elts.map(lambda t: t[1])))

    # One struct per output column: the key plus the left/right source column
    # index. A key present on both sides yields every (left, right) index
    # pair; a key present on one side only gets a missing index for the other.
    ht = ht.annotate_globals(
        key_indices=hl.array(ht.left_keys.key_set().union(ht.right_keys.key_set()))
            .map(lambda k: hl.struct(k=k, left_indices=ht.left_keys.get(k), right_indices=ht.right_keys.get(k)))
            .flatmap(lambda s: hl.case()
                     .when(hl.is_defined(s.left_indices) & hl.is_defined(s.right_indices),
                           hl.range(0, s.left_indices.length()).flatmap(
                               lambda i: hl.range(0, s.right_indices.length()).map(
                                   lambda j: hl.struct(k=s.k, left_index=s.left_indices[i],
                                                       right_index=s.right_indices[j]))))
                     .when(hl.is_defined(s.left_indices),
                           s.left_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=elt, right_index=hl.null('int32'))))
                     .when(hl.is_defined(s.right_indices),
                           s.right_indices.map(
                               lambda elt: hl.struct(k=s.k, left_index=hl.null('int32'), right_index=elt)))
                     # A key comes from the union of both key sets, so it is
                     # expected to be defined on at least one side.
                     .or_error('assertion error')))

    # Per-row entries: pick the left/right entry at the recorded indices.
    ht = ht.annotate(__entries=ht.key_indices.map(lambda s: hl.struct(left_entry=ht.left_entries[s.left_index],
                                                                      right_entry=ht.right_entries[s.right_index])))

    # Output columns: key fields (named as in `left`) plus both source columns.
    ht = ht.annotate_globals(__cols=ht.key_indices.map(
        lambda s: hl.struct(**{f: s.k[i] for i, f in enumerate(left.col_key)},
                            left_col=ht.left_cols[s.left_index], right_col=ht.right_cols[s.right_index])))

    # Remove the intermediate fields before converting back to a matrix table.
    ht = ht.drop('left_entries', 'left_cols', 'left_keys', 'right_entries', 'right_cols', 'right_keys', 'key_indices')
    return ht._unlocalize_entries('__entries', '__cols', list(left.col_key))
def manhattan(pvals, locus=None, title=None, size=4, hover_fields=None, collect_all=False, n_divisions=500):
    """Create a Manhattan plot. (https://en.wikipedia.org/wiki/Manhattan_plot)

    Parameters
    ----------
    pvals : :class:`.Float64Expression`
        P-values to be plotted.
    locus : :class:`.LocusExpression`
        Locus values to be plotted. Defaults to the `locus` field of the
        source of `pvals`.
    title : str
        Title of the plot.
    size : int
        Size of markers in screen space units.
    hover_fields : Dict[str, :class:`.Expression`]
        Dictionary of field names and values to be shown in the HoverTool of
        the plot. The dictionary passed in is not modified.
    collect_all : bool
        Whether to collect all values or downsample before plotting.
    n_divisions : int
        Factor by which to downsample (default value = 500). A lower input
        results in fewer output datapoints.

    Returns
    -------
    :class:`bokeh.plotting.figure.Figure`
    """
    import bisect

    if locus is None:
        locus = pvals._indices.source.locus

    # Work on a private copy: entries are added and values are replaced by
    # collected lists below, which must not leak into the caller's dict.
    hover_fields = {} if hover_fields is None else dict(hover_fields)
    hover_fields['locus'] = hail.str(locus)

    pvals = -hail.log10(pvals)

    if collect_all:
        res = hail.tuple([locus.global_position(), pvals, hail.struct(**hover_fields)]).collect()
        hf_struct = [point[2] for point in res]
        for key in hover_fields:
            hover_fields[key] = [item[key] for item in hf_struct]
    else:
        agg_f = pvals._aggregation_method()
        res = agg_f(aggregators.downsample(locus.global_position(), pvals,
                                           label=hail.array([hail.str(x) for x in hover_fields.values()]),
                                           n_divisions=n_divisions))
        # Labels come back as arrays ordered like the hover_fields keys.
        fields = [point[2] for point in res]
        for idx, key in enumerate(list(hover_fields.keys())):
            hover_fields[key] = [field[idx] for field in fields]

    x = [point[0] for point in res]
    y = [point[1] for point in res]
    # Undo the -log10 transform so the tooltip shows the raw p-value.
    y_linear = [10 ** (-p) for p in y]
    hover_fields['p_value'] = y_linear

    ref = locus.dtype.reference_genome

    # Global start position of every contig, followed by the end point of
    # all contigs.
    total_pos = 0
    start_points = []
    for contig in ref.contigs:
        start_points.append(total_pos)
        total_pos += ref.lengths.get(contig)
    start_points.append(total_pos)

    # Colour label alternates with the parity of the contig index.
    observed_contigs = set()
    label = []
    for element in x:
        # Index of the contig whose [start, next start) range holds `element`.
        contig_index = bisect.bisect_right(start_points, element) - 1
        label.append(str(contig_index % 2))
        observed_contigs.add(ref.contigs[contig_index])

    # One axis tick per observed contig, placed at the contig's midpoint.
    labels = []
    mid_points = []
    for i, contig in enumerate(ref.contigs):
        if contig in observed_contigs:
            length = ref.lengths.get(contig)
            mid = start_points[i] + length / 2
            # Keep tick positions off whole numbers.
            if mid % 1 == 0:
                mid += 0.5
            mid_points.append(mid)
            labels.append(contig)

    p = scatter(x, y, label=label, title=title, xlabel='Chromosome', ylabel='P-value (-log10 scale)',
                size=size, legend=False, source_fields=hover_fields)

    p.xaxis.ticker = mid_points
    p.xaxis.major_label_overrides = dict(zip(mid_points, labels))
    p.width = 1000

    tooltips = [(key, "@{}".format(key)) for key in hover_fields]
    p.add_tools(HoverTool(
        tooltips=tooltips
    ))

    return p
def locus_windows(locus_expr, radius, coord_expr=None, _localize=True):
    """Returns start and stop indices for window around each locus.

    Examples
    --------

    Windows with 2bp radius for one contig with positions 1, 2, 3, 4, 5:

    >>> starts, stops = hl.linalg.utils.locus_windows(
    ...     hl.balding_nichols_model(1, 5, 5).locus,
    ...     radius=2)
    >>> starts, stops
    (array([0, 0, 0, 1, 2]), array([3, 4, 5, 5, 5]))

    The following examples involve three contigs.

    >>> loci = [{'locus': hl.Locus('1', 1), 'cm': 1.0},
    ...         {'locus': hl.Locus('1', 2), 'cm': 3.0},
    ...         {'locus': hl.Locus('1', 4), 'cm': 4.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('2', 1), 'cm': 2.0},
    ...         {'locus': hl.Locus('3', 3), 'cm': 5.0}]

    >>> ht = hl.Table.parallelize(
    ...         loci,
    ...         hl.tstruct(locus=hl.tlocus('GRCh37'), cm=hl.tfloat64),
    ...         key=['locus'])

    Windows with 1bp radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1)
    (array([0, 0, 2, 3, 3, 5]), array([2, 2, 3, 5, 5, 6]))

    Windows with 1cm radius:

    >>> hl.linalg.utils.locus_windows(ht.locus, 1.0, coord_expr=ht.cm)
    (array([0, 1, 1, 3, 3, 5]), array([1, 3, 3, 5, 5, 6]))

    Notes
    -----
    This function returns two 1-dimensional ndarrays of integers,
    ``starts`` and ``stops``, each of size equal to the number of rows.

    By default, for all indices ``i``, ``[starts[i], stops[i])`` is the maximal
    range of row indices ``j`` such that ``contig[i] == contig[j]`` and
    ``position[i] - radius <= position[j] <= position[i] + radius``.

    If the :meth:`.global_position` on `locus_expr` is not in ascending order,
    this method will fail. Ascending order should hold for a matrix table keyed
    by locus or variant (and the associated row table), or for a table that has
    been ordered by `locus_expr`.

    Set `coord_expr` to use a value other than position to define the windows.
    This row-indexed numeric expression must be non-missing, non-``nan``, on the
    same source as `locus_expr`, and ascending with respect to locus
    position for each contig; otherwise the function will fail.

    The last example above uses centimorgan coordinates, so
    ``[starts[i], stops[i])`` is the maximal range of row indices ``j`` such
    that ``contig[i] == contig[j]`` and
    ``cm[i] - radius <= cm[j] <= cm[i] + radius``.

    Index ranges are start-inclusive and stop-exclusive. This function is
    especially useful in conjunction with
    :meth:`.BlockMatrix.sparsify_row_intervals`.

    Parameters
    ----------
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression on a table or matrix table.
    radius: :obj:`int` or :obj:`float`
        Radius of window for row values. Must be non-negative.
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value.
        Must be on the same table or matrix table as `locus_expr`.
        By default, the row value is given by the locus position.
    _localize : :obj:`bool`
        Internal. If false, the unevaluated starts-and-stops expression is
        returned instead of the tuple of ndarrays.

    Returns
    -------
    (:class:`ndarray` of :obj:`int64`, :class:`ndarray` of :obj:`int64`)
        Tuple of start indices array and stop indices array.

    Raises
    ------
    ValueError
        If `radius` is negative.
    """
    if radius < 0:
        raise ValueError(f"locus_windows: 'radius' must be non-negative, found {radius}")
    check_row_indexed('locus_windows', locus_expr)
    if coord_expr is not None:
        check_row_indexed('locus_windows', coord_expr)

    src = locus_expr._indices.source
    # When `locus_expr` is not itself a field of its source, annotate it
    # (and `coord_expr`, unless that one already is a field) under generated
    # names and continue with the field references on the annotated source.
    if locus_expr not in src._fields_inverse:
        locus = Env.get_uid()
        annotate_fields = {locus: locus_expr}

        if coord_expr is not None:
            if coord_expr not in src._fields_inverse:
                coords = Env.get_uid()
                annotate_fields[coords] = coord_expr
            else:
                # Already a field: reuse its existing name.
                coords = src._fields_inverse[coord_expr]

        if isinstance(src, hl.MatrixTable):
            new_src = src.annotate_rows(**annotate_fields)
        else:
            new_src = src.annotate(**annotate_fields)

        locus_expr = new_src[locus]
        if coord_expr is not None:
            coord_expr = new_src[coords]

    # Default coordinate is the locus position.
    if coord_expr is None:
        coord_expr = locus_expr.position

    rg = locus_expr.dtype.reference_genome
    # Coordinates collected per contig; each group is keyed by the locus at
    # position 1 of that contig.
    contig_group_expr = hl.agg.group_by(hl.locus(locus_expr.contig, 1, reference_genome=rg),
                                        hl.agg.collect(coord_expr))

    # check loci are in sorted order
    # The fold starts at -1 and errors on the first decrease in global
    # position; a missing locus errors while collecting.
    last_pos = hl.fold(lambda a, elt: (hl.case()
                                       .when(a <= elt, elt)
                                       .or_error("locus_windows: 'locus_expr' global position must be in ascending order.")),
                       -1,
                       hl.agg.collect(hl.case()
                                      .when(hl.is_defined(locus_expr), locus_expr.global_position())
                                      .or_error("locus_windows: missing value for 'locus_expr'.")))
    # With no rows the fold result stays at its initial -1, which is reported
    # as an error; otherwise the grouped coordinates pass through.
    checked_contig_groups = (hl.case()
                             .when(last_pos >= 0, contig_group_expr)
                             .or_error("locus_windows: 'locus_expr' has length 0"))

    contig_groups = locus_expr._aggregation_method()(checked_contig_groups, _localize=False)

    # Sort the (contig locus, coordinates) pairs and keep only the
    # coordinate arrays.
    coords = hl.sorted(hl.array(contig_groups)).map(lambda t: t[1])
    starts_and_stops = hl._locus_windows_per_contig(coords, radius)

    if not _localize:
        return starts_and_stops

    starts, stops = hl.eval(starts_and_stops)
    return np.array(starts), np.array(stops)
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s']) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to
    the result of :meth:`.methods.trio_matrix`). In particular, it should have
    the following entry schema:

    - proband_entry
    - father_entry
    - mother_entry

    And the following column schema:

    - proband
    - father
    - mother

    Note
    ----
    Only the contents of `proband_entry`, `father_entry` and `mother_entry`
    are kept as entry fields; all other entry fields are dropped.
    Only the contents of `proband`, `father` and `mother` are kept as column
    fields.

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`,
        `mother_entry` and `father_entry` present)
    col_keys : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""
    # Entries: the three per-member entry structs, in proband/father/mother order.
    trio_entries = hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry])
    tm = tm.select_entries(__trio_entries=trio_entries)

    # Columns: (index, member) pairs in the same order as the entries array.
    trio_members = hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother]))
    tm = tm.select_cols(__trio_members=trio_members)

    # One column per trio member.
    mt = tm.explode_cols(tm['__trio_members'])

    # Each exploded column keeps the entry struct matching its member index.
    member_index = mt['__trio_members'][0]
    mt = mt.select_entries(**mt['__trio_entries'][member_index])

    # Un-key the columns before replacing their fields with the member struct.
    mt = mt.key_cols_by()
    mt = mt.select_cols(**mt['__trio_members'][1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
def field_to_array(ds, field):
    """Return a one-element array holding the name `field` when ``ds[field]``
    is non-zero, and an empty string array otherwise."""
    is_nonzero = ds[field] != 0
    name_only = hl.array([field])
    nothing = hl.empty_array(hl.tstr)
    return hl.cond(is_nonzero, name_only, nothing)
def merge_alleles(alleles) -> ArrayExpression:
    """Flatten the nested `alleles` array and return its distinct elements as an array."""
    flattened = hl.flatten(alleles)
    distinct = hl.set(flattened)
    return hl.array(distinct)
def test_ld_score_regression(self):
    """Run LD score regression on two phenotypes, first from a matrix table
    (one column per phenotype) and then from a table with lists of
    expressions, and compare the estimates to reference values."""
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    def load_sumstats(file_name, phenotype):
        # Import one summary-statistics file, attach the LD scores and the
        # locus/alleles key, and keep only the fields the regression needs.
        ht = hl.import_table(
            doctest_resource(file_name),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})
        ht = ht.annotate(
            chi_squared=ht['Z']**2,
            n=ht['N'],
            ld_score=ht_scores[ht['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht['SNP']]['CHR'],
                           ht_scores[ht['SNP']]['BP']),
            alleles=hl.array([ht['A2'], ht['A1']]),
            phenotype=phenotype)
        ht = ht.key_by(ht['locus'], ht['alleles'])
        return ht.select(ht['chi_squared'], ht['n'], ht['ld_score'], ht['phenotype'])

    def summarize(rows):
        # Flatten the regression output into {phenotype: {statistic: value}}.
        return {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error': x['snp_heritability']['standard_error']}
            for x in rows}

    def check(results, phenotype_key, expected_stats):
        for stat_name, expected_value in expected_stats:
            self.assertAlmostEqual(
                results[phenotype_key][stat_name],
                expected_value,
                places=4)

    expected_50_irnt = [
        ('mean_chi_sq', 3.4386),
        ('intercept_estimate', 0.7727),
        ('intercept_standard_error', 0.2461),
        ('snp_heritability_estimate', 0.3845),
        ('snp_heritability_standard_error', 0.1067),
    ]
    expected_20160 = [
        ('mean_chi_sq', 1.5209),
        ('intercept_estimate', 1.2109),
        ('intercept_standard_error', 0.2238),
        ('snp_heritability_estimate', 0.0486),
        ('snp_heritability_standard_error', 0.0416),
    ]

    ht_50_irnt = load_sumstats('ld_score_regression.50_irnt.sumstats.tsv', '50_irnt')
    ht_20160 = load_sumstats('ld_score_regression.20160.sumstats.tsv', '20160')

    # --- Matrix-table input: one column per phenotype -------------------
    ht = ht_50_irnt.union(ht_20160)
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['phenotype'],
                            row_fields=['ld_score'],
                            col_fields=[])

    # Round-trip through disk before running the regression.
    mt_tmp = new_temp_file()
    mt.write(mt_tmp, overwrite=True)
    mt = hl.read_matrix_table(mt_tmp)

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=mt['ld_score'],
        ld_score_expr=mt['ld_score'],
        chi_sq_exprs=mt['chi_squared'],
        n_samples_exprs=mt['n'],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = summarize(ht_results.collect())
    check(results, '50_irnt', expected_50_irnt)
    check(results, '20160', expected_20160)

    # --- Table input: lists of expressions, results keyed by 0 and 1 ----
    ht = ht_50_irnt.annotate(
        chi_squared_50_irnt=ht_50_irnt['chi_squared'],
        n_50_irnt=ht_50_irnt['n'],
        chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
        n_20160=ht_20160[ht_50_irnt.key]['n'])

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=ht['ld_score'],
        ld_score_expr=ht['ld_score'],
        chi_sq_exprs=[ht['chi_squared_50_irnt'],
                      ht['chi_squared_20160']],
        n_samples_exprs=[ht['n_50_irnt'],
                         ht['n_20160']],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = summarize(ht_results.collect())
    check(results, 0, expected_50_irnt)
    check(results, 1, expected_20160)
def explode_trio_matrix(tm: hl.MatrixTable, col_keys: List[str] = ['s'], keep_trio_cols: bool = True, keep_trio_entries: bool = False) -> hl.MatrixTable:
    """Splits a trio MatrixTable back into a sample MatrixTable.

    Example
    -------
    >>> # Create a trio matrix from a sample matrix
    >>> pedigree = hl.Pedigree.read('data/case_control_study.fam')
    >>> trio_dataset = hl.trio_matrix(dataset, pedigree, complete_trios=True)

    >>> # Explode trio matrix back into a sample matrix
    >>> exploded_trio_dataset = explode_trio_matrix(trio_dataset)

    Notes
    -----
    The resulting MatrixTable column schema is the same as the
    proband/father/mother schema, and the resulting entry schema is the same
    as the proband_entry/father_entry/mother_entry schema.
    If the `keep_trio_cols` option is set, then an additional `source_trio`
    column field is added with the trio column data.
    If the `keep_trio_entries` option is set, then an additional
    `source_trio_entry` entry field is added with the trio entry data.

    Note
    ----
    This assumes that the input MatrixTable is a trio MatrixTable (similar to
    the result of :meth:`.methods.trio_matrix`).
    Its entry schema has to contain `proband_entry`, `father_entry` and
    `mother_entry`, all with the same type.
    Its column schema has to contain `proband`, `father` and `mother`, all
    with the same type.

    Parameters
    ----------
    tm : :class:`.MatrixTable`
        Trio MatrixTable (entries have to be a Struct with `proband_entry`,
        `mother_entry` and `father_entry` present)
    col_keys : :obj:`list` of str
        Column key(s) for the resulting sample MatrixTable
    keep_trio_cols: bool
        Whether to add a `source_trio` column field with the trio column data
        (default `True`)
    keep_trio_entries: bool
        Whether to add a `source_trio_entry` entry field with the trio entry
        data (default `False`)

    Returns
    -------
    :class:`.MatrixTable`
        Sample MatrixTable"""
    # Entries: per-member entry structs in proband/father/mother order,
    # optionally alongside a copy of the whole trio entry.
    entry_fields = {
        '__trio_entries': hl.array([tm.proband_entry, tm.father_entry, tm.mother_entry]),
    }
    if keep_trio_entries:
        entry_fields['source_trio_entry'] = hl.struct(**tm.entry)
    tm = tm.select_entries(**entry_fields)

    # Un-key the columns so that the select below can replace every column field.
    tm = tm.key_cols_by()

    # Columns: (index, member) pairs in the same order as the entries array,
    # optionally alongside a copy of the whole trio column.
    col_fields = {
        '__trio_members': hl.zip_with_index(hl.array([tm.proband, tm.father, tm.mother])),
    }
    if keep_trio_cols:
        col_fields['source_trio'] = hl.struct(**tm.col)
    tm = tm.select_cols(**col_fields)

    # One column per trio member.
    mt = tm.explode_cols(tm['__trio_members'])

    # Replace the entries array by the entry struct of this column's member.
    member_index = mt['__trio_members'][0]
    mt = mt.transmute_entries(**mt['__trio_entries'][member_index])

    # Replace the (index, member) pair by the member's own fields.
    mt = mt.key_cols_by()
    mt = mt.transmute_cols(**mt['__trio_members'][1])

    if col_keys:
        mt = mt.key_cols_by(*col_keys)

    return mt
def fields_to_array(ds, fields):
    """Return an array expression with the names of those `fields` for which
    `field_to_array` yields a non-empty array on `ds`."""
    per_field_arrays = [field_to_array(ds, name) for name in fields]
    nested = hl.array(per_field_arrays)
    return hl.flatten(nested)