def summary_ccr(ht_ccr: hl.Table,
                file_output: str,
                ccr_pct_start: int = 0,
                ccr_pct_end: int = 100,
                ccr_pct_bins: int = 10,
                cumulative_histogram: bool = False,
                ccr_pct_cutoffs=None) -> None:
    """
    Write a per-gene summary of the ``ccr_pct`` field of a CCR table.

    Rows are grouped by the ``gene`` field. Depending on
    ``cumulative_histogram`` the exported table holds either:

    * one ``ccr_above_<cutoff>`` column per cut-off, counting the rows whose
      ``ccr_pct`` is greater than or equal to that cut-off, or
    * one ``ccr_bin_<lower>_<upper>`` column per histogram bin, holding the
      bin frequency computed by ``agg.hist``.

    :param ht_ccr: CCR Hail table (must expose ``gene`` and ``ccr_pct``)
    :param file_output: Path the summarized table is exported to
    :param ccr_pct_start: Start of the histogram range
    :param ccr_pct_end: End of the histogram range
    :param ccr_pct_bins: Number of histogram bins
    :param cumulative_histogram: Produce counts above cut-offs instead of a
        binned histogram
    :param ccr_pct_cutoffs: Cut-offs for the cumulative counts; defaults to
        ``[90, 95, 99]`` when ``None``
    :return: None
    """
    cutoffs = [90, 95, 99] if ccr_pct_cutoffs is None else ccr_pct_cutoffs
    per_gene = ht_ccr.group_by('gene')

    if cumulative_histogram:
        # One filtered row count per requested cut-off.
        above_cutoff_counts = {}
        for cutoff in cutoffs:
            column = 'ccr_above_' + str(cutoff)
            above_cutoff_counts[column] = agg.filter(ht_ccr.ccr_pct >= cutoff, agg.count())
        summary_tb = per_gene.aggregate(**above_cutoff_counts)
    else:
        binned = per_gene.aggregate(
            ccr_bins=agg.hist(ht_ccr.ccr_pct, ccr_pct_start, ccr_pct_end, ccr_pct_bins)
        )

        # Pull the bin edges from a single row into a local list
        # (n_bins + 1 values are expected).
        edges = binned.aggregate(agg.take(binned.ccr_bins.bin_edges, 1))[0]

        # Turn each entry of the bin_freq array into its own column, named
        # after the pair of edges that delimits the bin.
        per_bin_counts = {}
        for idx, (lower, upper) in enumerate(zip(edges, edges[1:])):
            column = 'ccr_bin_' + str(lower) + '_' + str(upper)
            per_bin_counts[column] = binned.ccr_bins.bin_freq[idx]
        flattened = binned.annotate(**per_bin_counts).flatten()

        # The array-valued fields are redundant once unpacked.
        summary_tb = flattened.drop('ccr_bins.bin_edges', 'ccr_bins.bin_freq')

    # Export summarized table
    summary_tb.export(output=file_output)
def test_aggregate2(self):
    """Aggregate a two-row table grouped by ``status`` and compare every
    aggregator's output (x1..x19) against hard-coded expected values."""
    row_type = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)
    records = [
        {'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
        {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13},
    ]
    table = hl.Table.parallelize(records, row_type)

    # Constant struct fed to the struct-valued aggregations (x15, x16).
    fruit = hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple'))

    def collect_each(elt):
        # Per-element aggregation used by every agg.explode call below.
        return agg.collect(elt)

    aggregated = table.group_by(status=table.status).aggregate(
        x1=agg.collect(table.qPheno * 2),
        x2=agg.explode(collect_each, [table.qPheno, table.qPheno + 1]),
        x3=agg.min(table.qPheno),
        x4=agg.max(table.qPheno),
        x5=agg.sum(table.qPheno),
        x6=agg.product(hl.int64(table.qPheno)),
        x7=agg.count(),
        x8=agg.count_where(table.qPheno == 3),
        x9=agg.fraction(table.qPheno == 1),
        x10=agg.stats(hl.float64(table.qPheno)),
        x11=agg.hardy_weinberg_test(table.GT),
        x13=agg.inbreeding(table.GT, 0.1),
        x14=agg.call_stats(table.GT, ["A", "T"]),
        x15=agg.collect(fruit)[0],
        x16=agg.collect(fruit.c.banana)[0],
        x17=agg.explode(collect_each, hl.null(hl.tarray(hl.tint32))),
        x18=agg.explode(collect_each, hl.null(hl.tset(hl.tint32))),
        x19=agg.take(table.GT, 1, ordering=-table.qPheno),
    )
    first_group = aggregated.take(1)[0]
    result = convert_struct_to_dict(first_group)

    expected = {
        'status': 0,
        'x1': [6, 26],
        'x2': [3, 4, 13, 14],
        'x3': 3,
        'x4': 13,
        'x5': 16,
        'x6': 39,
        'x7': 2,
        'x8': 1,
        'x9': 0.0,
        'x10': {'min': 3.0, 'max': 13.0, 'sum': 16.0, 'stdev': 5.0, 'n': 2, 'mean': 8.0},
        'x11': {'het_freq_hwe': 0.5, 'p_value': 0.5},
        'x13': {'n_called': 2, 'expected_homs': 1.64, 'f_stat': -1.777777777777777, 'observed_homs': 1},
        'x14': {'AC': [3, 1], 'AF': [0.75, 0.25], 'AN': 4, 'homozygote_count': [1, 0]},
        'x15': {'a': 5, 'c': {'banana': 'apple'}, 'b': 'foo'},
        'x16': 'apple',
        'x17': [],
        'x18': [],
        'x19': [hl.Call([0, 1])],
    }

    # Show the complete dict diff if the comparison fails.
    self.maxDiff = None
    self.assertDictEqual(result, expected)