Example #1
0
    def test_from_entry_expr(self):
        mt = get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self._assert_eq(a1, a2)
        self._assert_eq(a1, a3)

        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self._assert_eq(a1, a4)
Example #2
0
    def test_aggregate2(self):
        schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

        rows = [{'status': 0, 'GT': hl.Call([0, 0]), 'qPheno': 3},
                {'status': 0, 'GT': hl.Call([0, 1]), 'qPheno': 13}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(
            kt.group_by(status=kt.status)
                .aggregate(
                x1=agg.collect(kt.qPheno * 2),
                x2=agg.explode(lambda elt: agg.collect(elt), [kt.qPheno, kt.qPheno + 1]),
                x3=agg.min(kt.qPheno),
                x4=agg.max(kt.qPheno),
                x5=agg.sum(kt.qPheno),
                x6=agg.product(hl.int64(kt.qPheno)),
                x7=agg.count(),
                x8=agg.count_where(kt.qPheno == 3),
                x9=agg.fraction(kt.qPheno == 1),
                x10=agg.stats(hl.float64(kt.qPheno)),
                x11=agg.hardy_weinberg_test(kt.GT),
                x13=agg.inbreeding(kt.GT, 0.1),
                x14=agg.call_stats(kt.GT, ["A", "T"]),
                x15=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
                x16=agg.collect(hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')).c.banana)[0],
                x17=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tarray(hl.tint32))),
                x18=agg.explode(lambda elt: agg.collect(elt), hl.null(hl.tset(hl.tint32))),
                x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)
            ).take(1)[0])

        expected = {u'status': 0,
                    u'x13': {u'n_called': 2, u'expected_homs': 1.64, u'f_stat': -1.777777777777777,
                             u'observed_homs': 1},
                    u'x14': {u'AC': [3, 1], u'AF': [0.75, 0.25], u'AN': 4, u'homozygote_count': [1, 0]},
                    u'x15': {u'a': 5, u'c': {u'banana': u'apple'}, u'b': u'foo'},
                    u'x10': {u'min': 3.0, u'max': 13.0, u'sum': 16.0, u'stdev': 5.0, u'n': 2, u'mean': 8.0},
                    u'x8': 1, u'x9': 0.0, u'x16': u'apple',
                    u'x11': {u'het_freq_hwe': 0.5, u'p_value': 0.5},
                    u'x2': [3, 4, 13, 14], u'x3': 3, u'x1': [6, 26], u'x6': 39, u'x7': 2, u'x4': 13, u'x5': 16,
                    u'x17': [],
                    u'x18': [],
                    u'x19': [hl.Call([0, 1])]}

        self.maxDiff = None

        self.assertDictEqual(result, expected)
Example #3
0
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #4
0
    def test_export_plink_exprs(self):
        ds = get_dataset()
        fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                       'f4': 'is_female', 'f5': 'pheno'}
        bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                       'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                                 (fam1.mat_id == "0") & (fam1.is_female == "0") &
                                 (fam1.pheno == "NA")))
        self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                                 (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                        mat_id="nada", is_female=True, pheno=False)
        fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

        self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                                 (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                                 (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

        self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                                 (fam3.mat_id == "0") & (fam3.is_female == "0") &
                                 (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
Example #5
0
def parse_as_doubles(string, has_non_ref):
    """Parse a pipe-delimited string field into an array of float64 values, treating empty entries and '.' as missing."""
    values = string.split(r'\|')
    # Drop the trailing entry when has_non_ref is set.
    values = hl.cond(has_non_ref, values[:-1], values)
    return values.map(lambda v: hl.cond(
        (hl.len(v) == 0) | (v == '.'), hl.null(hl.tfloat64), hl.float64(v)))
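For illustration, a hypothetical round trip through the parser above; the input string is made up, and reading the dropped trailing value as a <NON_REF> entry is an assumption based on the parameter name.

import hail as hl

# Made-up pipe-delimited numeric field; '.' and empty entries become missing.
vals = parse_as_doubles(hl.str('0.5|.|1.25'), hl.bool(False))
print(hl.eval(vals))  # [0.5, None, 1.25]

# With has_non_ref=True the trailing value (presumably for <NON_REF>) is dropped.
print(hl.eval(parse_as_doubles(hl.str('0.5|.|1.25'), hl.bool(True))))  # [0.5, None]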
Example #6
0
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.missing(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl.nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Example #7
0
def _to_expr(e, dtype):
    if e is None:
        return None
    elif isinstance(e, Expression):
        if e.dtype != dtype:
            assert is_numeric(dtype), 'expected {}, got {}'.format(
                dtype, e.dtype)
            if dtype == tfloat64:
                return hl.float64(e)
            elif dtype == tfloat32:
                return hl.float32(e)
            elif dtype == tint64:
                return hl.int64(e)
            else:
                assert dtype == tint32
                return hl.int32(e)
        return e
    elif not is_compound(dtype):
        # these are not container types and cannot contain expressions if we got here
        return e
    elif isinstance(dtype, tstruct):
        new_fields = []
        found_expr = False
        for f, t in dtype.items():
            value = _to_expr(e[f], t)
            found_expr = found_expr or isinstance(value, Expression)
            new_fields.append(value)

        if not found_expr:
            return e
        else:
            exprs = [
                new_fields[i] if isinstance(new_fields[i], Expression) else
                hl.literal(new_fields[i], dtype[i])
                for i in range(len(new_fields))
            ]
            fields = {name: expr for name, expr in zip(dtype.keys(), exprs)}
            from .typed_expressions import StructExpression
            return StructExpression._from_fields(fields)

    elif isinstance(dtype, tarray):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert (len(elements) > 0)
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
        x = ir.MakeArray([e._ir for e in exprs], None)
        return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tset):
        elements = []
        found_expr = False
        for element in e:
            value = _to_expr(element, dtype.element_type)
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            assert (len(elements) > 0)
            exprs = [
                element if isinstance(element, Expression) else hl.literal(
                    element, dtype.element_type) for element in elements
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.ToSet(
                ir.ToStream(ir.MakeArray([e._ir for e in exprs], None)))
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, ttuple):
        elements = []
        found_expr = False
        assert len(e) == len(dtype.types)
        for i in range(len(e)):
            value = _to_expr(e[i], dtype.types[i])
            found_expr = found_expr or isinstance(value, Expression)
            elements.append(value)
        if not found_expr:
            return e
        else:
            exprs = [
                elements[i] if isinstance(elements[i], Expression) else
                hl.literal(elements[i], dtype.types[i])
                for i in range(len(elements))
            ]
            indices, aggregations = unify_all(*exprs)
            x = ir.MakeTuple([expr._ir for expr in exprs])
            return expressions.construct_expr(x, dtype, indices, aggregations)
    elif isinstance(dtype, tdict):
        keys = []
        values = []
        found_expr = False
        for k, v in e.items():
            k_ = _to_expr(k, dtype.key_type)
            v_ = _to_expr(v, dtype.value_type)
            found_expr = found_expr or isinstance(k_, Expression)
            found_expr = found_expr or isinstance(v_, Expression)
            keys.append(k_)
            values.append(v_)
        if not found_expr:
            return e
        else:
            assert len(keys) > 0
            # Here I use `to_expr` to call `lit` on the keys and values separately.
            # I anticipate a common mode is statically-known keys and Expression
            # values.
            key_array = to_expr(keys, tarray(dtype.key_type))
            value_array = to_expr(values, tarray(dtype.value_type))
            return hl.dict(hl.zip(key_array, value_array))
    elif isinstance(dtype, hl.tndarray):
        return hl.nd.array(e)
    else:
        raise NotImplementedError(dtype)
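Below is a minimal sketch, using only public Hail calls, of the conversion the `tstruct` and `tarray` branches above perform: plain Python values are lifted with `hl.literal` while fields that are already expressions are left alone, and the result is rebuilt as a single expression. It illustrates the behavior, not the internal code path.

import hail as hl

# A struct mixing a plain Python value with an Expression: the int is lifted to a
# literal and the whole thing becomes one StructExpression.
s = hl.struct(a=5, b=hl.int32(2) + 1)
print(hl.eval(s))  # Struct(a=5, b=3)

# The same lifting applies to containers, e.g. an array with one Expression element.
arr = hl.array([1, 2, hl.int32(3)])
print(hl.eval(arr))  # [1, 2, 3]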
Example #8
0
import os

import hail as hl


def dmg_case_control_filter(mt, chrm='0', start=0, end=0, out_dir=os.getcwd(), name=""):
	"""Filter a matrix table annotated with ANNOVAR down to damaging variants for a case-control test."""
	#convert chrm number to string if input is integer
	chrm = str(chrm)
	
	if name == "":
		name = "case_control_filtering_{}_{}-{}".format(chrm,start,end)
	
	child_dir = os.path.join(out_dir,"case_control_filtering",name)
	os.makedirs(child_dir)
	
	out_f = os.path.join(child_dir,"case_control_filtering_{}_{}-{}.out".format(chrm,start,end))
	with open(out_f, "w") as f:
		#Import matrix table already annotated with bravo and MetaSVM (In this case, using annovar)
		mt = hl.read_matrix_table(mt)
		#Make annotations easier for filtering. If no bravo frequency for variants, set bravo freq to 0
		mt = mt.annotate_rows(bravo=hl.cond(hl.is_defined(mt.info["bravo"][0]), hl.float64(mt.info["bravo"][0]), 0.0))
		mt = mt.annotate_rows(meta_svm_pred=mt.info.MetaSVM_pred[0])

		db_ht = hl.read_table('/gpfs/ycga/project/kahle/sp2349/datasets/dbNSFP/dbNSFp4.1a/dbNSFP4.1a_chr_all_GRCh37.ht')

		#Annotate mt with CADD16 scores. Original vcf was annotated with CADD13
		mt = mt.annotate_rows(cadd16=hl.cond(hl.is_defined(db_ht[mt.row_key]), db_ht[mt.row_key].CADD_phred_hg19, 0.0))

		#Subset probands for case_control

		# Step 1: Create a text file with the sample IDs you want to keep and import that text file as a hail table. 
		table = (hl.import_table('/gpfs/ycga/project/kahle/sp2349/moyamoya/case_control/cc_output/final_probands_for_caseControl.txt', impute=True).key_by('Sample'))

		#The IDs_keep.txt file has the following format (including headers)
		# Sample        should_retain
		# 1-00005       yes
		# 1-00187       yes
		# 1-00252       yes
		# 1-00386       yes
		# 1-00668       yes
		# etc etc

		# Annotate columns of the matrix table with the retention flag for each sample ID
		mt = mt.annotate_cols(should_retain=table[mt.s].should_retain)

		# Keep only the samples flagged for retention
		mt = mt.filter_cols(mt.should_retain == 'yes', keep=True)
		
		sample_count = mt.cols().count()
		print("Sample count: {}".format(sample_count),file=f)
		print("Total allele count: {}".format(sample_count*2),file=f)

		#Filter on Bravo frequency
		mt_filtered = mt.filter_rows(mt.bravo <= 0.0005)

		#Filter on the requested genomic interval (chrm:start-end)
		mt_filtered = mt_filtered.filter_rows((mt_filtered.locus >= hl.locus(chrm, start)) & (mt_filtered.locus <= hl.locus(chrm, end)))

		print("Unique variants post bravo, CADD, and MetaSVM: {}".format(mt_filtered.rows().count()),file=f)

		#Filter for exonic and splice-site variants only
		mt_filtered = mt_filtered.filter_rows((mt_filtered.vep.most_severe_consequence == "stop_gained") | 
											  (mt_filtered.vep.most_severe_consequence == "splice_acceptor_variant") | 
											  (mt_filtered.vep.most_severe_consequence =="splice_donor_variant") | 
											  (mt_filtered.vep.most_severe_consequence == "frameshift_variant") | 
											  (mt_filtered.vep.most_severe_consequence =="stop_lost") | 
											  (mt_filtered.vep.most_severe_consequence =="start_lost") | 
											  ((mt_filtered.vep.most_severe_consequence =='missense_variant') & (mt_filtered.cadd16 >=20)) |
											  ((mt_filtered.vep.most_severe_consequence =='missense_variant') & (mt_filtered.meta_svm_pred == "D")) |
											  ((mt_filtered.vep.most_severe_consequence =='protein_altering_variant') & (mt_filtered.cadd16 >=20)) |
											  ((mt_filtered.vep.most_severe_consequence =='protein_altering_variant') & (mt_filtered.meta_svm_pred == "D")))
		print("Variants: {} kept".format(mt_filtered.count()))

		mt_filtered.count()

		#Convert matrix table to table for easier dropping of homozygous reference samples
		mt_filtered = mt_filtered.key_cols_by()
		mt_filtered_table = mt_filtered.entries()

		#Drop entries with homozygous reference (WT) calls
		mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.GT.is_hom_ref(), keep=False)
		#mt_filtered_table.show()

		#Filter entries on DP > 9 and GQ > 19 (i.e. DP >= 10 and GQ >= 20)
		mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.DP > 9)
		mt_filtered_table = mt_filtered_table.filter(mt_filtered_table.GQ > 19)
		
		#Write the filtered entries table to disk
		mt_filtered_table.write(os.path.join(out_dir, 'Damaging_Cases_chr{}_{}-{}.ht'.format(chrm, start, end)), overwrite=True)
		
		print("Total Variants post filtering: {}".format(mt_filtered_table.count()),file=f)
		
		print("Total Cases (Alleles): {}".format(mt_filtered_table.aggregate(hl.agg.sum(mt_filtered_table.GT.n_alt_alleles()))),file=f)
		print("Total Homozygous Cases: {}".format(mt_filtered_table.aggregate(hl.agg.count_where(mt_filtered_table.GT.is_hom_var() == True))),file=f)
		print("Samples with variants: {}".format(mt_filtered_table.aggregate(hl.agg.counter(mt_filtered_table.s))),file=f)
	
		#Write to text file
		df = mt_filtered_table.to_pandas()
		df.to_csv(os.path.join(out_dir,'Damaging_Cases_chr{}_{}-{}_table.txt'.format(chrm,start,end)),sep="\t")
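A hypothetical invocation of the function above; every argument value here is an illustrative placeholder, not something taken from the source.

import os

# Placeholder inputs for illustration only; substitute a real annotated MatrixTable
# path, chromosome, coordinates, and output directory.
dmg_case_control_filter(
    mt='path/to/annotated_cohort.mt',
    chrm='5',
    start=1000000,
    end=2000000,
    out_dir=os.getcwd(),
    name='example_region',
)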
Example #9
0
    def test_aggregate2(self):
        schema = hl.tstruct(status=hl.tint32, GT=hl.tcall, qPheno=hl.tint32)

        rows = [{
            'status': 0,
            'GT': hl.Call([0, 0]),
            'qPheno': 3
        }, {
            'status': 0,
            'GT': hl.Call([0, 1]),
            'qPheno': 13
        }]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(
            kt.group_by(status=kt.status).aggregate(
                x1=agg.collect(kt.qPheno * 2),
                x2=agg.collect(agg.explode([kt.qPheno, kt.qPheno + 1])),
                x3=agg.min(kt.qPheno),
                x4=agg.max(kt.qPheno),
                x5=agg.sum(kt.qPheno),
                x6=agg.product(hl.int64(kt.qPheno)),
                x7=agg.count(),
                x8=agg.count_where(kt.qPheno == 3),
                x9=agg.fraction(kt.qPheno == 1),
                x10=agg.stats(hl.float64(kt.qPheno)),
                x11=agg.hardy_weinberg_test(kt.GT),
                x13=agg.inbreeding(kt.GT, 0.1),
                x14=agg.call_stats(kt.GT, ["A", "T"]),
                x15=agg.collect(
                    hl.Struct(a=5, b="foo", c=hl.Struct(banana='apple')))[0],
                x16=agg.collect(
                    hl.Struct(a=5, b="foo",
                              c=hl.Struct(banana='apple')).c.banana)[0],
                x17=agg.collect(agg.explode(hl.null(hl.tarray(hl.tint32)))),
                x18=agg.collect(agg.explode(hl.null(hl.tset(hl.tint32)))),
                x19=agg.take(kt.GT, 1, ordering=-kt.qPheno)).take(1)[0])

        expected = {
            u'status': 0,
            u'x13': {
                u'n_called': 2,
                u'expected_homs': 1.64,
                u'f_stat': -1.777777777777777,
                u'observed_homs': 1
            },
            u'x14': {
                u'AC': [3, 1],
                u'AF': [0.75, 0.25],
                u'AN': 4,
                u'homozygote_count': [1, 0]
            },
            u'x15': {
                u'a': 5,
                u'c': {
                    u'banana': u'apple'
                },
                u'b': u'foo'
            },
            u'x10': {
                u'min': 3.0,
                u'max': 13.0,
                u'sum': 16.0,
                u'stdev': 5.0,
                u'n': 2,
                u'mean': 8.0
            },
            u'x8': 1,
            u'x9': 0.0,
            u'x16': u'apple',
            u'x11': {
                u'het_freq_hwe': 0.5,
                u'p_value': 0.5
            },
            u'x2': [3, 4, 13, 14],
            u'x3': 3,
            u'x1': [6, 26],
            u'x6': 39,
            u'x7': 2,
            u'x4': 13,
            u'x5': 16,
            u'x17': [],
            u'x18': [],
            u'x19': [hl.Call([0, 1])]
        }

        self.maxDiff = None

        self.assertDictEqual(result, expected)
Example #10
0
for phen in gwas_phens:
    print('####################')
    print('Starting phen ' + phen)
    print('####################')
    phen_starttime = datetime.datetime.now()

    phen_tb = tb.select(tb['"' + phen + '"']).rename(
        {'"' + phen + '"': 'phen'})
    mt1 = variants.annotate_cols(
        phen_str=hl.str(phen_tb[variants.s]['phen']).replace('\"', ''))
    mt1 = mt1.filter_cols(mt1.phen_str == '', keep=False)
    if phen_tb.phen.dtype == hl.dtype('bool'):
        mt1 = mt1.annotate_cols(phen=hl.bool(mt1.phen_str)).drop('phen_str')
    else:
        mt1 = mt1.annotate_cols(phen=hl.float64(mt1.phen_str)).drop('phen_str')

    for sex in ['female', 'male']:
        mt2 = mt1.filter_cols(mt1.isFemale == (sex == 'female'))

        mt3 = mt2.add_col_index()
        mt3 = mt3.rename({'dosage': 'x', 'phen': 'y'})

        n_samples = mt3.count_cols()
        print('\n>>> phen ' + sex + ' ' + phen + ': N samples = ' +
              str(n_samples) + ' <<<')

        mt = mt3

        cov_list = [
            mt['isFemale'], mt['age'], mt['age_squared'], mt['age_isFemale'],
Example #11
0
def compute_ranked_bin(
    ht: hl.Table,
    score_expr: hl.expr.NumericExpression,
    bin_expr: Dict[str, hl.expr.BooleanExpression] = {"bin": True},
    compute_snv_indel_separately: bool = True,
    n_bins: int = 100,
    desc: bool = True,
) -> hl.Table:
    r"""
    Return a table with a bin for each row based on the ranking of `score_expr`.

    The bin is computed by dividing the `score_expr` into `n_bins` bins containing approximately equal numbers of elements.
    This is done by ranking the rows by `score_expr` (and a random number in cases where multiple variants have the same score)
    and then assigning the variant to a bin based on its ranking.

    If `compute_snv_indel_separately` is True, all items in `bin_expr` will be stratified by SNVs / indels for the
    ranking and bin calculation. Because SNV and indel rows are mutually exclusive, they are re-combined into a single
    annotation. For example, if we have the following four variants and scores and `n_bins` of 2:

    ========   =======   ======   =================   =================
    Variant    Type      Score    bin - `compute_snv_indel_separately`:
    --------   -------   ------   -------------------------------------
    \          \         \        False               True
    ========   =======   ======   =================   =================
    Var1       SNV       0.1      1                   1
    Var2       SNV       0.2      1                   2
    Var3       Indel     0.3      2                   1
    Var4       Indel     0.4      2                   2
    ========   =======   ======   =================   =================

    .. note::

        The `bin_expr` defines which data the bin(s) should be computed on. E.g., to get biallelic specific binning
        and singleton specific binning, the following could be used:

        .. code-block:: python

            bin_expr={
                'biallelic_bin': ~ht.was_split,
                'singleton_bin': ht.singleton
            }

    :param ht: Input Table
    :param score_expr: Expression containing the score
    :param bin_expr: Specific row grouping(s) to perform ranking and binning on (see note)
    :param compute_snv_indel_separately: Should all `bin_expr` items be stratified by SNVs / indels
    :param n_bins: Number of bins to bin the data into
    :param desc: Whether to bin the score in descending order
    :return: Table with the requested bin annotations
    """
    if compute_snv_indel_separately:
        # For each bin, add a SNV / indel stratification
        bin_expr = {
            f"{bin_id}_{snv}": (bin_expr & snv_expr)
            for bin_id, bin_expr in bin_expr.items() for snv, snv_expr in [
                ("snv", hl.is_snp(ht.alleles[0], ht.alleles[1])),
                ("indel", ~hl.is_snp(ht.alleles[0], ht.alleles[1])),
            ]
        }

    bin_ht = ht.select(
        **{
            f"_filter_{bin_id}": bin_expr
            for bin_id, bin_expr in bin_expr.items()
        },
        _score=score_expr,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        _rand=hl.rand_unif(0, 1),
    )

    logger.info(
        "Sorting the HT by score_expr followed by a random float between 0 and 1. "
        "Then adding a row index per grouping defined by bin_expr...")
    bin_ht = bin_ht.order_by("_score", "_rand")
    bin_ht = bin_ht.annotate(
        **{
            f"{bin_id}_rank": hl.or_missing(
                bin_ht[f"_filter_{bin_id}"],
                hl.scan.count_where(bin_ht[f"_filter_{bin_id}"]),
            )
            for bin_id in bin_expr
        })
    bin_ht = bin_ht.key_by("locus", "alleles")

    # Annotate globals with variant counts per group defined by bin_expr. This is used to determine bin assignment
    bin_ht = bin_ht.annotate_globals(bin_group_variant_counts=bin_ht.aggregate(
        hl.Struct(
            **{
                bin_id: hl.agg.filter(
                    bin_ht[f"_filter_{bin_id}"],
                    hl.agg.count(),
                )
                for bin_id in bin_expr
            })))

    logger.info("Binning ranked rows into %d bins...", n_bins)
    bin_ht = bin_ht.select(
        "snv",
        **{
            bin_id: hl.int(
                hl.floor(
                    (n_bins *
                     (bin_ht[f"{bin_id}_rank"] /
                      hl.float64(bin_ht.bin_group_variant_counts[bin_id]))) +
                    1))
            for bin_id in bin_expr
        },
    )

    if desc:
        bin_ht = bin_ht.annotate(
            **{bin_id: n_bins - bin_ht[bin_id] + 1
               for bin_id in bin_expr})

    # Because SNV and indel rows are mutually exclusive, re-combine them into a single bin.
    # Update the global bin_group_variant_counts struct to reflect the change in bin names in the table
    if compute_snv_indel_separately:
        bin_expr_no_snv = {
            bin_id.rsplit("_", 1)[0]
            for bin_id in bin_ht.bin_group_variant_counts
        }
        bin_ht = bin_ht.annotate_globals(bin_group_variant_counts=hl.struct(
            **{
                bin_id: hl.struct(
                    **{
                        snv: bin_ht.bin_group_variant_counts[f"{bin_id}_{snv}"]
                        for snv in ["snv", "indel"]
                    })
                for bin_id in bin_expr_no_snv
            }))

        bin_ht = bin_ht.transmute(
            **{
                bin_id: hl.if_else(
                    bin_ht.snv,
                    bin_ht[f"{bin_id}_snv"],
                    bin_ht[f"{bin_id}_indel"],
                )
                for bin_id in bin_expr_no_snv
            })

    return bin_ht
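A toy usage sketch of `compute_ranked_bin`, assuming the function above (and the module-level `logger` it relies on) is importable; the variant table below is fabricated purely for illustration.

import hail as hl

# Tiny fabricated variant table: two SNVs and two indels with made-up scores.
ht = hl.Table.parallelize(
    [
        {'locus': hl.Locus('1', 100, reference_genome='GRCh37'), 'alleles': ['A', 'T'], 'score': 0.1},
        {'locus': hl.Locus('1', 200, reference_genome='GRCh37'), 'alleles': ['A', 'G'], 'score': 0.2},
        {'locus': hl.Locus('1', 300, reference_genome='GRCh37'), 'alleles': ['A', 'AT'], 'score': 0.3},
        {'locus': hl.Locus('1', 400, reference_genome='GRCh37'), 'alleles': ['AT', 'A'], 'score': 0.4},
    ],
    hl.tstruct(locus=hl.tlocus('GRCh37'), alleles=hl.tarray(hl.tstr), score=hl.tfloat64),
    key=['locus', 'alleles'],
)

# Rank by score into 2 bins; SNVs and indels are binned separately by default
# and then re-combined into the single 'bin' annotation.
binned = compute_ranked_bin(ht, score_expr=ht.score, bin_expr={'bin': True}, n_bins=2)
binned.show()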
Example #12
0
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls not missing or filtered.
      Equivalent to `n_called` divided by :meth:`.count_rows`.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_filtered` (``int64``) -- Number of filtered entries.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of `n_het` and `n_hom_var`.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.cond(
                at == allele_ints['SNP'],
                hl.cond(hl.is_transition(ref, alt), allele_ints['Transition'],
                        allele_ints['Transversion']), at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(
        **{
            variant_ac:
            hl.agg.call_stats(mt.GT, mt.alleles).AC,
            variant_atypes:
            mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))
        })

    bound_exprs = {}
    gq_dp_exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        gq_dp_exprs['dp_stats'] = hl.agg.stats(mt.DP).select(
            'mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        gq_dp_exprs['gq_stats'] = hl.agg.stats(mt.GQ).select(
            'mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError(
            f"'sample_qc': expect an entry field 'GT' of type 'call'")

    bound_exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    bound_exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))

    n_rows_ref = hl.expr.construct_expr(
        hl.ir.Ref('n_rows'), hl.tint64, mt._col_indices,
        hl.utils.LinkedList(hl.expr.Aggregation))
    bound_exprs['n_filtered'] = n_rows_ref - hl.agg.count()
    bound_exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    bound_exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.sum(
            hl.range(0, mt['GT'].ploidy).map(
                lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1],
                       hl.null(hl.tint32))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0,
                 mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    zero = hl.int64(0)

    result_struct = hl.rbind(hl.struct(**bound_exprs),
        lambda x: hl.rbind(
            hl.struct(**{
                **gq_dp_exprs,
                'call_rate': hl.float64(x.n_called) / (x.n_called + x.n_not_called + x.n_filtered),
                'n_called': x.n_called,
                'n_not_called': x.n_not_called,
                'n_filtered': x.n_filtered,
                'n_hom_ref': x.n_hom_ref,
                'n_het': x.n_het,
                'n_hom_var': x.n_called - x.n_hom_ref - x.n_het,
                'n_non_ref': x.n_called - x.n_hom_ref,
                'n_singleton': x.n_singleton,
                'n_snp': x.allele_type_counts.get(allele_ints["Transition"], zero) + \
                         x.allele_type_counts.get(allele_ints["Transversion"], zero),
                'n_insertion': x.allele_type_counts.get(allele_ints["Insertion"], zero),
                'n_deletion': x.allele_type_counts.get(allele_ints["Deletion"], zero),
                'n_transition': x.allele_type_counts.get(allele_ints["Transition"], zero),
                'n_transversion': x.allele_type_counts.get(allele_ints["Transversion"], zero),
                'n_star': x.allele_type_counts.get(allele_ints["Star"], zero)
            }),
            lambda s: s.annotate(
                r_ti_tv=divide_null(hl.float64(s.n_transition), s.n_transversion),
                r_het_hom_var=divide_null(hl.float64(s.n_het), s.n_hom_var),
                r_insertion_deletion=divide_null(hl.float64(s.n_insertion), s.n_deletion)
            )))

    mt = mt.annotate_cols(**{name: result_struct})
    mt = mt.drop(variant_ac, variant_atypes)

    return mt
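A minimal runnable sketch of using `sample_qc` on simulated data; the 0.95 call-rate threshold is illustrative, not from the source.

import hail as hl

# Simulate a small GT-only dataset, compute per-sample metrics, and filter
# samples by call rate (threshold chosen only for illustration).
mt = hl.balding_nichols_model(n_populations=3, n_samples=10, n_variants=100)
mt = hl.sample_qc(mt, name='sample_qc')
mt = mt.filter_cols(mt.sample_qc.call_rate > 0.95)
print(mt.count_cols())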
Example #13
0
    global_field_1=5,
    global_field_2=10,
    pli={
        'SCN1A': 0.999,
        'SONIC': 0.014
    },
    populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(
    hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(
    use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv',
                            key='Sample',
                            impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(
    weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #14
0
    def test_export_plink_exprs(self):
        ds = get_dataset()
        fam_mapping = {
            'f0': 'fam_id',
            'f1': 'ind_id',
            'f2': 'pat_id',
            'f3': 'mat_id',
            'f4': 'is_female',
            'f5': 'pheno'
        }
        bim_mapping = {
            'f0': 'contig',
            'f1': 'varid',
            'f2': 'cm_position',
            'f3': 'position',
            'f4': 'a1',
            'f5': 'a2'
        }

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0")
                     & (fam1.mat_id == "0") & (fam1.is_female == "0")
                     & (fam1.pheno == "NA")))
        self.assertTrue(
            bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" +
                      bim1.a2 + ":" + bim1.a1) & (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds,
                        out2,
                        ind_id=ds.s,
                        fam_id=ds.s,
                        pat_id="nope",
                        mat_id="nada",
                        is_female=True,
                        pheno=False)
        fam2 = (hl.import_table(out2 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        self.assertTrue(
            fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope")
                     & (fam2.mat_id == "nada") & (fam2.is_female == "2")
                     & (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        self.assertTrue(
            fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0")
                     & (fam3.mat_id == "0") & (fam3.is_female == "0")
                     & (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim',
                                         out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(
            hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError,
                                    "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
Example #15
0
def sample_qc(vds: 'VariantDataset',
              *,
              gq_bins: 'Sequence[int]' = (0, 20, 60),
              dp_bins: 'Sequence[int]' = (0, 1, 10, 20, 30),
              dp_field=None) -> 'Table':
    """Compute sample quality metrics about a :class:`.VariantDataset`.

    If the `dp_field` parameter is not specified, the ``DP`` field is used for depth
    if present. If no ``DP`` field is present, the ``MIN_DP`` field is used. If no ``DP``
    or ``MIN_DP`` field is present, no depth statistics will be calculated.

    Parameters
    ----------
    vds : :class:`.VariantDataset`
        Dataset in VariantDataset representation.
    gq_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for genotype quality (GQ) scores.
    dp_bins : :class:`tuple` of :obj:`int`
        Tuple containing cutoffs for depth (DP) scores.
    dp_field : :obj:`str`
        Name of depth field. If not supplied, DP or MIN_DP will be used, in that order.

    Returns
    -------
    :class:`.Table`
        Hail Table of results, keyed by sample.
    """

    require_first_key_field_locus(vds.reference_data, 'sample_qc')
    require_first_key_field_locus(vds.variant_data, 'sample_qc')

    ref = vds.reference_data

    if 'DP' in ref.entry:
        ref_dp_field_to_use = 'DP'
    elif 'MIN_DP' in ref.entry:
        ref_dp_field_to_use = 'MIN_DP'
    else:
        ref_dp_field_to_use = dp_field

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

    def allele_type(ref, alt):
        return hl.bind(
            lambda at: hl.if_else(
                at == allele_ints['SNP'],
                hl.if_else(hl.is_transition(ref, alt), allele_ints[
                    'Transition'], allele_ints['Transversion']), at),
            _num_allele_type(ref, alt))

    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()

    vmt = vds.variant_data
    if 'GT' not in vmt.entry:
        vmt = vmt.annotate_entries(
            GT=hl.experimental.lgt_to_gt(vmt.LGT, vmt.LA))

    vmt = vmt.annotate_rows(
        **{
            variant_ac:
            hl.agg.call_stats(vmt.GT, vmt.alleles).AC,
            variant_atypes:
            vmt.alleles[1:].map(lambda alt: allele_type(vmt.alleles[0], alt))
        })

    bound_exprs = {}

    bound_exprs['n_het'] = hl.agg.count_where(vmt['GT'].is_het())
    bound_exprs['n_hom_var'] = hl.agg.count_where(vmt['GT'].is_hom_var())
    bound_exprs['n_singleton'] = hl.agg.sum(
        hl.rbind(
            vmt['GT'], lambda gt: hl.sum(
                hl.range(0, gt.ploidy).map(lambda i: hl.rbind(
                    gt[i], lambda gti:
                    (gti != 0) & (vmt[variant_ac][gti] == 1))))))

    bound_exprs['allele_type_counts'] = hl.agg.explode(
        lambda allele_type: hl.tuple(
            hl.agg.count_where(allele_type == i)
            for i in range(len(allele_ints))),
        (hl.range(0, vmt['GT'].ploidy).map(lambda i: vmt['GT'][i]).filter(
            lambda allele_idx: allele_idx > 0).map(
                lambda allele_idx: vmt[variant_atypes][allele_idx - 1])))

    dp_exprs = {}
    if ref_dp_field_to_use is not None and 'DP' in vmt.entry:
        dp_exprs['dp'] = hl.tuple(
            hl.agg.count_where(vmt.DP >= x) for x in dp_bins)

    gq_dp_exprs = hl.struct(
        **{'gq': hl.tuple(hl.agg.count_where(vmt.GQ >= x) for x in gq_bins)},
        **dp_exprs)

    result_struct = hl.rbind(
        hl.struct(**bound_exprs), lambda x: hl.rbind(
            hl.struct(
                **{
                    'gq_dp_exprs':
                    gq_dp_exprs,
                    'n_het':
                    x.n_het,
                    'n_hom_var':
                    x.n_hom_var,
                    'n_non_ref':
                    x.n_het + x.n_hom_var,
                    'n_singleton':
                    x.n_singleton,
                    'n_snp':
                    (x.allele_type_counts[allele_ints['Transition']] + x.
                     allele_type_counts[allele_ints['Transversion']]),
                    'n_insertion':
                    x.allele_type_counts[allele_ints['Insertion']],
                    'n_deletion':
                    x.allele_type_counts[allele_ints['Deletion']],
                    'n_transition':
                    x.allele_type_counts[allele_ints['Transition']],
                    'n_transversion':
                    x.allele_type_counts[allele_ints['Transversion']],
                    'n_star':
                    x.allele_type_counts[allele_ints['Star']]
                }), lambda s: s.annotate(r_ti_tv=divide_null(
                    hl.float64(s.n_transition), s.n_transversion),
                                         r_het_hom_var=divide_null(
                                             hl.float64(s.n_het), s.n_hom_var),
                                         r_insertion_deletion=divide_null(
                                             hl.float64(s.n_insertion), s.
                                             n_deletion))))
    variant_results = vmt.select_cols(**result_struct).cols()

    rmt = vds.reference_data

    ref_dp_expr = {}
    if ref_dp_field_to_use is not None:
        ref_dp_expr['ref_bases_over_dp_threshold'] = hl.tuple(
            hl.agg.filter(rmt[ref_dp_field_to_use] >= x,
                          hl.agg.sum(1 + rmt.END - rmt.locus.position))
            for x in dp_bins)
    ref_results = rmt.select_cols(ref_bases_over_gq_threshold=hl.tuple(
        hl.agg.filter(rmt.GQ >= x, hl.agg.sum(1 + rmt.END -
                                              rmt.locus.position))
        for x in gq_bins),
                                  **ref_dp_expr).cols()

    joined = ref_results[variant_results.key]

    joined_dp_expr = {}
    dp_bins_field = {}
    if ref_dp_field_to_use is not None:
        joined_dp_expr['bases_over_dp_threshold'] = hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.dp,
                                  joined.ref_bases_over_dp_threshold))
        dp_bins_field['dp_bins'] = hl.tuple(dp_bins)

    joined_results = variant_results.transmute(
        bases_over_gq_threshold=hl.tuple(
            x + y for x, y in zip(variant_results.gq_dp_exprs.gq,
                                  joined.ref_bases_over_gq_threshold)),
        **joined_dp_expr)

    joined_results = joined_results.annotate_globals(gq_bins=hl.tuple(gq_bins),
                                                     **dp_bins_field)
    return joined_results
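A small self-contained sketch of the per-bin counting pattern used above (the toy GQ values are made up): for each cutoff in `gq_bins`, count how many records have GQ at or above it.

import hail as hl

# Toy data: five records with made-up GQ values.
gq_bins = (0, 20, 60)
ht = hl.utils.range_table(5)
ht = ht.annotate(GQ=hl.literal([10, 25, 70, 0, 60])[ht.idx])
counts = ht.aggregate(hl.tuple([hl.agg.count_where(ht.GQ >= x) for x in gq_bins]))
print(counts)  # (5, 3, 2)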
Example #16
0
def maximal_independent_set(i,
                            j,
                            keep=True,
                            tie_breaker=None,
                            keyed=True) -> Table:
    """Return a table containing the vertices in a near
    `maximal independent set <https://en.wikipedia.org/wiki/Maximal_independent_set>`_
    of an undirected graph whose edges are given by a two-column table.

    Examples
    --------
    Run PC-relate and compute pairs of closely related individuals:

    >>> pc_rel = hl.pc_relate(dataset.GT, 0.001, k=2, statistics='kin')
    >>> pairs = pc_rel.filter(pc_rel['kin'] > 0.125)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain:

    >>> related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j, False)
    >>> result = dataset.filter_cols(
    ...     hl.is_defined(related_samples_to_remove[dataset.col_key]), keep=False)

    Starting from the above pairs, prune individuals from a dataset until no
    close relationships remain, preferring to keep cases over controls:

    >>> samples = dataset.cols()
    >>> pairs_with_case = pairs.key_by(
    ...     i=hl.struct(id=pairs.i, is_case=samples[pairs.i].is_case),
    ...     j=hl.struct(id=pairs.j, is_case=samples[pairs.j].is_case))
    >>> def tie_breaker(l, r):
    ...     return hl.cond(l.is_case & ~r.is_case, -1,
    ...                    hl.cond(~l.is_case & r.is_case, 1, 0))
    >>> related_samples_to_remove = hl.maximal_independent_set(
    ...    pairs_with_case.i, pairs_with_case.j, False, tie_breaker)
    >>> result = dataset.filter_cols(hl.is_defined(
    ...     related_samples_to_remove.key_by(
    ...        s = related_samples_to_remove.node.id.s)[dataset.col_key]), keep=False)

    Notes
    -----

    The vertex set of the graph is implicitly all the values realized by `i`
    and `j` on the rows of this table. Each row of the table corresponds to an
    undirected edge between the vertices given by evaluating `i` and `j` on
    that row. An undirected edge may appear multiple times in the table and
    will not affect the output. Vertices with self-edges are removed as they
    are not independent of themselves.

    The expressions for `i` and `j` must have the same type.

    The value of `keep` determines whether the vertices returned are those
    in the maximal independent set, or those in the complement of this set.
    This is useful if you need to filter a table without removing vertices that
    don't appear in the graph at all.

    This method implements a greedy algorithm which iteratively removes a
    vertex of highest degree until the graph contains no edges. The greedy
    algorithm always returns an independent set, but that set is not
    guaranteed to be maximal.

    `tie_breaker` is a Python function taking two arguments---say `l` and
    `r`---each of which is an :class:`Expression` of the same type as `i` and
    `j`. `tie_breaker` returns a :class:`NumericExpression`, which defines an
    ordering on nodes. A pair of nodes can be ordered in one of three ways, and
    `tie_breaker` must encode the relationship as follows:

     - if ``l < r`` then ``tie_breaker`` evaluates to some negative integer
     - if ``l == r`` then ``tie_breaker`` evaluates to 0
     - if ``l > r`` then ``tie_breaker`` evaluates to some positive integer

    For example, the usual ordering on the integers is defined by: ``l - r``.

    The `tie_breaker` function must satisfy the following property:
    ``tie_breaker(l, r) == -tie_breaker(r, l)``.

    When multiple nodes have the same degree, this algorithm will order the
    nodes according to ``tie_breaker`` and remove the *largest* node.

    If `keyed` is ``False``, then a node may appear twice in the resulting
    table.

    Parameters
    ----------
    i : :class:`.Expression`
        Expression to compute one endpoint of an edge.
    j : :class:`.Expression`
        Expression to compute another endpoint of an edge.
    keep : :obj:`bool`
        If ``True``, return vertices in set. If ``False``, return vertices removed.
    tie_breaker : function
        Function used to order nodes with equal degree.
    keyed : :obj:`bool`
        If ``True``, key the resulting table by the `node` field; this requires
        a sort.

    Returns
    -------
    :class:`.Table`
        Table with the set of independent vertices. The table schema is one row
        field, `node`, with the same type as the input expressions `i` and `j`.
    """

    if i.dtype != j.dtype:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to have same type. "
            "Found {} and {}.".format(i.dtype, j.dtype))

    source = i._indices.source
    if not isinstance(source, Table):
        raise ValueError(
            "'maximal_independent_set' expects an expression of 'Table'. Found {}"
            .format("expression of '{}'".format(source.__class__)
                    if source is not None else 'scalar expression'))

    if i._indices.source != j._indices.source:
        raise ValueError(
            "'maximal_independent_set' expects arguments `i` and `j` to be expressions of the same Table. "
            "Found\n{}\n{}".format(i, j))

    node_t = i.dtype

    if tie_breaker:
        wrapped_node_t = ttuple(node_t)
        l = construct_variable('l', wrapped_node_t)
        r = construct_variable('r', wrapped_node_t)
        tie_breaker_expr = hl.float64(tie_breaker(l[0], r[0]))
        t, _ = source._process_joins(i, j, tie_breaker_expr)
        tie_breaker_str = str(tie_breaker_expr._ir)
    else:
        t, _ = source._process_joins(i, j)
        tie_breaker_str = None

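    # Checkpoint the two-column edge table to a temporary file so that both the
    # independent-set computation and the node table built below read the same
    # materialized edges instead of recomputing them.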
    edges = t.select(__i=i, __j=j).key_by().select('__i', '__j')
    edges_path = new_temp_file()
    edges.write(edges_path)
    edges = hl.read_table(edges_path)

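    # The independent set itself is computed on the JVM via the Graph utility;
    # the result is wrapped back into a Hail set expression of the node type.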
    mis_nodes = construct_expr(
        JavaIR(Env.hail().utils.Graph.pyMaximalIndependentSet(
            Env.spark_backend('maximal_independent_set')._to_java_ir(
                edges.collect(_localize=False)._ir), node_t._parsable_string(),
            joption(tie_breaker_str))), hl.tset(node_t))

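    # Explode both edge endpoints into a single `node` column, then keep (or drop,
    # depending on `keep`) the nodes that belong to the independent set.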
    nodes = edges.select(node=[edges.__i, edges.__j])
    nodes = nodes.explode(nodes.node)
    nodes = nodes.annotate_globals(mis_nodes=mis_nodes)
    nodes = nodes.filter(nodes.mis_nodes.contains(nodes.node), keep)
    nodes = nodes.select_globals()
    if keyed:
        return nodes.key_by('node').distinct()
    return nodes
Example #17
0
File: qc.py Project: tpoterba/hail
def sample_qc(mt, name='sample_qc') -> MatrixTable:
    """Compute per-sample metrics useful for quality control.

    .. include:: ../_templates/req_tvariant.rst

    Examples
    --------

    Compute sample QC metrics and remove low-quality samples:

    >>> dataset = hl.sample_qc(dataset, name='sample_qc')
    >>> filtered_dataset = dataset.filter_cols((dataset.sample_qc.dp_stats.mean > 20) & (dataset.sample_qc.r_ti_tv > 1.5))

    Notes
    -----

    This method computes summary statistics per sample from a genetic matrix and stores
    the results as a new column-indexed struct field in the matrix, named based on the
    `name` parameter.

    If `mt` contains an entry field `DP` of type :py:data:`.tint32`, then the
    field `dp_stats` is computed. If `mt` contains an entry field `GQ` of type
    :py:data:`.tint32`, then the field `gq_stats` is computed. Both `dp_stats`
    and `gq_stats` are structs with four fields:

    - `mean` (``float64``) -- Mean value.
    - `stdev` (``float64``) -- Standard deviation (zero degrees of freedom).
    - `min` (``int32``) -- Minimum value.
    - `max` (``int32``) -- Maximum value.

    If the dataset does not contain an entry field `GT` of type
    :py:data:`.tcall`, then an error is raised. The following fields are always
    computed from `GT`:

    - `call_rate` (``float64``) -- Fraction of calls non-missing.
    - `n_called` (``int64``) -- Number of non-missing calls.
    - `n_not_called` (``int64``) -- Number of missing calls.
    - `n_hom_ref` (``int64``) -- Number of homozygous reference calls.
    - `n_het` (``int64``) -- Number of heterozygous calls.
    - `n_hom_var` (``int64``) -- Number of homozygous alternate calls.
    - `n_non_ref` (``int64``) -- Sum of ``n_het`` and ``n_hom_var``.
    - `n_snp` (``int64``) -- Number of SNP alternate alleles.
    - `n_insertion` (``int64``) -- Number of insertion alternate alleles.
    - `n_deletion` (``int64``) -- Number of deletion alternate alleles.
    - `n_singleton` (``int64``) -- Number of private alleles.
    - `n_transition` (``int64``) -- Number of transition (A-G, C-T) alternate alleles.
    - `n_transversion` (``int64``) -- Number of transversion alternate alleles.
    - `n_star` (``int64``) -- Number of star (upstream deletion) alleles.
    - `r_ti_tv` (``float64``) -- Transition/Transversion ratio.
    - `r_het_hom_var` (``float64``) -- Het/HomVar call ratio.
    - `r_insertion_deletion` (``float64``) -- Insertion/Deletion allele ratio.

    Missing values ``NA`` may result from division by zero.

    Parameters
    ----------
    mt : :class:`.MatrixTable`
        Dataset.
    name : :obj:`str`
        Name for resulting field.

    Returns
    -------
    :class:`.MatrixTable`
        Dataset with a new column-indexed field `name`.
    """

    require_row_key_variant(mt, 'sample_qc')

    from hail.expr.functions import _num_allele_type, _allele_types

    allele_types = _allele_types[:]
    allele_types.extend(['Transition', 'Transversion'])
    allele_enum = {i: v for i, v in enumerate(allele_types)}
    allele_ints = {v: k for k, v in allele_enum.items()}

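    # Refine the generic allele type: SNPs are split into transitions and
    # transversions; all other allele types pass through unchanged.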
    def allele_type(ref, alt):
        return hl.bind(lambda at: hl.cond(at == allele_ints['SNP'],
                                          hl.cond(hl.is_transition(ref, alt),
                                                  allele_ints['Transition'],
                                                  allele_ints['Transversion']),
                                          at),
                       _num_allele_type(ref, alt))

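    # Annotate each row once with its allele counts and per-alt allele types so
    # the per-sample aggregations below can reuse them.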
    variant_ac = Env.get_uid()
    variant_atypes = Env.get_uid()
    mt = mt.annotate_rows(**{variant_ac: hl.agg.call_stats(mt.GT, mt.alleles).AC,
                             variant_atypes: mt.alleles[1:].map(lambda alt: allele_type(mt.alleles[0], alt))})

    exprs = {}

    def has_field_of_type(name, dtype):
        return name in mt.entry and mt[name].dtype == dtype

    if has_field_of_type('DP', hl.tint32):
        exprs['dp_stats'] = hl.agg.stats(mt.DP).select('mean', 'stdev', 'min', 'max')

    if has_field_of_type('GQ', hl.tint32):
        exprs['gq_stats'] = hl.agg.stats(mt.GQ).select('mean', 'stdev', 'min', 'max')

    if not has_field_of_type('GT', hl.tcall):
        raise ValueError("'sample_qc': expects an entry field 'GT' of type 'call'")

    exprs['n_called'] = hl.agg.count_where(hl.is_defined(mt['GT']))
    exprs['n_not_called'] = hl.agg.count_where(hl.is_missing(mt['GT']))
    exprs['n_hom_ref'] = hl.agg.count_where(mt['GT'].is_hom_ref())
    exprs['n_het'] = hl.agg.count_where(mt['GT'].is_het())
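    # A genotype contributes one singleton for each of its alleles whose
    # dataset-wide allele count is exactly 1.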
    exprs['n_singleton'] = hl.agg.sum(hl.sum(hl.range(0, mt['GT'].ploidy).map(lambda i: mt[variant_ac][mt['GT'][i]] == 1)))

    def get_allele_type(allele_idx):
        return hl.cond(allele_idx > 0, mt[variant_atypes][allele_idx - 1], hl.null(hl.tint32))

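    # Tally genotype alleles by refined allele type (reference alleles map to a
    # missing type).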
    exprs['allele_type_counts'] = hl.agg.explode(
        lambda elt: hl.agg.counter(elt),
        hl.range(0, mt['GT'].ploidy).map(lambda i: get_allele_type(mt['GT'][i])))

    mt = mt.annotate_cols(**{name: hl.struct(**exprs)})

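    # Default for counter lookups: a 64-bit zero keeps the summed counts as int64.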
    zero = hl.int64(0)

    select_exprs = {}
    if 'dp_stats' in exprs:
        select_exprs['dp_stats'] = mt[name].dp_stats
    if 'gq_stats' in exprs:
        select_exprs['gq_stats'] = mt[name].gq_stats

    select_exprs = {
        **select_exprs,
        'call_rate': hl.float64(mt[name].n_called) / (mt[name].n_called + mt[name].n_not_called),
        'n_called': mt[name].n_called,
        'n_not_called': mt[name].n_not_called,
        'n_hom_ref': mt[name].n_hom_ref,
        'n_het': mt[name].n_het,
        'n_hom_var': mt[name].n_called - mt[name].n_hom_ref - mt[name].n_het,
        'n_non_ref': mt[name].n_called - mt[name].n_hom_ref,
        'n_singleton': mt[name].n_singleton,
        'n_snp': mt[name].allele_type_counts.get(allele_ints["Transition"], zero) +
                 mt[name].allele_type_counts.get(allele_ints["Transversion"], zero),
        'n_insertion': mt[name].allele_type_counts.get(allele_ints["Insertion"], zero),
        'n_deletion': mt[name].allele_type_counts.get(allele_ints["Deletion"], zero),
        'n_transition': mt[name].allele_type_counts.get(allele_ints["Transition"], zero),
        'n_transversion': mt[name].allele_type_counts.get(allele_ints["Transversion"], zero),
        'n_star': mt[name].allele_type_counts.get(allele_ints["Star"], zero)
    }

    mt = mt.annotate_cols(**{name: mt[name].select(**select_exprs)})

    mt = mt.annotate_cols(**{name: mt[name].annotate(
        r_ti_tv=divide_null(hl.float64(mt[name].n_transition), mt[name].n_transversion),
        r_het_hom_var=divide_null(hl.float64(mt[name].n_het), mt[name].n_hom_var),
        r_insertion_deletion=divide_null(hl.float64(mt[name].n_insertion), mt[name].n_deletion)
    )})

    mt = mt.drop(variant_ac, variant_atypes)

    return mt
Example #18
0
def merge_sample_qc_expr(
    sample_qc_exprs: List[hl.expr.StructExpression],
) -> hl.expr.StructExpression:
    """
    Creates an expression that merges results from non-overlapping strata of hail.sample_qc.

    E.g.:

    - Compute autosomes and sex chromosomes metrics separately, then merge results
    - Compute bi-allelic and multi-allelic metrics separately, then merge results

    Note regarding the merging of ``dp_stats`` and ``gq_stats``:
    Because ``n`` is needed to aggregate ``stdev``, ``n_called`` is used for this purpose.
    This should work very well on a standard GATK VCF and it essentially assumes that:

    - samples that are called have `DP` and `GQ` fields
    - samples that are not called do not have `DP` and `GQ` fields

    Even if these assumptions are broken for some genotypes, it shouldn't matter too much.

    :param sample_qc_exprs: List of sample QC struct expressions for each stratification
    :return: Combined sample QC results
    """

    # List of metrics that can be aggregated by summing
    additive_metrics = [
        "n_called",
        "n_not_called",
        "n_filtered",
        "n_hom_ref",
        "n_het",
        "n_hom_var",
        "n_snp",
        "n_insertion",
        "n_deletion",
        "n_singleton",
        "n_transition",
        "n_transversion",
        "n_star",
    ]

    # List of metrics that are ratios of summed metrics (name, numerator, denominator)
    ratio_metrics = [
        ("call_rate", "n_called", "n_not_called"),
        ("r_ti_tv", "n_transition", "n_transversion"),
        ("r_het_hom_var", "n_het", "n_hom_var"),
        ("r_insertion_deletion", "n_insertion", "n_deletion"),
    ]

    # List of metrics that are struct generated by a stats counter
    stats_metrics = ["gq_stats", "dp_stats"]

    # Gather metrics present in sample qc fields
    sample_qc_fields = set(sample_qc_exprs[0])
    for sample_qc_expr in sample_qc_exprs[1:]:
        sample_qc_fields = sample_qc_fields.union(set(sample_qc_expr))

    # Merge additive metrics in sample qc fields
    merged_exprs = {
        metric: hl.sum([sample_qc_expr[metric] for sample_qc_expr in sample_qc_exprs])
        for metric in additive_metrics
        if metric in sample_qc_fields
    }

    # Merge ratio metrics in sample qc fields
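    # Ratios are recomputed from the merged sums rather than averaged across strata.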
    merged_exprs.update({
        metric: hl.float64(divide_null(merged_exprs[nom], merged_exprs[denom]))
        for metric, nom, denom in ratio_metrics
        if nom in sample_qc_fields and denom in sample_qc_fields
    })

    # Merge stats counter metrics in sample qc fields
    # Use n_called as n for DP and GQ stats
    if "n_called" in sample_qc_fields:
        merged_exprs.update({
            metric: merge_stats_counters_expr([
                sample_qc_expr[metric].annotate(n=sample_qc_expr.n_called)
                for sample_qc_expr in sample_qc_exprs
            ]).drop("n")
            for metric in stats_metrics if metric in sample_qc_fields
        })

    return hl.struct(**merged_exprs)
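
A minimal usage sketch (not part of the original example): it assumes `merge_sample_qc_expr` is importable along with the `divide_null` and `merge_stats_counters_expr` helpers it relies on, and it uses a simulated Hail dataset, so only the GT-derived metrics are present. The even/odd position split is arbitrary and stands in for a real stratification such as autosomes versus sex chromosomes.

import hail as hl

# Simulated dataset: only GT entries exist, so dp_stats/gq_stats are simply
# absent and merge_sample_qc_expr skips them.
mt = hl.balding_nichols_model(n_populations=3, n_samples=50, n_variants=1000)

# Compute sample QC separately on two non-overlapping strata of rows.
even_qc = hl.sample_qc(mt.filter_rows(mt.locus.position % 2 == 0)).cols()
odd_qc = hl.sample_qc(mt.filter_rows(mt.locus.position % 2 == 1)).cols()

# Additive metrics are summed across the two strata; ratio metrics are
# recomputed from the merged sums.
merged = even_qc.select(
    merged_sample_qc=merge_sample_qc_expr(
        [even_qc.sample_qc, odd_qc[even_qc.key].sample_qc]))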