def test_multi_write(self):
    mt = self.get_vds()
    f = new_temp_file()
    hl.experimental.write_matrix_tables([mt, mt], f)
    path1 = f + '0.mt'
    path2 = f + '1.mt'
    mt1 = hl.read_matrix_table(path1)
    mt2 = hl.read_matrix_table(path2)
    self.assertTrue(mt._same(mt1))
    self.assertTrue(mt._same(mt2))
    self.assertTrue(mt1._same(mt2))
def test_backward_compatability(self):
    import os

    all_values_table, all_values_matrix_table = create_all_values_datasets()

    table_dir = resource('backward_compatability/1.0.0/table')
    matrix_table_dir = resource('backward_compatability/1.0.0/matrix_table')

    n = 0

    i = 0
    f = os.path.join(table_dir, '{}.ht'.format(i))
    while os.path.exists(f):
        ds = hl.read_table(f)
        self.assertTrue(ds._same(all_values_table))
        i += 1
        f = os.path.join(table_dir, '{}.ht'.format(i))
        n += 1

    i = 0
    f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
    while os.path.exists(f):
        ds = hl.read_matrix_table(f)
        self.assertTrue(ds._same(all_values_matrix_table))
        i += 1
        f = os.path.join(matrix_table_dir, '{}.hmt'.format(i))
        n += 1

    self.assertEqual(n, 8)
def test_write_stage_locally(self):
    mt = self.get_vds()
    f = new_temp_file(suffix='mt')
    mt.write(f, stage_locally=True)
    mt2 = hl.read_matrix_table(f)
    self.assertTrue(mt._same(mt2))
def test_codecs_matrix(self):
    from hail.utils.java import scala_object
    codecs = scala_object(Env.hail().io, 'CodecSpec').codecSpecs()
    ds = self.get_vds()
    temp = new_temp_file(suffix='hmt')
    for codec in codecs:
        ds.write(temp, overwrite=True, _codec_spec=codec.toString())
        ds2 = hl.read_matrix_table(temp)
        self.assertTrue(ds._same(ds2))
def test_fix3307_read_mt_wrong(self):
    mt = hl.import_vcf(resource('sample2.vcf'))
    mt = hl.split_multi_hts(mt)
    mt.write('/tmp/foo.mt', overwrite=True)
    mt2 = hl.read_matrix_table('/tmp/foo.mt')
    t = hl.read_table('/tmp/foo.mt/rows')
    self.assertTrue(mt.rows()._same(t))
    self.assertTrue(mt2.rows()._same(t))
    self.assertTrue(mt._same(mt2))
def run_combiner(sample_list, json, out_path, tmp_path, summary_path=None, overwrite=False):
    import gc
    # make the temp path a directory, no matter what
    tmp_path += f'/combiner-temporary/{uuid.uuid4()}/'
    vcfs = [comb.transform_one(vcf)
            for vcf in hl.import_vcfs(sample_list, json, array_elements_required=False)]
    combined = [comb.combine_gvcfs(mts) for mts in chunks(vcfs, MAX_COMBINER_LENGTH)]
    if len(combined) == 1:
        combined[0].write(out_path, overwrite=overwrite)
    else:
        hl.utils.java.info(f'Writing combiner temporary files to: {tmp_path}')
        i = 0
        while len(combined) > 1:
            pad = len(str(len(combined)))
            hl.experimental.write_matrix_tables(combined, tmp_path + f'{i}/', overwrite=True)
            paths = [tmp_path + f'{i}/' + str(n).zfill(pad) + '.mt'
                     for n in range(len(combined))]
            i += 1
            wmts = [hl.read_matrix_table(path) for path in paths]
            combined = [comb.combine_gvcfs(mts) for mts in chunks(wmts, MAX_COMBINER_LENGTH)]
            gc.collect()  # need to try to free memory on the master
        combined[0].write(out_path, overwrite=overwrite)
    if summary_path is not None:
        mt = hl.read_matrix_table(out_path)
        comb.summarize(mt).rows().write(summary_path, overwrite=overwrite)
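# A minimal usage sketch for run_combiner above (not from the source): every
# path, the gVCF list, and the partitions/header JSON below are hypothetical
# placeholders used only to show how the parameters fit together.
gvcf_paths = [
    'gs://my-bucket/gvcfs/sample1.g.vcf.bgz',   # hypothetical inputs
    'gs://my-bucket/gvcfs/sample2.g.vcf.bgz',
]
run_combiner(gvcf_paths,
             json='gs://my-bucket/gvcfs/partitions.json',      # hypothetical
             out_path='gs://my-bucket/combined.mt',
             tmp_path='gs://my-bucket/tmp',
             summary_path='gs://my-bucket/combined_summary.ht',
             overwrite=True)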
def test_head(self):
    # no empty partitions
    mt1 = hl.utils.range_matrix_table(10, 10)

    # empty partitions at front
    mt2 = hl.utils.range_matrix_table(20, 10, 20)
    mt2 = mt2.filter_rows(mt2.row_idx > 9)
    mts = [mt1, mt2]

    for mt in mts:
        tmp_file = new_temp_file(suffix='mt')
        mt.write(tmp_file)
        mt_readback = hl.read_matrix_table(tmp_file)
        for mt_ in [mt, mt_readback]:
            assert mt_.head(1).count_rows() == 1
            assert mt_.head(1)._force_count_rows() == 1
            assert mt_.head(100).count_rows() == 10
            assert mt_.head(100)._force_count_rows() == 10
def download_data():
    global _initialized, _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), ['profile.vcf.bgz', 'profile.mt'])
    if not all(os.path.exists(file) for file in files):
        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        # vcf is already an absolute path under _data_dir, so use it directly
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
                    vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf).write(os.path.join(_data_dir, 'profile.mt'))
        print('done', flush=True)
    else:
        print('all files found.', flush=True)

    _initialized = True
    _mt = hl.read_matrix_table(resource('profile.mt'))
def split_multi_hts():
    mt = hl.read_matrix_table(resource('profile.mt'))
    hl.split_multi_hts(mt)._force_count_rows()
def test_ld_score_regression(self):

    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_50_irnt = hl.import_table(
        doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_50_irnt = ht_50_irnt.annotate(
        chi_squared=ht_50_irnt['Z']**2,
        n=ht_50_irnt['N'],
        ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                       ht_scores[ht_50_irnt['SNP']]['BP']),
        alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
        phenotype='50_irnt')

    ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'],
                                   ht_50_irnt['alleles'])

    ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                   ht_50_irnt['n'],
                                   ht_50_irnt['ld_score'],
                                   ht_50_irnt['phenotype'])

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_20160 = ht_20160.annotate(
        chi_squared=ht_20160['Z']**2,
        n=ht_20160['N'],
        ld_score=ht_scores[ht_20160['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                       ht_scores[ht_20160['SNP']]['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
        phenotype='20160')

    ht_20160 = ht_20160.key_by(ht_20160['locus'],
                               ht_20160['alleles'])

    ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                               ht_20160['n'],
                               ht_20160['ld_score'],
                               ht_20160['phenotype'])

    ht = ht_50_irnt.union(ht_20160)
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['phenotype'],
                            row_fields=['ld_score'],
                            col_fields=[])

    mt_tmp = new_temp_file()
    mt.write(mt_tmp, overwrite=True)
    mt = hl.read_matrix_table(mt_tmp)

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=mt['ld_score'],
        ld_score_expr=mt['ld_score'],
        chi_sq_exprs=mt['chi_squared'],
        n_samples_exprs=mt['n'],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(
        results['50_irnt']['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(
        results['50_irnt']['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(
        results['50_irnt']['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(
        results['50_irnt']['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(
        results['50_irnt']['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(
        results['20160']['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(
        results['20160']['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(
        results['20160']['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(
        results['20160']['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(
        results['20160']['snp_heritability_standard_error'], 0.0416, places=4)

    ht = ht_50_irnt.annotate(
        chi_squared_50_irnt=ht_50_irnt['chi_squared'],
        n_50_irnt=ht_50_irnt['n'],
        chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
        n_20160=ht_20160[ht_50_irnt.key]['n'])

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=ht['ld_score'],
        ld_score_expr=ht['ld_score'],
        chi_sq_exprs=[ht['chi_squared_50_irnt'],
                      ht['chi_squared_20160']],
        n_samples_exprs=[ht['n_50_irnt'],
                         ht['n_20160']],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(
        results[0]['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(
        results[0]['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(
        results[0]['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(
        results[0]['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(
        results[0]['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(
        results[1]['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(
        results[1]['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(
        results[1]['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(
        results[1]['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(
        results[1]['snp_heritability_standard_error'], 0.0416, places=4)
def matrix_table_rows_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.rows().show(100)
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])

    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                   ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----

    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used to derive
        variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
        Row-indexed expression for the LD scores used as covariates
        in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions for chi-squared statistics resulting from genome-wide
        association studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`
        One or more row-indexed (if table) or entry-indexed (if matrix table)
        expressions indicating the number of samples used in the studies that
        generated the test statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
        The number of blocks used in the jackknife approach to estimating
        standard errors.
    two_step_threshold : :obj:`int`
        Variants with chi-squared statistics greater than this value are
        excluded in the first step of the two-step procedure used to fit
        the model.
    n_reference_panel_variants : :obj:`int`, optional
        Number of variants used to estimate the SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices)
    analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0], ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0], ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr, 1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr, 1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial, 1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)

    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                hl.sum(
                    hl.map(lambda x: x[i],
                           mt.__step1_block_betas_bias_corrected))**2 /
                n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] +
                                               mt.__step2_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)

    return ht
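# The block-jackknife algebra used above for __step1_jackknife_variance and
# __final_jackknife_variance is easier to see outside of Hail expressions.
# A minimal NumPy sketch for illustration only (not part of the source):
# given the whole-data estimate and the leave-one-block-out estimates, form
# bias-corrected pseudovalues and take the variance of their mean.
import numpy as np

def jackknife_mean_and_variance(theta_hat, block_thetas):
    """theta_hat: estimate from all blocks; block_thetas: leave-one-block-out estimates."""
    block_thetas = np.asarray(block_thetas, dtype=float)
    n = block_thetas.shape[0]
    # bias-corrected pseudovalues, matching n_blocks * beta - (n_blocks - 1) * block_beta
    pseudovalues = n * theta_hat - (n - 1) * block_thetas
    mean = pseudovalues.mean()
    # same formula as (sum(pv**2) - sum(pv)**2 / n_blocks) / (n_blocks - 1) / n_blocks
    variance = (np.sum(pseudovalues**2) - np.sum(pseudovalues)**2 / n) / (n - 1) / n
    return mean, variance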
def matrix_table_entries_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.entries().show()
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    # save relatedness estimates for pc_relate global populations
    ht = hl.read_table(PC_RELATE_ESTIMATE_GLOBAL)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_global = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_global_matrix.csv', 'analysis')
    pc_relate_global.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path(
        'pc_relate_global_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for pc_relate NFE samples
    ht = hl.read_table(PC_RELATE_ESTIMATE_NFE)
    related_samples = ht.filter(ht.kin > 0.1)
    pc_relate_nfe = pd.DataFrame({
        'i_s': related_samples.i.s.collect(),
        'j_s': related_samples.j.s.collect(),
        'kin': related_samples.kin.collect(),
    })
    filename = output_path('pc_relate_nfe_matrix.csv', 'analysis')
    pc_relate_nfe.to_csv(filename, index=False)

    # get maximal independent set
    pairs = ht.filter(ht['kin'] >= 0.125)
    related_samples_to_remove = hl.maximal_independent_set(
        pairs.i, pairs.j, False)
    related_samples = pd.DataFrame(
        {'removed_individual': related_samples_to_remove.node.s.collect()})
    filename = output_path(
        'pc_relate_nfe_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)

    # save relatedness estimates for KING NFE samples
    mt = hl.read_matrix_table(KING_ESTIMATE_NFE)
    ht = mt.entries()
    # remove entries where samples are identical, then keep related pairs
    related_samples = ht.filter(ht.s_1 != ht.s)
    related_samples = related_samples.filter(related_samples.phi > 0.1)
    king_nfe = pd.DataFrame({
        'i_s': related_samples.s_1.collect(),
        'j_s': related_samples.s.collect(),
        'kin': related_samples.phi.collect(),
    })
    filename = output_path('king_nfe_matrix_90k.csv', 'analysis')
    king_nfe.to_csv(filename, index=False)

    # save KING NFE maximal independent set
    second_degree_related_samples = ht.filter(
        (ht.s_1 != ht.s) & (ht.phi > 0.125), keep=True)
    struct = hl.struct(
        i=second_degree_related_samples.s_1,
        j=second_degree_related_samples.s)
    struct = struct.annotate(phi=second_degree_related_samples.phi)
    related_samples_to_remove = hl.maximal_independent_set(
        struct.i, struct.j, False  # pylint: disable=E1101
    )
    related_samples = pd.DataFrame(
        {'related_individual': related_samples_to_remove.node.collect()})
    filename = output_path(
        'king_90k_related_samples_maximal_independent_set.csv', 'analysis')
    related_samples.to_csv(filename, index=False)
ht_aj_samples = hl.import_table(AJ_LIST, no_header=True, key='f0')
ht_pca_samples = hl.import_table(PCA_LIST, no_header=True, key='f0')
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')

ht_initial_variants = hl.import_table(
    INITIAL_VARIANT_LIST,
    types={'locus': hl.tlocus(reference_genome='GRCh38'),
           'alleles': hl.tarray(hl.tstr)})
ht_initial_variants = ht_initial_variants.key_by(
    ht_initial_variants.locus, ht_initial_variants.alleles)

mt = hl.read_matrix_table(MT)
mt = mt.filter_cols(hl.is_defined(ht_pca_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_aj_samples[mt.col_key]))
mt = mt.filter_cols(~hl.is_defined(ht_ibd_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_initial_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])

mt = hl.variant_qc(mt, name='qc')

mt = mt.annotate_rows(qc=mt.qc.annotate(
    AC=mt.qc.AC[1],
    AF=mt.qc.AF[1],
    homozygote_count=mt.qc.homozygote_count[1]))
def group_by_collect_per_row(path):
    ht = hl.read_matrix_table(path).localize_entries('e', 'c')
    ht.group_by(*ht.key).aggregate(value=hl.agg.collect(ht.row_value))._force_count()
# Add file handler
fh = logging.FileHandler(args.log_file)
fh.setLevel(logging.INFO)
fh.setFormatter(formatter)
root.addHandler(fh)

# Add streaming handler
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(logging.INFO)
ch.setFormatter(formatter)
root.addHandler(ch)

########################
# Read in matrix table #
########################
qcd_mt = hl.read_matrix_table(args.mt)

###########################################
# Validate parent/offspring relationships #
###########################################
validated_fam = validate_pedigree(args.fam, args.kin, args)

########################
# Find denovo variants #
########################
denovo_table = get_denovos(validated_fam, qcd_mt, args)

denovo_table.write(args.output_stem + "_denovo_variants.ht", overwrite=True)
denovo_table = denovo_table.flatten()
denovo_table = denovo_table.drop("vep.input")
if __name__ == "__main__":
    # need to create a Spark cluster first before initialising Hail
    sc = pyspark.SparkContext()
    # Define the Hail persistent storage directory
    tmp_dir = "hdfs://spark-master:9820/"
    temp_dir = "file:///home/ubuntu/data/tmp"
    plot_dir = "/home/ubuntu/data/tmp"
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # s3 credentials required for user to access the datasets in farm flexible compute s3 environment
    # you may use your own here from your .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    # read matrixtable (remove the '#' to use the commented-out input instead)
    # mt = hl.read_matrix_table(
    #     f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts_split_multi.mt')
    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_chr1-7and20_split.mt'
    )

    samples_to_remove_filename = f"{temp_dir}/ddd-elgh-ukbb/filtering/samples_failed_QC.tsv"
    samples_to_remove = hl.import_table(samples_to_remove_filename).key_by('s')

    mt_filtered = mt.filter_cols(hl.is_defined(samples_to_remove[mt.s]), keep=False)
    mt_filtered.write(
        f'{tmp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-7and20_split_sampleqc_filtered.mt'
    )
def write_profile_mt(mt_path):
    with TemporaryDirectory() as tmpdir:
        hl.read_matrix_table(mt_path).write(path.join(tmpdir, 'tmp.mt'))
def matrix_table_take_col(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.s.take(100)
def matrix_table_take_row(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.info.AF.take(100)
def matrix_table_rows_force_count(mt_path):
    ht = hl.read_matrix_table(mt_path).rows().key_by()
    ht._force_count()
def group_by_take_rekey(path):
    ht = hl.read_matrix_table(path).localize_entries('e', 'c')
    ht.group_by(k=hl.int(ht.row_idx / 50)).aggregate(value=hl.agg.take(ht.row_value, 1))._force_count()
def matrix_table_entries_table(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.entries()._force_count()
def main(args):
    n_partitions = 500

    # ANNOTATION TABLES:
    truth_data_ht = hl.read_table(args.truthset_table)
    trio_stats_table = hl.read_table(args.trio_stats_table)
    # inbreeding_ht = hl.read_table(f'{temp_dir}/ddd-elgh-ukbb/variant_qc/Sanger_cohorts_inbreeding.ht')
    allele_data_ht = hl.read_table(args.allele_data)
    allele_counts_ht = hl.read_table(args.allele_count)
    allele_counts_ht = allele_counts_ht.select(
        *['ac_qc_samples_raw', 'ac_qc_samples_adj'])
    inbreeding_ht = hl.read_table(args.inbreeding)
    group = "raw"

    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.key_rows_by('locus').distinct_by_row().key_rows_by(
        'locus', 'alleles')
    # mt = mt.select_entries(
    #     GT=hl.unphased_diploid_gt_index_call(mt.GT.n_alt_alleles()))
    # mt = mt.annotate_rows(InbreedingCoeff=hl.or_missing(
    #     ~hl.is_nan(mt.info.InbreedingCoeff), mt.info.InbreedingCoeff))
    ht = mt.rows()
    ht = ht.transmute(**ht.info)
    # ht = ht.select("FS", "MQ", "QD", "InbreedingCoeff", *INFO_FEATURES)

    trio_stats_ht = trio_stats_table.select(f"n_transmitted_{group}",
                                            f"ac_children_{group}")

    ht = ht.annotate(
        **inbreeding_ht[ht.key],
        **trio_stats_ht[ht.key],
        **truth_data_ht[ht.key],
        **allele_data_ht[ht.key].allele_data,
        **allele_counts_ht[ht.key],
    )

    # Filter to only variants found in high quality samples or controls with no LowQual filter
    # ht = ht.filter(
    #     (ht[f"ac_children_{group}"] > 0)
    # )
    # TODO: change to AS_lowqual for v3.1 or leave as is to be more consistent with v3.0?
    # I will need to add this annotation if so
    ht = ht.annotate(fail_hard_filters=(ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30))
    ht = ht.annotate(ac_raw=ht.ac_qc_samples_raw)
    ht = ht.annotate(transmitted_singleton=(ht[f"n_transmitted_{group}"] == 1)
                     & (ht[f"ac_qc_samples_{group}"] == 2))

    # the following only selects the required RF fields but I commented it out
    # because some of the fields excluded are needed later
    # ht = ht.select(
    #     "a_index",
    #     "was_split",
    #     *FEATURES,
    #     *TRUTH_DATA,
    #     **{
    #         "transmitted_singleton": (ht[f"n_transmitted_{group}"] == 1)
    #         & (ht[f"ac_qc_samples_{group}"] == 2),
    #         "fail_hard_filters": (ht.QD < 2) | (ht.FS > 60) | (ht.MQ < 30),
    #     },
    #     ac_raw=ht.ac_qc_samples_raw
    # )

    ht = ht.repartition(n_partitions, shuffle=False)
    ht = ht.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_table_for_RF_all_cols.ht',
        overwrite=True)

    ht = median_impute_features(ht, {"variant_type": ht.variant_type})
    ht = ht.checkpoint(
        f'{args.output_dir}/ddd-elgh-ukbb/Sanger_table_for_RF_by_variant_type_all_cols.ht',
        overwrite=True)
from pprint import pprint

hl.plot.output_notebook()

MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
PCA_SCORES_EUR = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/11_pca_scores.strict_european.tsv'
PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'

INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'
IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

# These are the strictly defined European samples.
EUROPEAN_SAMPLES_STRICT = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/10_european.strict.sample_list'

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
ht_ibd_samples = hl.import_table(IBD_SAMPLES, no_header=True, key='f0')
ht_eur_samples_strict = hl.import_table(EUROPEAN_SAMPLES_STRICT, no_header=True, key='f0')

ht_pruned_variants = ht_pruned_variants.annotate(
    **hl.parse_variant(ht_pruned_variants.f0, reference_genome='GRCh38'))
ht_pruned_variants = ht_pruned_variants.key_by(
    ht_pruned_variants.locus, ht_pruned_variants.alleles)

mt = mt.filter_cols(hl.is_defined(ht_initial_samples[mt.col_key]))
hadoop_config = sc._jsc.hadoopConfiguration()

hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

bed_to_exclude_pca = hl.import_bed(
    f"{temp_dir}/1000g/price_high_ld.bed.txt", reference_genome='GRCh38')
cohorts_pop = hl.import_table(
    "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb.tsv",
    delimiter="\t").key_by('s')

# drop cohorts
# annotate with cohorts and populations from s3 table.
# save matrixtable
mt = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt"
)
mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population)
mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
# done the above on pca_RF jupyter notebook
# mt = hl.read_matrix_table(
#     f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt")
# mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
mt.write(
    f"{tmp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_ld_pruned.mt",
    overwrite=True)
# filter matrixtable
logger.info("wrote mt ")
# filter mt
# mt = hl.read_matrix_table(
var_metadata_path = 'gs://gcp-public-data--gnomad/release/3.1.1/ht/genomes/gnomad.genomes.v3.1.1.sites.ht'

# path for Konrad's densified matrix table
dense_mt_path = 'gs://hgdp_tgp/output/tgp_hgdp.mt'

# reading in Alicia's sample metadata file (Note: this file uses the 'v3.1::' prefix as done in gnomAD)
sample_meta = hl.import_table(sample_metadata_path, impute=True)

# reading in Julia's sample metadata file
jul_meta = hl.read_table(jul_metadata_path)

# reading in variant qc information
var_meta = hl.read_table(var_metadata_path)

# reading in densified matrix table
dense_mt = hl.read_matrix_table(dense_mt_path)

# These bits below were written by Tim Poterba to help troubleshoot unflattening a ht with nested structure
# dict to hold struct names as well as nested field names
d = {}

# Getting just the row field names
row = sample_meta.row_value

# returns a dict with the struct names as keys and their inner field names as values
for name in row:
    def recur(dict_ref, split_name):
        if (len(split_name) == 1):
            dict_ref[split_name[0]] = row[name]
            return
# Define the Hail persistent storage directory
tmp_dir = "hdfs://spark-master:9820/"
temp_dir = os.path.join(os.environ["HAIL_HOME"], "tmp")
hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
# s3 credentials required for user to access the datasets in farm flexible compute s3 environment
# you may use your own here from your .s3cfg file in your home directory
hadoop_config = sc._jsc.hadoopConfiguration()

hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

#####################################################################
######################      INPUT DATA        ######################
#####################################################################
CHROMOSOME = "WGS"
mt = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_annotated.mt")
mt = mt.key_rows_by('locus').distinct_by_row(
).key_rows_by('locus', 'alleles')

mt_split = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
mt_split = mt_split.checkpoint(
    f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_checkpoint.mt", overwrite=True)
print("Finished splitting and writing mt. ")

mt = mt_split.annotate_rows(
    Variant_Type=hl.cond(
        hl.is_snp(mt_split.alleles[0], mt_split.alleles[1]), "SNP",
        hl.cond(
            hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
            hl.cond(
                hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
                "Other"))))
import hail as hl

GOLD_STD = 'gs://hail-common/vep/vep/vep_examplars/vep_no_csq_35d9e30.mt/'
GOLD_STD_CSQ = 'gs://hail-common/vep/vep/vep_examplars/vep_csq_23673e70.mt/'

for path, csq in [(GOLD_STD, False), (GOLD_STD_CSQ, True)]:
    print(f"Checking 'hl.vep' replicates on '{path}'")
    expected = hl.read_matrix_table(path)
    actual = hl.vep(expected.select_rows(),
                    'gs://hail-common/vep/vep/vep85-loftee-gcloud.json',
                    csq=csq)
    vep_result_agrees = actual._same(expected)
    if vep_result_agrees:
        print('TEST PASSED')
    else:
        print('TEST FAILED')
    assert vep_result_agrees
        **add_variant_type(mt.alleles)))

    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case()
                   .when(hl.is_snp(mt.alleles[0], mt.alleles[1]), 'snv')
                   .when(hl.is_insertion(mt.alleles[0], mt.alleles[1]), 'ins')
                   .when(hl.is_deletion(mt.alleles[0], mt.alleles[1]), 'del')
                   .default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt


# Read in the matrix table
mt = hl.read_matrix_table('aatd.mt')

# Left normalize and split alleles
split = generate_split_alleles(mt)
mts = hl.variant_qc(split)

# Hard-filtering germline short variants
mts = mts.filter_rows(mts.info.QD >= 2)
mts = mts.filter_rows(mts.info.FS <= 60)
mts = mts.filter_rows(mts.info.SOR <= 3)
mts = mts.filter_rows(mts.info.MQ >= 40)
mts = mts.filter_rows(mts.info.MQRankSum >= -12.5)
mts = mts.filter_rows(mts.info.ReadPosRankSum >= -8)

mts_mq_40 = mts.filter_rows(mts.info.MQ >= 40)
mts_mq_60 = mts.filter_rows(mts.info.MQ >= 60)
def matrix_table_aggregate_entries():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt.aggregate_entries(hl.agg.stats(mt.GQ))
def load_dataset(name,
                 version,
                 reference_genome,
                 config_file='gs://hail-datasets/datasets.json'):
    """Load a genetic dataset from Hail's repository.

    Example
    -------

    >>> # Load 1000 Genomes MatrixTable with GRCh38 coordinates
    >>> mt_1kg = hl.experimental.load_dataset(name='1000_genomes',   # doctest: +SKIP
    ...                                       version='phase3',
    ...                                       reference_genome='GRCh38')

    Parameters
    ----------
    name : :obj:`str`
        Name of the dataset to load.
    version : :obj:`str`
        Version of the named dataset to load
        (see available versions in documentation).
    reference_genome : `GRCh37` or `GRCh38`
        Reference genome build.

    Returns
    -------
    :class:`.Table` or :class:`.MatrixTable`"""

    with hl.hadoop_open(config_file, 'r') as f:
        datasets = json.load(f)

    names = set([dataset['name'] for dataset in datasets])
    if name not in names:
        raise ValueError('{} is not a dataset available in the repository.'.format(repr(name)))

    versions = set([dataset['version'] for dataset in datasets if dataset['name'] == name])
    if version not in versions:
        raise ValueError("""Version {0} not available for dataset {1}.
                            Available versions: {{{2}}}.""".format(repr(version),
                                                                   repr(name),
                                                                   repr('","'.join(versions))))

    reference_genomes = set([dataset['reference_genome'] for dataset in datasets if dataset['name'] == name])
    if reference_genome not in reference_genomes:
        raise ValueError("""Reference genome build {0} not available for dataset {1}.
                            Available reference genome builds: {{'{2}'}}.""".format(repr(reference_genome),
                                                                                    repr(name),
                                                                                    '\',\''.join((reference_genomes))))

    path = [dataset['path'] for dataset in datasets if all([dataset['name'] == name,
                                                            dataset['version'] == version,
                                                            dataset['reference_genome'] == reference_genome])][0].strip('/')

    if path.endswith('.ht'):
        dataset = hl.read_table(path)
    else:
        if not path.endswith('.mt'):
            raise ValueError('Invalid path {}: can only load datasets with .ht or .mt extensions.'.format(repr(path)))
        dataset = hl.read_matrix_table(path)

    return dataset
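# For illustration only: the lookups in load_dataset above imply that each
# record in the config JSON carries 'name', 'version', 'reference_genome', and
# 'path' fields. A hypothetical entry, written as the equivalent Python literal
# (the values are placeholders, not taken from the real datasets.json):
example_dataset_record = {
    'name': '1000_genomes',
    'version': 'phase3',
    'reference_genome': 'GRCh38',
    'path': 'gs://hail-datasets/hail-data/1000_genomes.phase3.GRCh38.mt',   # hypothetical
}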
def matrix_table_cols_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.cols().show(100)
def matrix_table_entries_table():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt.entries()._force_count()
def export_vcf():
    mt = hl.read_matrix_table(resource('profile.mt'))
    out = hl.utils.new_temp_file(suffix='vcf.bgz')
    hl.export_vcf(mt, out)
def matrix_table_rows_is_transition():
    ht = hl.read_matrix_table(resource('profile.mt')).rows().key_by()
    ht.select(is_snp=hl.is_snp(ht.alleles[0], ht.alleles[1]))._force_count()
def matrix_table_show(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.show(100)
def matrix_table_many_aggs_col_wise():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt = mt.annotate_cols(**many_aggs(mt))
    mt.cols()._force_count()
def matrix_table_entries_table_no_key(mt_path):
    mt = hl.read_matrix_table(mt_path).key_rows_by().key_cols_by()
    mt.entries()._force_count()
def read_decode_gnomad_coverage(mt_path):
    hl.read_matrix_table(mt_path)._force_count_rows()
hl.init()

from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# The following was run using these jars and zips (to enable the new version of split_multi).
# gs://hail-common/builds/0.2/jars/hail-0.2-7a280b932abc959f7accf11124f3255038f48c95-Spark-2.4.0.jar
# gs://hail-common/builds/0.2/python/hail-0.2-7a280b932abc959f7accf11124f3255038f48c95.zip

RAW_MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/dalio_bipolar_w1_w2/Dalio_W1_W2_GRCh38_exomes.mt'
MT = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.mt'
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'

# Remove multiallelics with 100 or more alleles...
mt = hl.read_matrix_table(RAW_MT)

# Count before splitting multi-allelics.
n = mt.count()

pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])

# Read in the target intervals
TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'

# Import the interval lists for the target intervals.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS, reference_genome='GRCh38')
def matrix_table_take_entry(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.GT.take(100)
def matrix_table_array_arithmetic():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt = mt.filter_rows(mt.alleles.length() == 2)
    mt.select_entries(dosage=hl.pl_dosage(mt.PL)).select_rows()._force_count_rows()
def get_gnomad_data(data_type: str, adj: bool = False, split: bool = True, raw: bool = False,
                    non_refs_only: bool = False, hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: str = None, meta_root: Optional[str] = 'meta', full_meta: bool = False,
                    fam_version: str = CURRENT_FAM, fam_root: str = None, duplicate_mapping_root: str = None,
                    release_samples: bool = False, release_annotations: bool = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as VDS. By default, returns split hardcalls (with adj annotated but not filtered)

    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies to raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotype only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param str full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of metadata (default to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    # from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException('No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, split=split, non_refs_only=non_refs_only,
                                                       hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, hardcalls=not raw, split=split,
                                                       hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                                 key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_mt = get_gnomad_public_data(data_type, split, release_annotations)
        mt = mt.select_rows(release=sites_mt[mt.v, :])  # TODO: replace with ** to nuke old annotations

    return mt
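# A hedged usage sketch for get_gnomad_data (not from the source): it assumes
# the surrounding gnomad_hail resources (CURRENT_HAIL_VERSION, metadata, fam
# files) are available in the environment, which may not hold everywhere.
exomes = get_gnomad_data('exomes', adj=True, meta_root='meta', release_samples=True)
# keep biallelic sites only, then count the remaining release samples
exomes = exomes.filter_rows(hl.len(exomes.alleles) == 2)
print(exomes.count_cols())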
def matrix_table_rows_force_count():
    ht = hl.read_matrix_table(resource('profile.mt')).rows().key_by()
    ht._force_count()
variant_qc_table_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/5b_variant_qc_table.txt.gz'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define constants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

mincallrate = 0.98
hwep = 0.000001

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

vds = hl.read_matrix_table(qced_vds_file)
samples_to_keep = hl.import_table(samples_to_keep_file, no_header=True).key_by('f0')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# fix HWE
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

step0 = vds.variant_qc.AC > 0
step1 = vds.filters.size() == 0
step2 = vds.info.QD > 4
step3 = vds.lowestcallrate >= mincallrate
step4 = vds.lowestphwe >= hwep

qc_struct = hl.Struct(step0=step0, step1=step1, step2=step2, step3=step3, step4=step4)
def matrix_table_decode_and_count():
    mt = hl.read_matrix_table(resource('profile.mt'))
    mt._force_count_rows()
MT_HARDCALLS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/bipolar_wes_dalio_W1_W2/filterGT_GRCh38_6_multi.hardcalls.mt'
# MT_1KG = 'gs://raw_data_bipolar/data/ALL.1KG.qc.hardcalls.mt'  # Old b37 version
MT_1KG = 'gs://hail-datasets-hail-data/1000_Genomes_autosomes.phase_3.GRCh38.mt'
# Check the size of this guy.
# Make sure that the same samples are used in 38....there's less here.

PCA_SCORES_1KG = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/10_pca_scores_1kg.tsv'
PHENOTYPES_TABLE = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/phenotypes.ht'
# POPULATIONS_1KG = 'gs://raw_data_bipolar/inputs/samples_1kg.tsv'
POPULATIONS_1KG = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/samples_1kg.ht'

INITIAL_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_qc.keep.sample_list'
PRUNED_VARIANTS = 'gs://dalio_bipolar_w1_w2_hail_02/data/variants/04_prune.keep.variant_list'
IBD_SAMPLES = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/06_ibd.remove.sample_list'

mt_1kg = hl.read_matrix_table(MT_1KG)
mt_1kg = hl.split_multi_hts(mt_1kg)

# This is to enable a join later.
mt_1kg = mt_1kg.select_entries("GT")

# Write this to annotate with later
mt_1kg.cols().write(output=POPULATIONS_1KG, overwrite=True)

# This is also to enable a join later.
mt_1kg = mt_1kg.select_cols()

populations_1kg = hl.read_table(POPULATIONS_1KG)

mt = hl.read_matrix_table(MT_HARDCALLS)
sample_annotations = hl.read_table(PHENOTYPES_TABLE)

ht_initial_samples = hl.import_table(INITIAL_SAMPLES, no_header=True, key='f0')
ht_pruned_variants = hl.import_table(PRUNED_VARIANTS, no_header=True)
import sys

import hail as hl

sys.path.insert(0, '/home/danfengc/unicorn')

from qc.utils import *
from qc.plotting import *
from main.pca import *

######################################################################################################
# This part of the code is used to convert the AJ MatrixTable to PLINK format                        #
######################################################################################################

# convert it to plink format
mt_AJ = hl.read_matrix_table(
    'gs://unicorn-resources/Ashkenazi_Jewish_Samples/Ashkenazi_Jewish_Samples.mt')
hl.export_plink(
    mt_AJ,
    output='gs://unicorn-resources/Ashkenazi_Jewish_Samples/Ashkenazi_Jewish_Samples',
    varid=mt_AJ.rsid,
    cm_position=mt_AJ.cm_position)

######################################################################################################
# This part of the code converts the 1KG PLINK BFILE to a Hail matrix table                          #
# The source of the 1KG PLINK bfiles:                                                                #
# `/psych/genetics_data/ripke/references_outdated/hapmap_ref/impute2_ref/1KG_Aug12/                  #
#  ALL_1000G_phase1integrated_v3_impute_macGT1/4pops/qc/pop_4pop_mix_SEQ`                            #
# It is a cleaned PLINK BFILE generated by Stephan.                                                  #
######################################################################################################
mt_1kg = hl.import_plink(
    bed=
def per_row_stats_star_star(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt.annotate_rows(**hl.agg.stats(mt.x))._force_count_rows()