def test_linear_mixed_regression_full_rank(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'),
                              no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'),
                              no_header=True, impute=True, delimiter=' ').key_by('f1')
    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    h2_fastlmm = 0.142761
    h2_places = 6
    beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]
    pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

    mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
    model, _ = hl.linear_mixed_model(y=mt_chr1.y,
                                     x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(),
                                     p_path=p_path)
    model.fit()
    self.assertAlmostEqual(model.h_sq, h2_fastlmm, places=h2_places)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
        model)
    assert np.allclose(ht.beta.collect(), beta_fastlmm)
    assert np.allclose(ht.p_value.collect(), pval_hail)
def test_import_keyby_count_ldsc_lowered_shuffle(self):
    # integration test pulled out of test_ld_score_regression to isolate
    # issues with lowered shuffles and RDD serialization, 2021-07-06
    # if this comment no longer reflects the backend system, that's a really good thing
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP',
        types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP',
        types={'N': hl.tint, 'Z': hl.tfloat})

    j1 = ht_scores[ht_20160['SNP']]
    ht_20160 = ht_20160.annotate(
        ld_score=j1['L2'],
        locus=hl.locus(j1['CHR'], j1['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]))
    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
    assert ht_20160._force_count() == 151
def test_linear_mixed_regression_low_rank(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'),
                              no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'),
                              no_header=True, impute=True, delimiter=' ').key_by('f1')
    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    h2_hail = 0.10001626
    beta_hail = [0.0073201542, 0.039969148, -0.036727875, 0.29852363, -0.049212500]
    pval_hail = [0.90685162, 0.54839177, 0.55001054, 9.85247263e-07, 0.42796507]

    mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
    model, _ = hl.linear_mixed_model(y=mt_chr1.y,
                                     x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(),
                                     p_path=p_path)
    model.fit()
    self.assertTrue(model.low_rank)
    self.assertAlmostEqual(model.h_sq, h2_hail)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
        model)
    assert np.allclose(ht.beta.collect(), beta_hail)
    assert np.allclose(ht.p_value.collect(), pval_hail)
def table_import_ints(tsv):
    hl.import_table(tsv,
                    types={'idx': 'int',
                           **{f'i{i}': 'int' for i in range(5)},
                           **{f'array{i}': 'array<int>' for i in range(2)}}
                    )._force_count()
def get_hq_samples():
    ht = hl.import_table(f'{bucket}/misc/ukb31063_samples_qc_FULL.txt', no_header=True)
    drop_samples = hl.import_table(
        f'{bucket}/misc/ukb31063.withdrawn_samples_20190321.txt',
        no_header=True, key='f0')
    ht = ht.key_by(s=ht.f0).drop('f0')
    return ht.filter(hl.is_missing(drop_samples[ht.s]))
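# A minimal, self-contained sketch of the anti-join pattern used in
# get_hq_samples above: keep rows of one table whose key is missing from
# another. The table names below are illustrative, not from the original code.
import hail as hl

ht = hl.utils.range_table(10)                   # keyed by 'idx'
drop = hl.utils.range_table(5)                  # keys 0-4 stand in for the withdrawn list
kept = ht.filter(hl.is_missing(drop[ht.idx]))   # keep rows with no match in drop
assert kept.count() == 5                        # idx 5-9 remain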
def table_import_ints():
    hl.import_table(resource('many_ints_table.tsv.bgz'),
                    types={'idx': 'int',
                           **{f'i{i}': 'int' for i in range(5)},
                           **{f'array{i}': 'array<int>' for i in range(2)}}
                    )._force_count()
def gwas_on_gcta_splits(min_id, max_id, parsplit, paridx):
    # replicate "IDs", which were used as seeds to generate the random split
    rep_ids = range(min_id + paridx - 1, max_id + 1, parsplit)

    ids = hl.import_table('gs://nbaya/split/gcta/gcta_20k.grm.id', no_header=True)  # GRM ids
    ids = ids.rename({'f0': 'FID', 'f1': 'IID'})
    ids = set(ids.IID.collect())

    mt0 = hl.read_matrix_table('gs://nbaya/ldscsim/hm3.50_sim_h2_0.08.mt/')
    mt1 = mt0.filter_cols(hl.literal(ids).contains(mt0.s))

    for i in rep_ids:
        try:
            y1_complete = subprocess.check_output(
                ['gsutil', 'ls', f'gs://nbaya/split/gcta/20k_sumstats.y1.s{i}.tsv.bgz']) is not None
        except subprocess.CalledProcessError:
            y1_complete = False
        try:
            y2_complete = subprocess.check_output(
                ['gsutil', 'ls', f'gs://nbaya/split/gcta/20k_sumstats.y2.s{i}.tsv.bgz']) is not None
        except subprocess.CalledProcessError:
            y2_complete = False

        if not (y1_complete and y2_complete):
            phen = hl.import_table(f'gs://nbaya/split/gcta/gcta_20k.s{i}.phen',
                                   types={'y1': hl.tfloat64, 'y2': hl.tfloat64},
                                   key='IID')
            mt = mt1.annotate_cols(y1=phen[mt1.s].y1, y2=phen[mt1.s].y2)
            if not y1_complete:
                print(f'\r##########\nRunning GWAS for y1 replicate {i}\n##########')
                gwas(mt=mt, x=mt.dosage, y=mt.y1, is_std_cov_list=True,
                     path_to_save=f'gs://nbaya/split/gcta/20k_sumstats.y1.s{i}.tsv.bgz')
            if not y2_complete:
                print(f'\r##########\nRunning GWAS for y2 replicate {i}\n##########')
                gwas(mt=mt, x=mt.dosage, y=mt.y2, is_std_cov_list=True,
                     path_to_save=f'gs://nbaya/split/gcta/20k_sumstats.y2.s{i}.tsv.bgz')
        else:
            print(f'\r##########\nGWAS already complete for y1 and y2 for replicate {i}\n##########')
def download_data():
    global _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR', '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = [os.path.join(_data_dir, f) for f in
             ['profile.vcf.bgz',
              'profile.mt',
              'table_10M_par_1000.ht',
              'table_10M_par_100.ht',
              'table_10M_par_10.ht',
              'gnomad_dp_simulation.mt',
              'many_strings_table.ht']]
    if not all(os.path.exists(file) for file in files):
        hl.init()  # use all cores

        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz', vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf, min_partitions=16).write(
            os.path.join(_data_dir, 'profile.mt'), overwrite=True)

        ht = hl.utils.range_table(10_000_000, 1000).annotate(
            **{f'f_{i}': hl.rand_unif(0, 1) for i in range(5)})
        ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'), overwrite=True)
        ht = ht.naive_coalesce(100).checkpoint(
            os.path.join(_data_dir, 'table_10M_par_100.ht'), overwrite=True)
        ht.naive_coalesce(10).write(
            os.path.join(_data_dir, 'table_10M_par_10.ht'), overwrite=True)

        mt = hl.utils.range_matrix_table(n_rows=250_000, n_cols=1_000, n_partitions=32)
        mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5) ** 3))
        mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'), overwrite=True)

        print('downloading many strings table...')
        mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
        mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
        urlretrieve('https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz', mst_tsv)
        print('importing...')
        hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
        hl.stop()
    else:
        print('all files found.', flush=True)
def load_prescription_data(prescription_data_tsv_path: str,
                           prescription_mapping_tsv_path: str):
    ht = hl.import_table(prescription_data_tsv_path,
                         types={'eid': hl.tint, 'data_provider': hl.tint},
                         key='eid')
    mapping_ht = hl.import_table(prescription_mapping_tsv_path,
                                 impute=True, key='Original_Prescription')
    ht = ht.annotate(
        issue_date=hl.if_else(
            hl.len(ht.issue_date) == 0,
            hl.missing(hl.tint64),
            hl.experimental.strptime(ht.issue_date + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')),
        **mapping_ht[ht.drug_name])
    ht = (ht.filter(ht.Generic_Name != '')
          .key_by('eid', 'Generic_Name', 'Drug_Category_and_Indication')
          .collect_by_key())
    ht = ht.annotate(values=hl.sorted(ht.values, key=lambda x: x.issue_date))
    return ht.to_matrix_table(row_key=['eid'], col_key=['Generic_Name'],
                              col_fields=['Drug_Category_and_Indication'])
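# Hypothetical invocation of load_prescription_data (both paths are
# placeholders): the result is a participant-by-drug MatrixTable whose
# entries hold the issue records sorted by issue_date.
mt = load_prescription_data('prescriptions.tsv', 'prescription_mapping.tsv')
mt.describe()  # rows keyed by 'eid', columns keyed by 'Generic_Name'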
def _create(self, resource_dir):
    download(resource_dir, 'many_ints_table.tsv.bgz')
    logging.info('importing many_ints_table.tsv.bgz...')
    hl.import_table(os.path.join(resource_dir, 'many_ints_table.tsv.bgz'),
                    types={'idx': 'int',
                           **{f'i{i}': 'int' for i in range(5)},
                           **{f'array{i}': 'array<int>' for i in range(2)}}) \
        .write(os.path.join(resource_dir, 'many_ints_table.ht'), overwrite=True)
    logging.info('done importing many_ints_table.tsv.bgz.')
def prepare_gtex_expression_data(transcript_tpms_path, sample_annotations_path, tmp_path):
    # Recompress tpms file with block gzip so that import_matrix_table will read the file
    ds = hl.import_table(transcript_tpms_path, force=True)
    tmp_transcript_tpms_path = tmp_path + "/" + transcript_tpms_path.split("/")[-1].replace(".gz", ".bgz")
    ds.export(tmp_transcript_tpms_path)

    # Import data
    ds = hl.import_matrix_table(
        tmp_transcript_tpms_path,
        row_fields={"transcript_id": hl.tstr, "gene_id": hl.tstr},
        entry_type=hl.tfloat,
    )
    ds = ds.rename({"col_id": "sample_id"})
    ds = ds.repartition(1000, shuffle=True)

    samples = hl.import_table(sample_annotations_path, key="SAMPID")

    # Separate version numbers from transcript and gene IDs
    ds = ds.annotate_rows(
        transcript_id=ds.transcript_id.split(r"\.")[0],
        transcript_version=hl.int(ds.transcript_id.split(r"\.")[1]),
        gene_id=ds.gene_id.split(r"\.")[0],
        gene_version=hl.int(ds.gene_id.split(r"\.")[1]),
    )

    # Annotate columns with the tissue the sample came from
    ds = ds.annotate_cols(tissue=samples[ds.sample_id].SMTSD)

    # Collect expression into median across all samples in each tissue
    ds = ds.group_cols_by(ds.tissue).aggregate(
        **{"": hl.agg.approx_median(ds.x)}).make_table()

    # Format tissue names
    other_fields = {"transcript_id", "transcript_version", "gene_id", "gene_version"}
    tissues = [f for f in ds.row_value.dtype.fields if f not in other_fields]
    ds = ds.transmute(tissues=hl.struct(
        **{format_tissue_name(tissue): ds[tissue] for tissue in tissues}))

    ds = ds.key_by("transcript_id").drop("row_id")

    return ds
def remap_samples(
    original_mt_path: str,
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    inferred_sex: str,
) -> Tuple[hl.MatrixTable, hl.Table]:
    """
    Rename `s` col in the MatrixTable and inferred sex ht.

    :param original_mt_path: Path to original MatrixTable location
    :param input_mt: MatrixTable
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param inferred_sex: Path to text file of inferred sexes
    :return: mt and sex ht with sample names remapped
    """
    base_path = "/".join(dirname(original_mt_path).split("/")[:-1]) + "/base/projects"
    project_list = list(set(pedigree.Project_GUID.collect()))

    # Get the list of hts containing sample remapping information for each project
    remap_hts = []
    sex_ht = hl.import_table(inferred_sex)

    for i in project_list:
        remap = f"{base_path}/{i}/{i}_remap.tsv"
        if hl.hadoop_is_file(remap):
            remap_ht = hl.import_table(remap)
            remap_ht = remap_ht.key_by("s", "seqr_id")
            remap_hts.append(remap_ht)
    logger.info("Found %d projects that need to be remapped.", len(remap_hts))

    if len(remap_hts) > 0:
        ht = remap_hts[0]
        for next_ht in remap_hts[1:]:
            ht = ht.join(next_ht, how="outer")

        # If a sample has a non-missing value for seqr_id, rename it to the
        # sample name for the mt and sex ht
        ht = ht.key_by("s")
        input_mt = input_mt.annotate_cols(seqr_id=ht[input_mt.s].seqr_id)
        input_mt = input_mt.key_cols_by(s=hl.if_else(
            hl.is_missing(input_mt.seqr_id), input_mt.s, input_mt.seqr_id))
        sex_ht = sex_ht.annotate(seqr_id=ht[sex_ht.s].seqr_id).key_by("s")
        sex_ht = sex_ht.key_by(s=hl.if_else(
            hl.is_missing(sex_ht.seqr_id), sex_ht.s, sex_ht.seqr_id))
    else:
        sex_ht = sex_ht.key_by("s")

    return input_mt, sex_ht
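# remap_samples combines the per-project remap tables with a left fold of
# outer joins. A minimal sketch of that fold on two toy tables, both keyed
# by 'idx' (names here are illustrative):
import hail as hl

t1 = hl.utils.range_table(3).annotate(a=1)
t2 = hl.utils.range_table(5).annotate(b=2)
joined = t1.join(t2, how='outer')  # union of keys; absent fields are missing
assert joined.count() == 5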
def get_all_sample_metadata(
    mt: hl.MatrixTable, build: int, data_type: str, data_source: str, version: int
) -> hl.Table:
    """
    Annotate MatrixTable with all current metadata: sample sequencing metrics,
    sample ID mapping, and callrate for bi-allelic, high-callrate common SNPs.

    :param MatrixTable mt: VCF converted to a MatrixTable
    :param int build: build for write, 37 or 38
    :param str data_type: WGS or WES for write path and flagging metrics
    :param str data_source: internal or external for write path
    :param int version: Int for write path
    :return: Table with seq metrics and mapping
    :rtype: Table
    """
    logger.info("Importing and annotating with sequencing metrics...")
    meta_ht = hl.import_table(
        seq_metrics_path(build, data_type, data_source, version), impute=True
    ).key_by("SAMPLE")

    logger.info("Importing and annotating seqr ID names...")
    remap_ht = hl.import_table(
        remap_path(build, data_type, data_source, version), impute=True
    ).key_by("s")
    meta_ht = meta_ht.annotate(**remap_ht[meta_ht.key])
    meta_ht = meta_ht.annotate(
        seqr_id=hl.if_else(
            hl.is_missing(meta_ht.seqr_id), meta_ht.SAMPLE, meta_ht.seqr_id
        )
    )

    logger.info(
        "Filtering to bi-allelic, high-callrate, common SNPs to calculate callrate..."
    )
    mt = filter_rows_for_qc(
        mt,
        min_af=0.001,
        min_callrate=0.99,
        bi_allelic_only=True,
        snv_only=True,
        apply_hard_filters=False,
        min_inbreeding_coeff_threshold=None,
        min_hardy_weinberg_threshold=None,
    )
    callrate_ht = mt.select_cols(
        filtered_callrate=hl.agg.fraction(hl.is_defined(mt.GT))
    ).cols()
    meta_ht = meta_ht.annotate(**callrate_ht[meta_ht.key])
    return meta_ht
def get_gnomad_data(data_type: str, adj: bool = False, split: bool = True, raw: bool = False,
                    non_refs_only: bool = False, hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: str = None, meta_root: Optional[str] = 'meta', full_meta: bool = False,
                    fam_version: str = CURRENT_FAM, fam_root: str = None, duplicate_mapping_root: str = None,
                    release_samples: bool = False, release_annotations: str = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as a MatrixTable. By default, returns split hardcalls (with adj annotated but not filtered).

    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies to raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotype only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param bool full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of metadata (default to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException('No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, split=split,
                                                       non_refs_only=non_refs_only,
                                                       hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, hardcalls=not raw,
                                                       split=split, hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                                 key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_ht = get_gnomad_public_data(data_type, split)
        mt = mt.select_rows(**sites_ht[mt.row_key])
        mt = mt.select_globals(**sites_ht.index_globals())

    return mt
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+',
                            min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(
        hl.dict(hl.agg.collect(
            (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)), True))),
        _localize=False)
    return clump_dict
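# Because specific_clumps passes _localize=False, the aggregate stays a Hail
# expression, so the returned dict can be used in downstream queries without
# pulling data to the driver. A small sketch of the membership test it
# enables (the string keys below stand in for loci):
import hail as hl

d = hl.literal({'a': True})   # stands in for the locus-keyed clump dict
assert hl.eval(d.get('a', False))       # in the clumps
assert not hl.eval(d.get('b', False))   # not in the clumps
# Against a real dataset one would filter with something like:
#   mt = mt.filter_rows(clump_dict.get(mt.locus, False))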
def main():
    vshl.init()
    print("Version info:")
    for (k, v) in vshl.version_info().items():
        print("%s=%s" % (k, v))

    data = hl.import_vcf(os.path.join(PROJECT_DIR, 'data/chr22_1000_GRCh38.vcf'),
                         reference_genome='GRCh38')
    labels = hl.import_table(os.path.join(PROJECT_DIR, 'data/chr22-labels-hail.csv'),
                             delimiter=',',
                             types={'x22_16050408': 'float64'}).key_by('sample')

    mt = data.annotate_cols(hipster=labels[data.s])
    print(mt.count())

    rf_model = vshl.random_forest_model(y=mt.hipster.x22_16050408,
                                        x=mt.GT.n_alt_alleles(),
                                        seed=13, mtry_fraction=0.05,
                                        min_node_size=5, max_depth=10)
    rf_model.fit_trees(100, 50)

    print("OOB error: %s" % rf_model.oob_error())
    impTable = rf_model.variable_importance()
    impTable.show(3)

    rf_model.to_json(os.path.join(PROJECT_DIR, "target/chr22_1000_GRCh38-model.json"), True)
    rf_model.release()
def prepare_gene_results(gene_results_url, genes_url=None):
    ds = hl.import_table(
        gene_results_url,
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.rename({"description": "gene_description"})

    if genes_url:
        genes = hl.read_table(genes_url)
        genes = genes.key_by("gene_id")
        ds = ds.annotate(chrom=genes[ds.gene_id].chrom, pos=genes[ds.gene_id].start)

    return ds
def import_SJ_out_tab(path):
    """Import a STAR SJ.out.tab splice-junction file as a Hail Table.

    column 1: chromosome
    column 2: first base of the intron (1-based)
    column 3: last base of the intron (1-based)
    column 4: strand (0: undefined, 1: +, 2: -)
    column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG,
              4: CT/GC, 5: AT/AC, 6: GT/AT
    column 6: 0: unannotated, 1: annotated (only if splice junctions database is used)
    column 7: number of uniquely mapping reads crossing the junction
    column 8: number of multi-mapping reads crossing the junction
    column 9: maximum spliced alignment overhang
    """
    ht = hl.import_table(
        path,
        no_header=True,
        impute=True,
        force=True,
    ).rename({
        "f0": "chrom",
        "f1": "start_1based",
        "f2": "end_1based",
        "f3": "strand",
        "f4": "intron_motif",
        "f5": "known_splice_junction",
        "f6": "unique_reads",
        "f7": "multi_mapped_reads",
        "f8": "maximum_overhang",
    })
    return ht
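# Example use of import_SJ_out_tab (the path is a placeholder): keep
# annotated junctions crossed by at least 10 uniquely mapping reads.
ht = import_SJ_out_tab('SJ.out.tab')
ht = ht.filter((ht.known_splice_junction == 1) & (ht.unique_reads >= 10))
ht.show(5)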
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table(
        'gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/')
    sumstats = hl.import_table(
        'gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
        impute=True)
    sumstats = sumstats.annotate(locus=hl.locus(sumstats.chr, sumstats.pos),
                                 alleles=hl.array([sumstats.ref, sumstats.alt]))
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats.describe()

    # mt = hl.read_matrix_table(genotype_mt_path)  # read genotype mt subset
    # get full genotype mt
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()

    mt = mt.annotate_rows(beta=hl.if_else(hl.is_defined(clumped[mt.row_key]),
                                          sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))
    mt_cols = mt.cols()
    mt_cols = mt_cols.repartition(1000)
    mt_cols.write(f'{scratch_dir}/prs_all_samples.ht')
def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    # the result field is 'chi_sq' (matching the duplicate of this test below)
    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        bad.order_by(hl.asc(bad.locus)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def remap_sample_ids(mt, remap_path):
    """
    Remap the MatrixTable's sample ID, 's', field to the sample ID used within seqr, 'seqr_id'.
    If the sample 's' does not have a 'seqr_id' in the remap file, 's' becomes 'seqr_id'.

    :param mt: MatrixTable from VCF
    :param remap_path: Path to a file with two columns 's' and 'seqr_id'
    :return: MatrixTable remapped and keyed to use seqr_id
    """
    remap_ht = hl.import_table(remap_path, key='s')
    missing_samples = remap_ht.anti_join(mt.cols()).collect()
    remap_count = remap_ht.count()

    if len(missing_samples) != 0:
        raise MatrixTableSampleSetError(
            f'Only {remap_ht.semi_join(mt.cols()).count()} out of {remap_count} '
            'remap IDs matched IDs in the variant callset.\n'
            f'IDs that aren\'t in the callset: {missing_samples}\n'
            f'All callset sample IDs:{mt.s.collect()}', missing_samples)

    mt = mt.annotate_cols(**remap_ht[mt.s])
    remap_expr = hl.if_else(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
    mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s)
    mt = mt.key_cols_by(s=mt.seqr_id)
    logger.info(f'Remapped {remap_count} sample ids...')
    return mt
def test_de_novo(self):
    mt = hl.import_vcf(resource('denovo.vcf'))
    mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
    ped = hl.Pedigree.read(resource('denovo.fam'))
    r = hl.de_novo(mt, ped, mt.info.ESP)
    r = r.select(
        prior=r.prior,
        kid_id=r.proband.s,
        dad_id=r.father.s,
        mom_id=r.mother.s,
        p_de_novo=r.p_de_novo,
        confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
    truth = truth.select(
        locus=hl.locus(truth['Chr'], truth['Pos']),
        alleles=[truth['Ref'], truth['Alt']],
        kid_id=truth['Child_ID'],
        dad_id=truth['Dad_ID'],
        mom_id=truth['Mom_ID'],
        p_de_novo=truth['Prob_dn'],
        confidence=truth['Validation_Likelihood'].split('_')[0]
    ).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

    j = r.join(truth, how='outer')
    self.assertTrue(j.all((j.confidence == j.confidence_1) &
                          (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
def test_tdt(self):
    pedigree = hl.Pedigree.read(resource('tdt.fam'))
    tdt_tab = (hl.transmission_disequilibrium_test(
        hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
        pedigree))

    truth = hl.import_table(
        resource('tdt_results.tsv'),
        types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
               'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
    truth = (truth
             .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                        alleles=[truth.REF, truth.ALT])
             .key_by('locus', 'alleles'))

    if tdt_tab.count() != truth.count():
        self.fail('Result has {} rows but should have {} rows'.format(
            tdt_tab.count(), truth.count()))

    bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
           .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
    bad.describe()
    bad = bad.filter(~(
        (bad.t == bad.T) &
        (bad.u == bad.U) &
        (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
        (hl.abs(bad.p_value - bad.Pval) < 0.001)))

    if bad.count() != 0:
        bad.order_by(hl.asc(bad.locus)).show()
        self.fail('Found rows in violation of the predicate (see show output)')
def ref_filtering(ref_mt, pass_mt, unrel, outliers, pass_unrel_mt,
                  overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt)
    all_sample_filters = set(mt['sample_filters'])
    bad_sample_filters = {re.sub('fail_', '', x)
                          for x in all_sample_filters if x.startswith('fail_')}
    mt_filt = mt.filter_cols(
        mt['sample_filters']['qc_metrics_filters'].difference(bad_sample_filters).length() == 0)
    mt_filt = mt_filt.checkpoint(pass_mt, overwrite=False, _read_if_exists=True)

    mt_unrel = hl.read_matrix_table(unrel)

    mt_filt = mt_filt.filter_rows(mt_filt.filters.length() == 0)  # gnomAD QC pass variants
    mt_filt = mt_filt.filter_cols(hl.is_defined(mt_unrel.cols()[mt_filt.s]))  # only unrelated

    # remove outliers
    pca_outliers = hl.import_table(outliers).key_by('s')
    mt_filt = mt_filt.filter_cols(hl.is_missing(pca_outliers[mt_filt.s]))

    mt_filt.write(pass_unrel_mt, overwrite)
def get_transcript_lof_metrics_ht() -> hl.Table:
    return hl.import_table(
        f"{nfs_dir}/resources/gnomad/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz",
        min_partitions=100,
        impute=True,
        key='transcript'
    )
def compute_ldscore(ht, bm_ld, n, radius, out_name, overwrite):
    r2 = bm_ld ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    # This is required, as the squaring/multiplication densifies, so this re-sparsifies.
    starts_and_stops = hl.linalg.utils.locus_windows(ht.locus, radius, _localize=False)
    r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops, blocks_only=False)
    r2_adj = r2_adj.sparsify_triangle()
    r2_adj = checkpoint_tmp(r2_adj)

    # Note that the original ld matrix is triangular
    l2row = checkpoint_tmp(r2_adj.sum(axis=0)).T
    l2col = checkpoint_tmp(r2_adj.sum(axis=1))
    r2_diag = checkpoint_tmp(r2_adj.diagonal()).T
    l2 = l2row + l2col - r2_diag

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_gs_temp_path()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
    ht_scores = ht_scores.key_by('idx')
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx]).drop('idx')
    ht = ht.checkpoint(out_name, overwrite)
    return ht
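# Why l2row + l2col - r2_diag recovers full row sums from triangular storage:
# for a symmetric matrix S kept only as its upper triangle T (diagonal
# included), row-sums(S) = col-sums(T) + row-sums(T) - diag(T), since the
# diagonal would otherwise be counted twice. A quick numpy check:
import numpy as np

s = np.random.rand(4, 4)
s = (s + s.T) / 2          # symmetric stand-in for the r2 matrix
t = np.triu(s)             # triangular storage, as after sparsify_triangle()
recovered = t.sum(axis=0) + t.sum(axis=1) - np.diag(t)
assert np.allclose(recovered, s.sum(axis=1))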
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("ASC", "gene_results_path"),
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

    ds = ds.group_by("gene_id").aggregate(group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(group_results=hl.dict(
        ds.group_results.map(lambda group_result:
                             (group_result.analysis_group,
                              group_result.drop("analysis_group")))))

    return ds
def get_gnomad_genomes_coverage_ht() -> hl.Table:
    return hl.import_table(
        f"{nfs_dir}/resources/gnomad/gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz",
        min_partitions=1000,
        impute=True,
        key='locus'
    )
def preprocess2(variant_set):
    print('\n##################')
    print('Starting Pre-processing 2: Filtering variants table (variant_set: ' + variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')

    variants = hl.read_table('gs://nbaya/split/' + variant_set + '_variants.ht')  # for hm3: import table hapmap3_variants.tsv
    mt = hl.read_matrix_table('gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
    mt = mt.filter_rows(hl.is_defined(variants[mt.locus, mt.alleles]))  # filter to variant_set variants

    covs = hl.import_table('gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
                           key='s', impute=True, types={'s': hl.tstr})
    mt = mt.annotate_cols(**covs[mt.s])
    mt = mt.filter_cols(hl.is_defined(mt.PC1), keep=True)

    mt.write('gs://nbaya/split/ukb31063.' + variant_set + '_variants.gwas_samples_prerepart.mt')

    print('\n##################')
    print('Finished Pre-processing 2: Filtering variants table (variant_set: ' + variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')
def main(args):
    add_args = {}
    if args.n_threads is not None:
        add_args['master'] = f'local[{args.n_threads}]'
    hl.init(default_reference='GRCh38', log='/load_finngen.log', **add_args)

    if args.load_single:
        ht = hl.import_table(args.input_file, impute=True, force_bgz=True,
                             min_partitions=100).rename({'#chrom': 'chrom'})
        ht = ht.transmute(locus=hl.locus('chr' + ht.chrom, ht.pos),
                          alleles=[ht.ref, ht.alt]).key_by('locus', 'alleles')
        ht = ht.transmute(Pvalue=ht.pval).annotate_globals(**json.loads(args.additional_dict))
        ht = ht.annotate(**get_vep_formatted_data(args.vep_path)[ht.key])
        ht = ht.checkpoint(args.output_ht, overwrite=args.overwrite,
                           _read_if_exists=not args.overwrite)
        ht = ht.select_globals().annotate(**json.loads(args.additional_dict))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['phenocode'],
                                ['rsids', 'nearest_genes', 'gene', 'annotation'],
                                ['category', 'name', 'n_cases', 'n_controls'])
        mt.checkpoint(args.output_mt, overwrite=args.overwrite,
                      _read_if_exists=not args.overwrite)

    if args.combine_all:
        # all_hts = list(filter(lambda y: y.endswith('.ht'),
        #                       map(lambda x: x['path'], hl.hadoop_ls(args.input_directory))))
        # print(f'Got {len(all_hts)} HTs...')
        # mt = mwzj_hts_by_tree(all_hts, temp_bucket + '/finngen', ['phenocode'], debug=True)
        # mt.checkpoint(temp_mt_path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        mt = hl.read_matrix_table(temp_mt_path)
        mt.naive_coalesce(5000).write(args.output_mt, args.overwrite)
def test_reference_genome_sequence(self):
    gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(gr3.name, "my_reference_genome")
    self.assertFalse(gr3.has_sequence())

    gr4 = ReferenceGenome.from_fasta_file("test_rg",
                                          resource("fake_reference.fasta"),
                                          resource("fake_reference.fasta.fai"),
                                          mt_contigs=["b", "c"],
                                          x_contigs=["a"])
    self.assertTrue(gr4.has_sequence())
    self.assertTrue(gr4.x_contigs == ["a"])

    t = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(hl.eval(t.all(
        hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

    l = hl.locus("a", 7, gr4)
    self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))

    gr4.remove_sequence()
    assert not gr4.has_sequence()

    gr4.add_sequence(resource("fake_reference.fasta"),
                     resource("fake_reference.fasta.fai"))
    assert gr4.has_sequence()
def subset_samples_and_variants(mt, subset_path):
    """
    Subset the MatrixTable to the provided list of samples and to variants present in those samples.

    :param mt: MatrixTable from VCF
    :param subset_path: Path to a file with a single column 's'
    :return: MatrixTable subsetted to list of samples
    """
    subset_ht = hl.import_table(subset_path, key='s')
    subset_count = subset_ht.count()
    anti_join_ht = subset_ht.anti_join(mt.cols())
    anti_join_ht_count = anti_join_ht.count()

    if anti_join_ht_count != 0:
        missing_samples = anti_join_ht.s.collect()
        raise MatrixTableSampleSetError(
            f'Only {subset_count - anti_join_ht_count} out of {subset_count} '
            'subsetting-table IDs matched IDs in the variant callset.\n'
            f'IDs that aren\'t in the callset: {missing_samples}\n'
            f'All callset sample IDs:{mt.s.collect()}', missing_samples)

    original_sample_count = mt.count_cols()
    mt = mt.semi_join_cols(subset_ht)
    mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    logger.info(f'Finished subsetting samples. Kept {subset_count} '
                f'out of {original_sample_count} samples in vds')
    return mt
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')
    mt = mt.annotate_cols(pheno=table[mt.s])

    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles), P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
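# Hypothetical invocation of run_gwas (paths are placeholders); the exported
# .assoc file is a plain TSV, so it can be read back with import_table.
run_gwas('data/1kg.vcf.bgz', 'data/phenotypes.tsv', 'output/gwas')
results = hl.import_table('output/gwas.assoc', impute=True, key='SNP')
results.show(5)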
def test_import_table_force_bgz(self):
    f = new_temp_file(suffix=".bgz")
    t = hl.utils.range_table(10, 5)
    t.export(f)

    f2 = new_temp_file(suffix=".gz")
    run_command(["cp", uri_path(f), uri_path(f2)])

    t2 = hl.import_table(f2, force_bgz=True, impute=True).key_by('idx')
    self.assertTrue(t._same(t2))
def test_linear_mixed_regression_pass_through(self):
    x_table = hl.import_table(resource('fastlmmCov.txt'),
                              no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'),
                              no_header=True, impute=True, delimiter=' ').key_by('f1')
    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
    p_path = utils.new_temp_file()

    mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
    model, _ = hl.linear_mixed_model(y=mt_chr1.y,
                                     x=[1, mt_chr1.x],
                                     z_t=mt_chr1.GT.n_alt_alleles(),
                                     p_path=p_path)
    model.fit(log_gamma=0)

    mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
    mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()),
                                    foo=hl.struct(bar=hl.rand_norm(0, 1)))
    ht = hl.linear_mixed_regression_rows(
        (mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
        model,
        pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])

    assert mt_chr3.aggregate_rows(hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))
def test_reference_genome_sequence(self):
    gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
    self.assertEqual(gr3.name, "my_reference_genome")
    self.assertFalse(gr3.has_sequence())

    gr4 = ReferenceGenome.from_fasta_file("test_rg",
                                          resource("fake_reference.fasta"),
                                          resource("fake_reference.fasta.fai"),
                                          mt_contigs=["b", "c"],
                                          x_contigs=["a"])
    self.assertTrue(gr4.has_sequence())
    self.assertTrue(gr4.x_contigs == ["a"])

    t = hl.import_table(resource("fake_reference.tsv"), impute=True)
    self.assertTrue(hl.eval(t.all(
        hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

    l = hl.locus("a", 7, gr4)
    self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen', entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False,
               min_partitions=None) -> hl.Table:
    """Import a GTF file.

    The GTF file format is identical to the GFF version 2 file format,
    and so this function can be used to import GFF version 2 files as well.

    See https://www.ensembl.org/info/website/upload/gff.html for more
    details on the GTF/GFF2 file format.

    The :class:`.Table` returned by this function will be keyed by the
    ``interval`` row field and will include the following row fields:

    .. code-block:: text

        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'interval': interval<>

    There will also be corresponding fields for every tag found in the
    attribute field of the GTF file.

    Note
    ----

    This function will return an ``interval`` field of type
    :class:`.tinterval` constructed from the ``seqname``, ``start``, and
    ``end`` fields in the GTF file. This interval is inclusive of both the
    start and end positions in the GTF file.

    If the ``reference_genome`` parameter is specified, the start and end
    points of the ``interval`` field will be of type :class:`.tlocus`.
    Otherwise, the start and end points of the ``interval`` field will be
    of type :class:`.tstruct` with fields ``seqname`` (type :class:`str`)
    and ``position`` (type :class:`.tint32`).

    Furthermore, if the ``reference_genome`` parameter is specified and
    ``skip_invalid_contigs`` is ``True``, this import function will skip
    lines in the GTF where ``seqname`` is not consistent with the
    reference genome specified.

    Example
    -------

    >>> ht = hl.experimental.import_gtf('data/test.gtf',
    ...                                 reference_genome='GRCh37',
    ...                                 skip_invalid_contigs=True)

    >>> ht.describe()  # doctest: +NOTEST
    ----------------------------------------
    Global fields:
    None
    ----------------------------------------
    Row fields:
        'source': str
        'feature': str
        'score': float64
        'strand': str
        'frame': int32
        'gene_type': str
        'exon_id': str
        'havana_transcript': str
        'level': str
        'transcript_name': str
        'gene_status': str
        'gene_id': str
        'transcript_type': str
        'tag': str
        'transcript_status': str
        'gene_name': str
        'transcript_id': str
        'exon_number': str
        'havana_gene': str
        'interval': interval<locus<GRCh37>>
    ----------------------------------------
    Key: ['interval']
    ----------------------------------------

    Parameters
    ----------
    path : :obj:`str`
        File to import.
    reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.
    skip_invalid_contigs : :obj:`bool`
        If ``True`` and `reference_genome` is not ``None``, skip lines where
        ``seqname`` is not consistent with the reference genome.
    min_partitions : :obj:`int` or :obj:`None`
        Minimum number of partitions (passed to import_table).

    Returns
    -------
    :class:`.Table`
    """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(
        lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                             .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                             .when(ht['seqname'].startswith('chrHLA'),
                                   ht['seqname'].replace('^chr', ''))
                             .when(ht['seqname'].startswith('chr'), ht['seqname'])
                             .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(
            hl.struct(seqname=ht['seqname'], position=ht['start']),
            hl.struct(seqname=ht['seqname'], position=ht['end']),
            includes_start=True,
            includes_end=True))

    ht = ht.key_by('interval')

    return ht
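# The attribute-parsing expression in import_gtf, replayed in plain Python on
# a single GTF attribute string (illustrative; the Hail version applies the
# same split/replace steps per row, with a regex for the trailing semicolon):
attr = 'gene_id "ENSG00000223972"; gene_name "DDX11L1";'
parsed = {x.split(' ')[0]: x.split(' ')[1].replace('"', '').rstrip(';')
          for x in attr.split('; ')}
assert parsed == {'gene_id': 'ENSG00000223972', 'gene_name': 'DDX11L1'}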
def test_export_plink_exprs(self):
    ds = get_dataset()
    fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                   'f4': 'is_female', 'f5': 'pheno'}
    bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                   'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

    # Test default arguments
    out1 = new_temp_file()
    hl.export_plink(ds, out1)
    fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))
    bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                             (fam1.mat_id == "0") & (fam1.is_female == "0") &
                             (fam1.pheno == "NA")))
    self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                             (bim1.cm_position == "0.0")))

    # Test non-default FAM arguments
    out2 = new_temp_file()
    hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                    mat_id="nada", is_female=True, pheno=False)
    fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                             (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                             (fam2.pheno == "1")))

    # Test quantitative phenotype
    out3 = new_temp_file()
    hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
    fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
            .rename(fam_mapping))

    self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                             (fam3.mat_id == "0") & (fam3.is_female == "0") &
                             (fam3.pheno != "0") & (fam3.pheno != "NA")))

    # Test non-default BIM arguments
    out4 = new_temp_file()
    hl.export_plink(ds, out4, varid="hello", cm_position=100)
    bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
            .rename(bim_mapping))

    self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

    # Test call expr
    out5 = new_temp_file()
    ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
    hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
    ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
    nerrors = ds_all_hom_ref.aggregate_entries(
        hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
    self.assertTrue(nerrors == 0)

    # Test white-space in FAM id expr raises error
    with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
        hl.export_plink(ds, new_temp_file(), mat_id="hello world")

    # Test white-space in varid expr raises error
    with self.assertRaisesRegex(FatalError, "no white space allowed:"):
        hl.export_plink(ds, new_temp_file(), varid="hello world")
def test_ld_score_regression(self):
    ht_scores = hl.import_table(
        doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
        key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

    ht_50_irnt = hl.import_table(
        doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_50_irnt = ht_50_irnt.annotate(
        chi_squared=ht_50_irnt['Z']**2,
        n=ht_50_irnt['N'],
        ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                       ht_scores[ht_50_irnt['SNP']]['BP']),
        alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
        phenotype='50_irnt')

    ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'], ht_50_irnt['alleles'])
    ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                   ht_50_irnt['n'],
                                   ht_50_irnt['ld_score'],
                                   ht_50_irnt['phenotype'])

    ht_20160 = hl.import_table(
        doctest_resource('ld_score_regression.20160.sumstats.tsv'),
        key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

    ht_20160 = ht_20160.annotate(
        chi_squared=ht_20160['Z']**2,
        n=ht_20160['N'],
        ld_score=ht_scores[ht_20160['SNP']]['L2'],
        locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                       ht_scores[ht_20160['SNP']]['BP']),
        alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
        phenotype='20160')

    ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
    ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                               ht_20160['n'],
                               ht_20160['ld_score'],
                               ht_20160['phenotype'])

    ht = ht_50_irnt.union(ht_20160)
    mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                            col_key=['phenotype'],
                            row_fields=['ld_score'],
                            col_fields=[])

    mt_tmp = new_temp_file()
    mt.write(mt_tmp, overwrite=True)
    mt = hl.read_matrix_table(mt_tmp)

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=mt['ld_score'],
        ld_score_expr=mt['ld_score'],
        chi_sq_exprs=mt['chi_squared'],
        n_samples_exprs=mt['n'],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results['50_irnt']['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results['50_irnt']['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results['50_irnt']['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results['20160']['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results['20160']['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results['20160']['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results['20160']['snp_heritability_standard_error'], 0.0416, places=4)

    ht = ht_50_irnt.annotate(
        chi_squared_50_irnt=ht_50_irnt['chi_squared'],
        n_50_irnt=ht_50_irnt['n'],
        chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
        n_20160=ht_20160[ht_50_irnt.key]['n'])

    ht_results = hl.experimental.ld_score_regression(
        weight_expr=ht['ld_score'],
        ld_score_expr=ht['ld_score'],
        chi_sq_exprs=[ht['chi_squared_50_irnt'], ht['chi_squared_20160']],
        n_samples_exprs=[ht['n_50_irnt'], ht['n_20160']],
        n_blocks=20,
        two_step_threshold=5,
        n_reference_panel_variants=1173569)

    results = {
        x['phenotype']: {
            'mean_chi_sq': x['mean_chi_sq'],
            'intercept_estimate': x['intercept']['estimate'],
            'intercept_standard_error': x['intercept']['standard_error'],
            'snp_heritability_estimate': x['snp_heritability']['estimate'],
            'snp_heritability_standard_error':
                x['snp_heritability']['standard_error']}
        for x in ht_results.collect()}

    self.assertAlmostEqual(results[0]['mean_chi_sq'], 3.4386, places=4)
    self.assertAlmostEqual(results[0]['intercept_estimate'], 0.7727, places=4)
    self.assertAlmostEqual(results[0]['intercept_standard_error'], 0.2461, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_estimate'], 0.3845, places=4)
    self.assertAlmostEqual(results[0]['snp_heritability_standard_error'], 0.1067, places=4)

    self.assertAlmostEqual(results[1]['mean_chi_sq'], 1.5209, places=4)
    self.assertAlmostEqual(results[1]['intercept_estimate'], 1.2109, places=4)
    self.assertAlmostEqual(results[1]['intercept_standard_error'], 0.2238, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_estimate'], 0.0486, places=4)
    self.assertAlmostEqual(results[1]['snp_heritability_standard_error'], 0.0416, places=4)
        age=hl.rand_norm(65, 10),
        height=hl.rand_norm(70, 10),
        blood_pressure=hl.rand_norm(120, 20),
        cohort_name="cohort1"),
    cov=hl.struct(PC1=hl.rand_norm(0, 1)),
    cov1=hl.rand_norm(0, 1),
    cov2=hl.rand_norm(0, 1),
    cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+

    Warning
    -------
    :func:`.ld_score` will fail if ``entry_expr`` results in any missing
    values. The special float value ``nan`` is not considered a missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----
    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same MatrixTable.

    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr: :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus position.
    annotation_exprs : :class:`.NumericExpression` or :obj:`list` of
                       :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation.
    """

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr, radius, coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x), value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row
                     if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
def test_linear_mixed_model_fastlmm(self):
    # FastLMM test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
    #
    # Data is filtered to chromosomes 1 and 3 and samples 0-124 and 375-499
    # (2000 variants and 250 samples).
    #
    # Results are computed with single_snp (with LOCO) as in:
    #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

    n, m = 250, 1000  # per chromosome

    x_table = hl.import_table(resource('fastlmmCov.txt'),
                              no_header=True, impute=True).key_by('f1')
    y_table = hl.import_table(resource('fastlmmPheno.txt'),
                              no_header=True, impute=True, delimiter=' ').key_by('f1')

    mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                         bim=resource('fastlmmTest.bim'),
                         fam=resource('fastlmmTest.fam'),
                         reference_genome=None)
    mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
    mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

    x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
    y = np.array(mt.key_cols_by()['y'].collect())

    mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
    mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

    # testing chromosome 1 for h2, betas, and p-values
    h2_fastlmm = 0.14276125
    beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

    # FastLMM p-values do not agree to high precision because FastLMM regresses
    # out x from each SNP first and then does an F(1, dof)-test on (beta / se)^2
    # (a t-test), whereas Hail does a likelihood ratio test.
    # We verify below that Hail's p-values remain fixed going forward.
    # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
    pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

    gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

    g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.13770773  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    mt_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
    a = BlockMatrix.from_entry_expr(mt_chr3_5var.GT.n_alt_alleles()).to_numpy().T

    # FastLMM standardizes each variant to have mean 0 and variance 1.
    a = self._filter_and_standardize_cols(a) * np.sqrt(n)
    pa = p @ a

    model.fit(log_gamma=np.log(gamma_fastlmm))

    res = model.fit_alternatives_numpy(pa, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # low rank
    ld = g_std.T @ g_std
    sl, v = np.linalg.eigh(ld)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    model.fit(log_gamma=np.log(gamma_fastlmm))

    pa = p @ a
    res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    a_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

    pa_t_path = utils.new_temp_file(suffix='bm')
    BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

    res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

    assert np.allclose(res['beta'], beta_fastlmm)
    assert np.allclose(res['p_value'], pval_hail)

    # testing chromosome 3 for h2
    h2_fastlmm = 0.36733240

    g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
    g_std = self._filter_and_standardize_cols(g)

    # full rank
    k = (g_std @ g_std.T) * (n / m)
    s, u = np.linalg.eigh(k)
    p = u.T
    model = LinearMixedModel(p @ y, p @ x, s)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)

    h2_std_error = 0.17409641  # hard coded having checked against plot
    assert np.isclose(model.h_sq_standard_error, h2_std_error)

    h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
    argmax = int(100 * h2_fastlmm)
    assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
    assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

    # low rank
    ld = g_std.T @ g_std
    sl, v = np.linalg.eigh(ld)
    n_eigenvectors = int(np.sum(sl > 1e-10))
    assert n_eigenvectors < n
    sl = sl[-n_eigenvectors:]
    v = v[:, -n_eigenvectors:]
    s = sl * (n / m)
    p = (g_std @ (v / np.sqrt(sl))).T
    model = LinearMixedModel(p @ y, p @ x, s, y, x)
    model.fit()

    assert np.isclose(model.h_sq, h2_fastlmm)
    assert np.isclose(model.h_sq_standard_error, h2_std_error)
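
# A standalone numpy sketch (not from the original test suite) of the low-rank
# identity the test above relies on: if ld = g_std.T @ g_std has eigenpairs
# (sl, v), then p = (g_std @ (v / sqrt(sl))).T has orthonormal rows and
# diagonalizes the kinship matrix k = g_std @ g_std.T with the same nonzero
# eigenvalues. The shapes and seed here are arbitrary.
def _check_low_rank_rotation():
    import numpy as np
    rng = np.random.default_rng(0)
    g_std = rng.standard_normal((25, 10))  # n=25 samples, m=10 variants

    ld = g_std.T @ g_std                   # m x m variant-side Gram matrix
    sl, v = np.linalg.eigh(ld)
    keep = sl > 1e-10                      # drop numerically-zero eigenvalues
    sl, v = sl[keep], v[:, keep]

    p = (g_std @ (v / np.sqrt(sl))).T      # rank x n rotation

    assert np.allclose(p @ p.T, np.eye(p.shape[0]))  # orthonormal rows
    k = g_std @ g_std.T                    # n x n sample-side kinship
    assert np.allclose(p @ k @ p.T, np.diag(sl))     # diagonalizes kinship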
def test_ld_score(self):
    ht = hl.import_table(doctest_resource('ldsc.annot'),
                         types={'BP': hl.tint,
                                'CM': hl.tfloat,
                                'binary': hl.tint,
                                'continuous': hl.tfloat})
    ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    ht = ht.key_by('locus')

    mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                         bim=doctest_resource('ldsc.bim'),
                         fam=doctest_resource('ldsc.fam'))
    mt = mt.annotate_rows(binary=ht[mt.locus].binary,
                          continuous=ht[mt.locus].continuous)

    ht_univariate = hl.experimental.ld_score(
        entry_expr=mt.GT.n_alt_alleles(),
        locus_expr=mt.locus,
        radius=1.0,
        coord_expr=mt.cm_position)

    ht_annotated = hl.experimental.ld_score(
        entry_expr=mt.GT.n_alt_alleles(),
        locus_expr=mt.locus,
        radius=1.0,
        coord_expr=mt.cm_position,
        annotation_exprs=[mt.binary, mt.continuous])

    univariate = ht_univariate.aggregate(hl.struct(
        chr20=hl.agg.filter(
            (ht_univariate.locus.contig == '20') &
            (ht_univariate.locus.position == 82079),
            hl.agg.collect(ht_univariate.univariate))[0],
        chr22=hl.agg.filter(
            (ht_univariate.locus.contig == '22') &
            (ht_univariate.locus.position == 16894090),
            hl.agg.collect(ht_univariate.univariate))[0],
        mean=hl.agg.mean(ht_univariate.univariate)))

    self.assertAlmostEqual(univariate.chr20, 1.601, places=3)
    self.assertAlmostEqual(univariate.chr22, 1.140, places=3)
    self.assertAlmostEqual(univariate.mean, 3.507, places=3)

    annotated = ht_annotated.aggregate(hl.struct(
        chr20=hl.struct(
            binary=hl.agg.filter(
                (ht_annotated.locus.contig == '20') &
                (ht_annotated.locus.position == 82079),
                hl.agg.collect(ht_annotated.binary))[0],
            continuous=hl.agg.filter(
                (ht_annotated.locus.contig == '20') &
                (ht_annotated.locus.position == 82079),
                hl.agg.collect(ht_annotated.continuous))[0]),
        chr22=hl.struct(
            binary=hl.agg.filter(
                (ht_annotated.locus.contig == '22') &
                (ht_annotated.locus.position == 16894090),
                hl.agg.collect(ht_annotated.binary))[0],
            continuous=hl.agg.filter(
                (ht_annotated.locus.contig == '22') &
                (ht_annotated.locus.position == 16894090),
                hl.agg.collect(ht_annotated.continuous))[0]),
        mean_stats=hl.struct(
            binary=hl.agg.mean(ht_annotated.binary),
            continuous=hl.agg.mean(ht_annotated.continuous))))

    self.assertAlmostEqual(annotated.chr20.binary, 1.152, places=3)
    self.assertAlmostEqual(annotated.chr20.continuous, 73.014, places=3)
    self.assertAlmostEqual(annotated.chr22.binary, 1.107, places=3)
    self.assertAlmostEqual(annotated.chr22.continuous, 102.174, places=3)
    self.assertAlmostEqual(annotated.mean_stats.binary, 0.965, places=3)
    self.assertAlmostEqual(annotated.mean_stats.continuous, 176.528, places=3)
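
# A hedged alternative (not from the original test) to the nested hl.agg.filter
# aggregations above: to read off a single locus's LD score, one can filter the
# locus-keyed table and collect the single row. `ht_univariate` is assumed to
# be the output of ld_score as in the test; the locus is the GRCh37 position
# checked above.
def _example_score_at_locus(ht_univariate):
    row = ht_univariate.filter(
        ht_univariate.locus == hl.locus('20', 82079)).collect()[0]
    return row.univariate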
def test_read_back_same_as_exported(self):
    t, _ = create_all_values_datasets()
    tmp_file = new_temp_file(prefix="test", suffix=".tsv")
    t.export(tmp_file)
    t_read_back = hl.import_table(tmp_file, types=dict(t.row.dtype)).key_by('idx')
    self.assertTrue(t.select_globals()._same(t_read_back, tolerance=1e-4, absolute=True))
def get_movie_lens(output_dir, overwrite: bool = False):
    """Download public MovieLens dataset.

    Notes
    -----
    The download is about 6 MB.

    See the `MovieLens website <https://grouplens.org/datasets/movielens/100k/>`__
    for more information about this dataset.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite existing files/directories at those locations.
    """
    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    paths = [os.path.join(output_dir, x) for x in ['movies.ht', 'ratings.ht', 'users.ht']]
    if overwrite or any(not Env.jutils().dirExists(jhc, f) for f in paths):
        init_temp_dir()
        source = resources['movie_lens_100k']
        tmp_path = os.path.join(tmp_dir, 'ml-100k.zip')
        info(f'downloading MovieLens-100k data ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_path)

        with zipfile.ZipFile(tmp_path, 'r') as z:
            z.extractall(tmp_dir)

        user_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.user')
        movie_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.item')
        ratings_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.data')

        assert os.path.exists(user_table_path)
        assert os.path.exists(movie_table_path)
        assert os.path.exists(ratings_table_path)

        user_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(user_table_path), 'txt')
        movie_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(movie_table_path), 'txt')
        ratings_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(ratings_table_path), 'txt')

        [movies_path, ratings_path, users_path] = paths

        genres = ['Action', 'Adventure', 'Animation', "Children's", 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical',
                  'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

        # utility functions for importing movies

        def field_to_array(ds, field):
            # wrap a nonzero genre indicator as a one-element array of its name
            return hl.cond(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))

        def fields_to_array(ds, fields):
            return hl.flatten(hl.array([field_to_array(ds, f) for f in fields]))

        def rename_columns(ht, new_names):
            return ht.rename({k: v for k, v in zip(ht.row, new_names)})

        info(f'importing users table and writing to {users_path} ...')
        users = rename_columns(
            hl.import_table(user_cluster_readable,
                            key=['f0'], no_header=True, impute=True, delimiter='|'),
            ['id', 'age', 'sex', 'occupation', 'zipcode'])
        users.write(users_path, overwrite=True)

        info(f'importing movies table and writing to {movies_path} ...')
        movies = hl.import_table(movie_cluster_readable,
                                 key=['f0'], no_header=True, impute=True, delimiter='|')
        movies = rename_columns(movies,
                                ['id', 'title', 'release date', 'video release date',
                                 'IMDb URL', 'unknown'] + genres)
        movies = movies.drop('release date', 'video release date', 'unknown', 'IMDb URL')
        movies = movies.transmute(genres=fields_to_array(movies, genres))
        movies.write(movies_path, overwrite=True)

        info(f'importing ratings table and writing to {ratings_path} ...')
        ratings = hl.import_table(ratings_cluster_readable, no_header=True, impute=True)
        ratings = rename_columns(ratings, ['user_id', 'movie_id', 'rating', 'timestamp'])
        ratings = ratings.drop('timestamp')
        ratings.write(ratings_path, overwrite=True)
    else:
        info('MovieLens files found!')
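
# A minimal usage sketch (not part of the original module). The output
# directory 'data/movie_lens' is illustrative; the table names match the
# `paths` written above.
#
#     get_movie_lens('data/movie_lens')
#     users = hl.read_table('data/movie_lens/users.ht')
#     movies = hl.read_table('data/movie_lens/movies.ht')
#     ratings = hl.read_table('data/movie_lens/ratings.ht')
#     ratings.aggregate(hl.agg.mean(ratings.rating))  # e.g., overall mean rating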