def _create(self, resource_dir):
    tsv = 'random_doubles_mt.tsv.bgz'
    logging.info(f"downloading {tsv}")
    download(resource_dir, tsv)
    local_tsv = os.path.join(resource_dir, tsv)
    hl.import_matrix_table(local_tsv,
                           row_key="row_idx",
                           row_fields={"row_idx": hl.tint32},
                           entry_type=hl.tfloat64) \
        .write(os.path.join(resource_dir, "random_doubles_mt.mt"))
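# A brief usage note: once _create has run, the written MatrixTable can be read
# back directly with read_matrix_table (the resource_dir value below is a
# hypothetical local path).
mt = hl.read_matrix_table(os.path.join('/tmp/resources', 'random_doubles_mt.mt'))
mt.describe()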
def test_import_matrix_table(self):
    mt = hl.import_matrix_table(doctest_resource('matrix1.tsv'),
                                row_fields={'Barcode': hl.tstr,
                                            'Tissue': hl.tstr,
                                            'Days': hl.tfloat32})
    self.assertEqual(mt['Barcode']._indices, mt._row_indices)
    self.assertEqual(mt['Tissue']._indices, mt._row_indices)
    self.assertEqual(mt['Days']._indices, mt._row_indices)
    self.assertEqual(mt['col_id']._indices, mt._col_indices)
    self.assertEqual(mt['row_id']._indices, mt._row_indices)
    mt.count()

    row_fields = {'f0': hl.tstr, 'f1': hl.tstr, 'f2': hl.tfloat32}
    hl.import_matrix_table(doctest_resource('matrix2.tsv'),
                           row_fields=row_fields, row_key=[]).count()
    hl.import_matrix_table(doctest_resource('matrix3.tsv'),
                           row_fields=row_fields, no_header=True).count()
    hl.import_matrix_table(doctest_resource('matrix3.tsv'),
                           row_fields=row_fields, no_header=True,
                           row_key=[]).count()
    self.assertRaises(hl.utils.FatalError,
                      hl.import_matrix_table,
                      doctest_resource('matrix3.tsv'),
                      row_fields=row_fields,
                      no_header=True,
                      row_key=['foo'])
def populate_gtex():
    meta_ht = hl.import_table(
        '/home/ml2529/gtex_data/GTEx_v7_Annotations_SampleAttributesDS.txt',
        delimiter='\t', key='SAMPID')

    mt = hl.import_matrix_table(
        '/home/ml2529/gtex_data/ENSG00000177732.tsv',
        row_key='transcript_id',
        row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
        entry_type=hl.tfloat32)
    # mt = hl.import_matrix_table('/home/ml2529/gtex_data/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_tpm.txt.bgz',
    #                             row_key='transcript_id',
    #                             row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
    #                             entry_type=hl.tfloat32)

    mt = mt.rename({'transcript_id': 'transcriptId', 'gene_id': 'geneId'})
    mt = mt.annotate_cols(tissue=meta_ht[mt.col_id].SMTSD)

    cut_dict = {
        'tissue': hl.agg.filter(hl.is_defined(mt.tissue),
                                hl.agg.counter(mt.tissue))
    }
    cut_data = mt.aggregate_cols(hl.struct(**cut_dict))

    # Add one aggregated row field per tissue, holding mean expression.
    for x in sorted(cut_data['tissue'].keys()):
        call_stats = hl.agg.filter(mt.tissue == x, hl.agg.mean(mt.x))
        mt = mt.transmute_rows(**{f"{tissue_abbr[x]}": call_stats})

    ht = mt.rows()
    # ht.write('gtex_expression.ht', overwrite=True)
    export_ht_to_es(ht, index_name='gtex_tissue_tpms_by_transcript',
                    index_type='tissue_tpms')
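# populate_gtex assumes two helpers defined elsewhere: tissue_abbr (a dict mapping
# GTEx tissue names to short field names) and export_ht_to_es. A minimal sketch of
# the latter, built on Hail's export_elasticsearch; the host, port, and block size
# are hypothetical values for a local Elasticsearch node.
def export_ht_to_es(ht, index_name, index_type, host='localhost', port=9200):
    # block_size of 512 is an arbitrary choice for this sketch
    hl.export_elasticsearch(ht, host, port, index_name, index_type, 512)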
def prepare_gtex_expression_data(transcript_tpms_path, sample_annotations_path, tmp_path):
    # Recompress the TPMs file with block gzip so that import_matrix_table can read it
    ds = hl.import_table(transcript_tpms_path, force=True)
    tmp_transcript_tpms_path = tmp_path + "/" + transcript_tpms_path.split("/")[-1].replace(".gz", ".bgz")
    ds.export(tmp_transcript_tpms_path)

    # Import data
    ds = hl.import_matrix_table(
        tmp_transcript_tpms_path,
        row_fields={"transcript_id": hl.tstr, "gene_id": hl.tstr},
        entry_type=hl.tfloat,
    )
    ds = ds.rename({"col_id": "sample_id"})
    ds = ds.repartition(1000, shuffle=True)

    samples = hl.import_table(sample_annotations_path, key="SAMPID")

    # Separate version numbers from transcript and gene IDs
    ds = ds.annotate_rows(
        transcript_id=ds.transcript_id.split(r"\.")[0],
        transcript_version=hl.int(ds.transcript_id.split(r"\.")[1]),
        gene_id=ds.gene_id.split(r"\.")[0],
        gene_version=hl.int(ds.gene_id.split(r"\.")[1]),
    )

    # Annotate columns with the tissue the sample came from
    ds = ds.annotate_cols(tissue=samples[ds.sample_id].SMTSD)

    # Collapse expression to the median across all samples in each tissue
    ds = ds.group_cols_by(ds.tissue).aggregate(**{
        "": hl.agg.approx_median(ds.x)
    }).make_table()

    # Format tissue names
    other_fields = {"transcript_id", "transcript_version", "gene_id", "gene_version"}
    tissues = [f for f in ds.row_value.dtype.fields if f not in other_fields]
    ds = ds.transmute(tissues=hl.struct(
        **{format_tissue_name(tissue): ds[tissue] for tissue in tissues}))

    ds = ds.key_by("transcript_id").drop("row_id")

    return ds
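# A minimal usage sketch of prepare_gtex_expression_data. The GCS paths below are
# hypothetical placeholders, and format_tissue_name must be defined in scope.
ds = prepare_gtex_expression_data(
    "gs://my-bucket/GTEx_transcript_tpms.txt.gz",    # hypothetical input path
    "gs://my-bucket/GTEx_sample_annotations.txt",    # hypothetical annotations path
    "gs://my-bucket/tmp",
)
ds.write("gs://my-bucket/gtex_expression.ht", overwrite=True)  # hypothetical output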
def get_gtex_summary(gtex_rsem_path, gtex_tx_summary_out_path, get_medians=True):
    """
    Get GTEx RSEM table with ENSTs and ENSGs as rows and GTEx samples as columns
    (e.g. Muscle-Skeletal.12, Adipose.27, etc.) and write out a table with the
    same rows and tissues as columns (Muscle-Skeletal, Adipose, etc.), with cells
    representing summary expression of transcripts across tissues (i.e. mean or
    median).

    :param str gtex_rsem_path: Output of RSEM quantifications from GTEx.
        Example: "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz"
    :param str gtex_tx_summary_out_path: Path to write out.
        Example: "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt"
    :param bool get_medians: Default True. If False, returns mean transcript expression per tissue.
    :return: Writes out summarized GTEx transcript expression as a Table.
    :rtype: None
    """
    gtex = hl.import_matrix_table(gtex_rsem_path,
                                  row_key='transcript_id',
                                  row_fields={'transcript_id': hl.tstr,
                                              'gene_id': hl.tstr},
                                  entry_type=hl.tfloat64)

    gtex = gtex.annotate_cols(tissue=gtex.col_id.split("\\.")[0])

    if get_medians:
        gtex = gtex.group_cols_by(gtex.tissue).aggregate(
            median_tx_expr=hl.median(hl.agg.collect(gtex.x)))
    else:
        gtex = gtex.group_cols_by(gtex.tissue).aggregate(
            mean_tx_expr=hl.mean(hl.agg.collect(gtex.x)))

    # Make a new row field holding the per-tissue summary values as an array
    # (per transcript); pick the field matching the branch taken above
    expr_field = 'median_tx_expr' if get_medians else 'mean_tx_expr'
    gtex = gtex.annotate_rows(agg_expression=hl.agg.collect(gtex[expr_field]))

    # Remove version numbers from transcript and gene IDs
    gtex = gtex.annotate_rows(transcript_id=gtex.transcript_id.split("\\.")[0])
    gtex = gtex.annotate_rows(gene_id=gtex.gene_id.split("\\.")[0])

    gtex.write(gtex_tx_summary_out_path, overwrite=True)
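# Usage sketch for get_gtex_summary; the bucket paths are the examples given in
# the docstring above.
get_gtex_summary(
    "gs://gnomad-berylc/reheadered.GTEx_Analysis_2016-09-07_RSEMv1.2.22_transcript_tpm.txt.bgz",
    "gs://gnomad-berylc/tx-annotation/hail2/GTEx.V7.tx_medians.030818.mt",
    get_medians=True,
)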
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl
    doctest_namespace['np'] = np

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(global_field_1=5,
                             global_field_2=10,
                             pli={'SCN1A': 0.999, 'SONIC': 0.014},
                             populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt', overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={'Age': hl.tint32,
                                          'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._nd.array([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds', overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={'locus': hl.tlocus('GRCh37'),
               'alleles': hl.tarray(hl.tstr),
               'chi_squared': hl.tfloat64,
               'n': hl.tint32,
               'ld_score': hl.tfloat64,
               'phenotype': hl.tstr,
               'chi_squared_50_irnt': hl.tfloat64,
               'n_50_irnt': hl.tint32,
               'chi_squared_20160': hl.tfloat64,
               'n_20160': hl.tint32},
        key=['locus', 'alleles'])
    doctest_namespace['ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={'locus': hl.tstr,
                    'alleles': hl.tstr,
                    'ld_score': hl.tfloat64},
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
import hail as hl

root = 'gs://hail-datasets-raw-data/LDSC/baselineLD_v2.2'

mt = hl.import_matrix_table(f'{root}/ld_scores.GRCh37.tsv.bgz',
                            row_fields={'CHR': hl.tstr,
                                        'SNP': hl.tstr,
                                        'BP': hl.tint},
                            entry_type=hl.tstr)
mt = mt.annotate_entries(x=hl.float(mt['x']))
mt = mt.annotate_rows(locus=hl.locus(mt['CHR'], mt['BP'], 'GRCh37'))
mt = mt.key_rows_by('locus')
mt = mt.select_rows('SNP')

M = hl.import_table(f'{root}/M.GRCh37.tsv.bgz', key='annotation')
M_5_50 = hl.import_table(f'{root}/M_5_50.GRCh37.tsv.bgz', key='annotation')

mt = mt.rename({'col_id': 'annotation'})
mt = mt.annotate_cols(M_5_50=hl.int(hl.float(M_5_50[mt.annotation].M_5_50)),
                      M=hl.int(hl.float(M[mt.annotation].M)))

n_rows, n_cols = mt.count()
n_partitions = mt.n_partitions()

mt = mt.annotate_globals(
    metadata=hl.struct(
        name='LDSC_baselineLD_v2.2_ld_scores',
        reference_genome='GRCh37',
        # the snippet is truncated in the source; the fields below are inferred
        # from the counts computed just above
        n_rows=n_rows,
        n_cols=n_cols,
        n_partitions=n_partitions))
def main(args):
    input_tsv = args.input_tsv
    output_ht = args.output_ht
    chunk_size = args.chunk_size
    overwrite = args.overwrite

    mt_list = []
    logger.info(
        "Reading in individual coverage files as matrix tables and adding to a list of matrix tables..."
    )
    with open(input_tsv, "r") as f:
        for line in f:
            line = line.rstrip()
            items = line.split("\t")
            sample, base_level_coverage_metrics = items[0:2]
            mt = hl.import_matrix_table(
                base_level_coverage_metrics,
                delimiter="\t",
                row_fields={"chrom": hl.tstr, "pos": hl.tint, "target": hl.tstr},
                row_key=["chrom", "pos"],
            ).drop("target")
            mt = mt.rename({"x": "coverage"})
            mt = mt.key_cols_by(s=sample)
            mt_list.append(mt)

    logger.info("Joining individual coverage mts...")
    out_dir = dirname(output_ht)
    temp_out_dir = out_dir + "/temp"

    cov_mt = multi_way_union_mts(mt_list, temp_out_dir, chunk_size)
    n_samples = cov_mt.count_cols()

    logger.info("Adding coverage annotations...")
    cov_mt = cov_mt.annotate_rows(
        locus=hl.locus(cov_mt.chrom, cov_mt.pos, reference_genome="GRCh38"),
        mean=hl.float(hl.agg.mean(cov_mt.coverage)),
        median=hl.median(hl.agg.collect(cov_mt.coverage)),
        over_100=hl.float(hl.agg.count_where(cov_mt.coverage > 100) / n_samples),
        over_1000=hl.float(hl.agg.count_where(cov_mt.coverage > 1000) / n_samples),
    )
    cov_mt.show()

    cov_mt = cov_mt.key_rows_by("locus").drop("chrom", "pos")

    output_mt = re.sub(r"\.ht$", ".mt", output_ht)
    output_tsv = re.sub(r"\.ht$", ".tsv", output_ht)
    output_samples = re.sub(r"\.ht$", "_sample_level.txt", output_ht)

    logger.info("Writing sample level coverage...")
    sample_mt = cov_mt.key_rows_by(pos=cov_mt.locus.position)
    sample_mt.coverage.export(output_samples)

    logger.info("Writing coverage mt and ht...")
    cov_mt.write(output_mt, overwrite=overwrite)
    cov_ht = cov_mt.rows()
    cov_ht = cov_ht.checkpoint(output_ht, overwrite=overwrite)
    cov_ht.export(output_tsv)
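# The script above assumes a multi_way_union_mts helper. A minimal sketch of one
# possible implementation, which unions column-keyed MTs in chunks and checkpoints
# each intermediate result to keep the query plan small; the checkpoint paths under
# temp_out_dir are hypothetical.
def multi_way_union_mts(mts, temp_out_dir, chunk_size):
    level = 0
    while len(mts) > 1:
        merged = []
        for i in range(0, len(mts), chunk_size):
            chunk = mts[i:i + chunk_size]
            mt = chunk[0]
            for other in chunk[1:]:
                # assumes identical row keys and distinct column keys across inputs
                mt = mt.union_cols(other)
            merged.append(mt.checkpoint(
                f"{temp_out_dir}/level_{level}_chunk_{i}.mt", overwrite=True))
        mts = merged
        level += 1
    return mts[0]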
# # Load in the dosage files from Tractor

# ## First implementing the GWAS version that includes the full VCF but with dosage calls per ancestry

# In[6]:

row_fields = {
    'CHROM': hl.tstr,
    'POS': hl.tint,
    'ID': hl.tstr,
    'REF': hl.tstr,
    'ALT': hl.tstr
}
anc0dos = hl.import_matrix_table(
    'gs://ukb-diverse-pops/AdmixedAfrEur/DosageFiles/UKBB_AfEur_QCed_lipids.autosomes.anc0.dosage_v1.txt.gz',
    force_bgz=True,
    row_fields=row_fields,
    row_key=[],
    min_partitions=32)
anc0dos = anc0dos.key_rows_by().drop('row_id')
anc0dos = anc0dos.key_rows_by(locus=hl.locus(anc0dos.CHROM, anc0dos.POS))

# In[7]:

row_fields = {
    'CHROM': hl.tstr,
    'POS': hl.tint,
    'ID': hl.tstr,
    'REF': hl.tstr,
    'ALT': hl.tstr
}
anc1dos = hl.import_matrix_table(
ht_genes = import_gtf(path=EXTRACT_BUCKET + 'GTEx/v7/GTEx_genes.v7.GRCh37.gtf.bgz',
                      reference_genome='GRCh37')
ht_genes = ht_genes.filter(ht_genes['feature'] == 'gene')
ht_genes = ht_genes.key_by(ht_genes['gene_id'])
ht_genes = ht_genes.select('interval', 'strand', 'gene_name', 'havana_gene',
                           'gene_type', 'gene_status', 'level', 'tag')
ht_genes = ht_genes.rename({'interval': 'gene_interval'})
ht_genes = ht_genes.distinct()

mt_counts = hl.import_matrix_table(
    EXTRACT_BUCKET + 'GTEx/v7/GTEx_gene_read_counts.v7.GRCh37.tsv.bgz',
    row_fields={'Name': hl.tstr, 'Description': hl.tstr},
    row_key='Name',
    missing=' ',
    entry_type=hl.tfloat)
mt_counts = mt_counts.drop('Description')
mt_counts = mt_counts.transmute_entries(read_count=hl.int(mt_counts['x']))
mt_counts = mt_counts.rename({'col_id': 'sample_id', 'Name': 'gene_id'})

mt_tpm = hl.import_matrix_table(
    EXTRACT_BUCKET + 'GTEx/v7/GTEx_gene_tpm.v7.GRCh37.tsv.bgz',
    row_fields={'Name': hl.tstr, 'Description': hl.tstr},
    row_key='Name',
# s3 credentials required for the user to access the datasets in the farm flexible compute s3 environment;
# you may use your own here from the .s3cfg file in your home directory
hadoop_config = sc._jsc.hadoopConfiguration()
hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

bed_to_exclude_pca = hl.import_bed(f"{temp_dir}/1000g/price_high_ld.bed.txt",
                                   reference_genome='GRCh38')
cohorts_pop = hl.import_table(
    "s3a://DDD-ELGH-UKBB-exomes/ancestry/sanger_cohort_known_populations_ukbb_elgh_labels_updated.tsv",
    delimiter="\t").key_by('s')

# s3a://DDD-ELGH-UKBB-exomes/ancestry/WES_AKT_1kg_intersection.vcf.mt
# overlap AKT dataset (a Hail-native MatrixTable, so read_matrix_table rather than import_matrix_table)
overlap_1kg_AKT = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/WES_AKT_1kg_intersection.mt")

# drop cohorts
# annotate with cohorts and populations from s3 table.
# save matrixtable
mt = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
mt.write(
    f"{temp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
    overwrite=True)
# filter matrixtable
def main(args):
    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations,
                                  delimiter="\t").key_by('s')

    # overlap AKT dataset (assumed to be a Hail-native MatrixTable, as in the block above)
    overlap_1kg_AKT = hl.read_matrix_table(AKT_overlap)

    # annotate with cohorts and populations from the s3 table, then save the matrixtable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    logger.info("wrote mt")

    # filter mt: keep biallelic SNPs only
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))

    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) is not to be used, according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99) &
        (mt_vqc.variant_QC_Hail.AF[1] >= 0.05) &
        (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)

    # keep only variants that overlap the AKT dataset
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering, writing mt")

    # ld pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    # autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())
    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #     pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt", 'w') as f:
        for val in pca_evals:
            f.write(str(val) + '\n')

    logger.info("assign population pcs")
    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")
    'f3': 'start', 'f4': 'end', 'f5': 'score', 'f6': 'strand',
    'f7': 'phase', 'f8': 'attributes'})

ht_transcripts = ht_transcripts.filter(ht_transcripts.feature_type == 'transcript')
ht_transcripts = ht_transcripts.annotate(
    interval=hl.interval(hl.locus(ht_transcripts.contig, ht_transcripts.start, 'GRCh37'),
                         hl.locus(ht_transcripts.contig, ht_transcripts.end + 1, 'GRCh37')))
ht_transcripts = ht_transcripts.annotate(
    attributes=hl.dict(hl.map(lambda x: (x.split(' ')[0],
                                         x.split(' ')[1].replace('"', '').replace(';$', '')),
                              ht_transcripts.attributes.split('; '))))
attribute_cols = list(ht_transcripts.aggregate(
    hl.set(hl.flatten(hl.agg.collect(ht_transcripts.attributes.keys())))))
ht_transcripts = ht_transcripts.annotate(
    **{x: hl.or_missing(ht_transcripts.attributes.contains(x),
                        ht_transcripts.attributes[x])
       for x in attribute_cols})
ht_transcripts = ht_transcripts.select(
    'transcript_id', 'transcript_name', 'transcript_type', 'strand',
    'transcript_status', 'havana_transcript', 'ccdsid', 'ont', 'gene_name',
    'interval', 'gene_type', 'annotation_source', 'havana_gene',
    'gene_status', 'tag')
ht_transcripts = ht_transcripts.rename({'havana_transcript': 'havana_transcript_id',
                                        'havana_gene': 'havana_gene_id'})
ht_transcripts = ht_transcripts.key_by(ht_transcripts.transcript_id)

mt = hl.import_matrix_table(
    'gs://hail-datasets/raw-data/gtex/v7/rna-seq/processed/GTEx_Analysis_2016-01-15_v7_RSEMv1.2.22_transcript_expected_count.tsv.bgz',
    row_fields={'transcript_id': hl.tstr, 'gene_id': hl.tstr},
    row_key='transcript_id',
    missing='',
    entry_type=hl.tfloat)
mt = mt.annotate_cols(sample_id=mt.col_id)
mt = mt.key_cols_by(mt.sample_id)
mt = mt.annotate_entries(read_count=hl.int(mt.x))
mt = mt.drop(mt.col_id, mt.x)
mt = mt.annotate_cols(**ht_samples[mt.sample_id])
mt = mt.annotate_rows(**ht_transcripts[mt.transcript_id])
mt.describe()
mt.write('gs://hail-datasets/hail-data/gtex_v7_transcript_read_counts.GRCh37.mt',
         overwrite=True)
    'havana_gene', 'gene_type', 'gene_status', 'level', 'score',
    'strand', 'frame', 'tag')
ht_genes = ht_genes.rename({
    'gene_name': 'gene_symbol',
    'havana_gene': 'havana_gene_id'
})
ht_genes.write('hdfs:///tmp/genes.ht', overwrite=True)
ht_genes = hl.read_table('hdfs:///tmp/genes.ht')

# gene read counts
name = 'GTEx_RNA_seq_gene_read_counts'
mt = hl.import_matrix_table(
    f'{raw_data_root}/GTEx_v7_RNA_seq_gene_read_counts.tsv.bgz',
    row_fields={'Name': hl.tstr, 'Description': hl.tstr},
    row_key='Name',
    entry_type=hl.tstr,
    missing=' ')
mt = mt.select_entries(read_count=hl.int(hl.float(mt.x)))
mt = mt.rename({'Name': 'gene_id', 'Description': 'gene_symbol', 'col_id': 's'})
mt = mt.annotate_cols(subject_id=hl.delimit(mt['s'].split('-')[:2], '-'))
mt = mt.annotate_cols(**ht_samples[mt.s])
mt = mt.annotate_cols(**ht_subjects[mt.subject_id])
mt = mt.annotate_rows(**ht_genes[mt.gene_id])
hl.init()

# load in plotting features
from hail.plot import show
from pprint import pprint
hl.plot.output_notebook()

# # Load in the dosage files from Tractor

# ### Note: this will be the most time intensive step. The Hail team is actively optimizing pieces of this infrastructure.
# ## The user should modify the paths in the import steps to match the location of their datasets (shown here for files on google cloud).

# start loading in the ancestry 0 minor allele dosages
row_fields = {'CHROM': hl.tstr, 'POS': hl.tint, 'ID': hl.tstr,
              'REF': hl.tstr, 'ALT': hl.tstr}
anc0dos = hl.import_matrix_table('gs://.../Dataset.anc0.dosage.txt.gz',
                                 force_bgz=True,
                                 row_fields=row_fields,
                                 row_key=[],
                                 min_partitions=32)
anc0dos = anc0dos.key_rows_by().drop('row_id')
anc0dos = anc0dos.key_rows_by(locus=hl.locus(anc0dos.CHROM, anc0dos.POS))

# also load the ancestry 1 allele dosages
row_fields = {'CHROM': hl.tstr, 'POS': hl.tint, 'ID': hl.tstr,
              'REF': hl.tstr, 'ALT': hl.tstr}
anc1dos = hl.import_matrix_table('gs://.../Dataset.anc1.dosage.txt.gz',
                                 force_bgz=True,
                                 row_fields=row_fields,
                                 row_key=[],
                                 min_partitions=32)
anc1dos = anc1dos.key_rows_by().drop('row_id')
anc1dos = anc1dos.key_rows_by(locus=hl.locus(anc1dos.CHROM, anc1dos.POS))

# Optional: checkpoint these temporary files to relieve memory burden
anc0dos = anc0dos.checkpoint('gs://.../Dataset.anc0.dosage.mt')
anc1dos = anc1dos.checkpoint('gs://.../Dataset.anc1.dosage.mt')
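# A hedged sketch of one way to use the two dosage MTs downstream, assuming a
# phenotype table keyed by sample ID; the phenotype path and the 'pheno' column
# name are hypothetical. The ancestry-1 dosages are joined onto the ancestry-0
# matrix table as a second entry field, and the phenotype is then regressed on
# both dosages jointly with the hl.agg.linreg aggregator.
phenos = hl.import_table('gs://.../phenos.tsv', impute=True, key='s')  # hypothetical
mt = anc0dos.rename({'x': 'anc0_dos'})
mt = mt.annotate_entries(anc1_dos=anc1dos[mt.row_key, mt.col_key].x)
mt = mt.annotate_cols(pheno=phenos[mt.col_id].pheno)
# per-variant joint regression: intercept plus one dosage term per ancestry
mt = mt.annotate_rows(joint=hl.agg.linreg(mt.pheno,
                                          [1.0, mt.anc0_dos, mt.anc1_dos]))
mt.rows().select('joint').show()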