# done the above on pca_RF jupyter notebook
#
# NOTE(review): the statements below were leftover top-level code pasted from
# the notebook. Several of them were LIVE (not commented out) yet referenced
# names that are undefined at module scope (mt, tmp_dir, temp_dir,
# bed_to_exclude_pca), so importing this module raised NameError before main()
# could ever run. Every step here is duplicated inside main() below, so the
# live statements have been commented out and kept only for reference.
#
# mt = hl.read_matrix_table(
#     f"{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-20-XY_new_cohorts.mt")
# mt = hl.split_multi_hts(mt, keep_star=False, left_aligned=False)
# mt.write(
#     f"{tmp_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
#     overwrite=True)
# logger.info("wrote mt ")
# keep biallelic SNPs only:
# mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
# mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
# mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
# mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
# mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
# (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
# mt_vqc_filtered = mt_vqc.filter_rows(
#     (mt_vqc.variant_QC_Hail.call_rate >= 0.99) &
#     (mt_vqc.variant_QC_Hail.AF[1] >= 0.05) &
#     (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
# mt_vqc_filtered = mt_vqc_filtered.filter_rows(
#     hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)
# overlap AKT dataset:
# mt_1kg_chr1_chr20 = hl.read_matrix_table(
#     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
# mt_vqc_filtered1 = mt_vqc_filtered.key_rows_by("locus")
def main(args):
    """Run the ancestry-PCA pipeline on a Hail MatrixTable.

    Steps:
      1. Annotate samples with cohort/population metadata from a TSV keyed
         by sample id ``s`` and write the annotated MatrixTable.
      2. Restrict to biallelic SNPs and apply variant QC
         (call rate >= 0.99, 0.05 <= AF <= 0.95).
      3. Drop PCA-exclusion regions (BED) and keep only sites overlapping
         the 1000G/AKT reference set.
      4. LD-prune (r2=0.2, 500 kb window), autosomes only, and write the
         pruned MatrixTable.
      5. Run ``hwe_normalized_pca`` (k=10) and write scores, loadings and
         eigenvalues.
      6. Assign populations from PCs with a random forest and write/export
         the assignments.

    Parameters
    ----------
    args : argparse.Namespace
        Must provide ``matrixtable`` (input MatrixTable path) and
        ``output_dir`` (output root directory).

    Side effects: writes several MatrixTables/Tables and text files under
    ``{args.output_dir}/ddd-elgh-ukbb/``.
    """
    bed_to_exclude_pca = hl.import_bed(
        locations_exclude_from_pca, reference_genome='GRCh38')
    cohorts_pop = hl.import_table(
        cohorts_populations, delimiter="\t").key_by('s')
    # 1000G/AKT overlap sites, used below to intersect with our variants.
    # NOTE(review): import_matrix_table reads a *text* matrix — confirm
    # AKT_overlap is text and not a written MatrixTable (read_matrix_table).
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)

    # Annotate samples with cohort and population labels from the s3 table.
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    logger.info("wrote mt ")

    # Keep biallelic SNPs only (exclude MNPs, indels and complex alleles).
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))

    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # HWE p-value filter (p >= 1e-6) deliberately NOT applied, per hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99) &
        (mt_vqc.variant_QC_Hail.AF[1] >= 0.05) &
        (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    # Remove loci in regions known to distort PCA.
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)

    # Keep only sites present in the 1000G/AKT overlap set.
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")

    # LD pruning (r2=0.2 within 500 kb windows), then autosomes only.
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())
    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)

    logger.info("run_pca_with_relateds")
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    # Carry the known-population label onto the scores table for the RF below.
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    # BUG FIX: eigenvalues were previously written with no separator
    # (f.write(str(val))), concatenating them into one token. Write one
    # eigenvalue per line instead.
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt",
              'w') as f:
        for val in pca_evals:
            f.write(f"{val}\n")

    logger.info("assign population pcs")
    pop_ht, pop_clf = assign_population_pcs(
        pca_scores, pca_scores.scores, known_col="known_pop",
        n_estimators=100, prop_train=0.8, min_prob=0.5)
    pop_ht.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
        overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")