def ld_prune(input_mt: hl.MatrixTable, build: str, gnomad_ld: bool) -> hl.MatrixTable:
    """
    LD prune the MatrixTable.

    :param input_mt: MatrixTable
    :param build: Build for the input MatrixTable
    :param gnomad_ld: Whether to use LD data from the gnomAD dataset for the pruning step
    :return: LD-pruned MatrixTable
    """
    if not gnomad_ld:
        mm_pruned = hl.ld_prune(input_mt.GT, r2=0.1)
        input_mt = input_mt.filter_rows(hl.is_defined(mm_pruned[input_mt.row_key]))
    else:
        # Borrow from gnomAD LD pruning
        if build == "GRCh37":
            pruned_mt = hl.read_matrix_table(qc_mt_path("joint", ld_pruned=True))
        elif build == "GRCh38":
            pruned_mt = hl.read_matrix_table(qc.path)
        input_mt = input_mt.filter_rows(
            hl.is_defined(pruned_mt.index_rows(input_mt.row_key)))
    return input_mt

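# Minimal usage sketch for ld_prune above, assuming a biallelic, QC'd input MT;
# the bucket path is hypothetical.
import hail as hl

hl.init()
mt = hl.read_matrix_table("gs://my-bucket/my_dataset.mt")  # hypothetical path
# Prune within the dataset itself rather than borrowing gnomAD's pruned sites
pruned = ld_prune(mt, build="GRCh38", gnomad_ld=False)
print(pruned.count_rows())
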
def ld_prune_filter(intersect_out, prune_out, overwrite: bool = False):
    mt = hl.read_matrix_table(intersect_out)
    print(mt.count())
    mt = hl.variant_qc(mt)
    mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.001)
                             & (mt.variant_qc.AF[0] < 0.999))
    print(mt_filt.count())
    mt_intersect_prune = hl.ld_prune(mt_filt.GT, r2=0.8, bp_window_size=500000)
    mt_intersect_pruned = mt_filt.filter_rows(
        hl.is_defined(mt_intersect_prune[mt_filt.row_key]))
    mt_intersect_pruned.write(prune_out, overwrite)

def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):
    print("\nInitial number of SNPs before filtering: {}".format(in_mt.count_rows()))
    mt = hl.variant_qc(in_mt)

    print(f'\nFiltering out variants with MAF < {maf}')
    mt_filt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    mt_filt = mt_filt.filter_rows(mt_filt.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:.0e}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with call rate < {call_rate}')
    mt_filt = mt_filt.filter_rows(mt_filt.variant_qc.call_rate >= call_rate)

    # No strand ambiguity
    print('\nFiltering out strand-ambiguous variants')
    mt_filt = mt_filt.filter_rows(
        ~hl.is_strand_ambiguous(mt_filt.alleles[0], mt_filt.alleles[1]))

    # MHC chr6:25-35Mb
    # chr8 inversion chr8:7-13Mb
    print('\nFiltering out variants in the MHC [chr6:25M-35M] and the chromosome 8 inversion [chr8:7M-13M]')
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt_filt = hl.filter_intervals(
        mt_filt,
        [hl.parse_locus_interval(x, reference_genome='GRCh38') for x in intervals],
        keep=False)

    # This step is expensive (on a local machine)
    print(f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}')
    mt_ld_prune = hl.ld_prune(mt_filt.GT, r2=ld_cor, bp_window_size=ld_window)
    mt_ld_pruned = mt_filt.filter_rows(hl.is_defined(mt_ld_prune[mt_filt.row_key]))

    print("\nNumber of SNPs after filtering: {}".format(mt_ld_pruned.count_rows()))
    return mt_ld_pruned

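# Minimal usage sketch for pca_filter_mt, assuming a GRCh38 MatrixTable at a
# hypothetical path; the PCA call mirrors the hwe_normalized_pca pattern used
# elsewhere in this file.
mt = hl.read_matrix_table("gs://my-bucket/genotypes.mt")  # hypothetical path
mt_for_pca = pca_filter_mt(mt, maf=0.05, hwe=1e-3, call_rate=0.98)
eigenvalues, scores, _ = hl.hwe_normalized_pca(mt_for_pca.GT, k=10)
scores.show(5)
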
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # Filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()

    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)

    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # Choose variants based off of gnomAD v3 parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(hgdp1kg_tobwgs_joined.GT,
                             hgdp1kg_tobwgs_joined.variant_qc.AF[1]))
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25))

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        NUM_ROWS_BEFORE_LD_PRUNE / nrows, seed=12345)

    pruned_variant_table = hl.ld_prune(hgdp1kg_tobwgs_joined.GT,
                                       r2=0.1, bp_window_size=500000)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key]))

    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)

def query(output):
    """Query script entry point."""
    hl.init(default_reference='GRCh38')
    mt_path = f'{output}/filtered_mt.mt'
    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # Reproduce gnomAD genotype filtering
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = hl.variant_qc(mt)

    # Filter to common and biallelic variants
    mt = mt.filter_rows((hl.len(mt.alleles) == 2) & (mt.variant_qc.AF[1] > 0.05))
    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000)
    filtered_mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

    # Save filtered mt table
    filtered_mt.write(mt_path, overwrite=True)

def ld_prune_filter(mt: hl.MatrixTable, mt_ld: str, overwrite: bool = False):
    """
    Runs variant QC, filters out rare variants and those with missingness,
    and LD-prunes to independent variants.

    :param mt: MatrixTable to run variant QC on and filter variants from
    :param mt_ld: Path to write the intermediate filtered MT
    :param overwrite: If True, overwrites existing data
    :return:
    """
    mt.describe()
    mt = hl.variant_qc(mt)
    # mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.01) & (mt.variant_qc.AF[0] < 0.99))
    mt_filt = mt.filter_rows((mt.variant_qc.AF[0] > 0.05)
                             & (mt.variant_qc.AF[0] < 0.95)
                             & (mt.variant_qc.call_rate > 0.999))
    # pruned = hl.ld_prune(mt_filt.GT, r2=0.2, bp_window_size=500000)
    pruned = hl.ld_prune(mt_filt.GT, r2=0.1, bp_window_size=500000)
    mt_filt = mt_filt.filter_rows(hl.is_defined(pruned[mt_filt.row_key]))
    mt_filt.write(mt_ld, overwrite)

def ld_prune(mt, args):
    """
    LD prune a matrix table, for calculating kinship and principal components.

    :param mt: matrix table to annotate; should already have related individuals removed
    :param args: namespace object with threshold arguments
    :return: the LD-pruned matrix table
    """
    pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2,
                                       bp_window_size=args.bp_window_size)
    mt_ldpruned = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))
    logging.info(f"Variant and sample count after LD pruning: {mt_ldpruned.count()}")
    mt_ldpruned = mt_ldpruned.annotate_globals(
        ld_pruning_parameters={'r2': args.r2,
                               'bp_window_size': args.bp_window_size})
    return mt_ldpruned

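# Minimal usage sketch for the ld_prune wrapper above. The Namespace stands in
# for parsed command-line arguments; its fields match what the function reads.
# The input path is hypothetical.
import argparse

args = argparse.Namespace(r2=0.1, bp_window_size=500000)
mt = hl.read_matrix_table("gs://my-bucket/unrelated_samples.mt")  # hypothetical path
mt_pruned = ld_prune(mt, args)
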
def ld_prune_profile_25(mt_path):
    mt = hl.read_matrix_table(mt_path)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    hl.ld_prune(mt.GT)._force_count()

def get_qc_mt(
    mt: hl.MatrixTable,
    adj_only: bool = True,
    min_af: Optional[float] = 0.001,
    min_callrate: Optional[float] = 0.99,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    apply_hard_filters: bool = True,
    ld_r2: Optional[float] = 0.1,
    filter_lcr: bool = True,
    filter_decoy: bool = True,
    filter_segdup: bool = True,
    filter_exome_low_coverage_regions: bool = False,
    high_conf_regions: Optional[List[str]] = None,
) -> hl.MatrixTable:
    """
    Creates a QC-ready MT by keeping:

    - Variants outside known problematic regions
    - Bi-allelic SNVs only
    - Variants passing hard thresholds
    - Variants passing the set call rate and MAF thresholds
    - Genotypes passing gnomAD ADJ criteria (GQ >= 20, DP >= 10, AB > 0.2 for hets)

    In addition, the MT will be LD-pruned if `ld_r2` is set.

    :param mt: Input MT
    :param adj_only: If set, only ADJ genotypes are kept. This filter is applied before the call rate and AF calculation.
    :param min_af: Minimum allele frequency to keep. Not applied if set to ``None``.
    :param min_callrate: Minimum call rate to keep. Not applied if set to ``None``.
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to ``None``.
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to ``None``.
    :param apply_hard_filters: Whether to apply standard GATK default site hard filters: QD >= 2, FS <= 60 and MQ >= 30
    :param ld_r2: Minimum r2 to keep when LD-pruning (set to `None` for no LD pruning)
    :param filter_lcr: Filter LCR regions
    :param filter_decoy: Filter decoy regions
    :param filter_segdup: Filter segmental duplication regions
    :param filter_exome_low_coverage_regions: If set, only high-coverage exome regions (computed from gnomAD) are kept
    :param high_conf_regions: If given, the data will be filtered to only include variants in those regions
    :return: Filtered MT
    """
    logger.info("Creating QC MatrixTable")
    if ld_r2 is not None:
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
    # qc_mt = filter_low_conf_regions(
    #     mt,
    #     filter_lcr=filter_lcr,
    #     filter_decoy=filter_decoy,
    #     filter_segdup=filter_segdup,
    #     filter_exome_low_coverage_regions=filter_exome_low_coverage_regions,
    #     high_conf_regions=high_conf_regions,
    # )
    # if adj_only:
    #     qc_mt = filter_to_adj(qc_mt)  # TODO: Make sure that this works fine before call rate filtering
    qc_mt = filter_rows_for_qc(
        mt,
        min_af,
        min_callrate,
        min_inbreeding_coeff_threshold,
        min_hardy_weinberg_threshold,
        apply_hard_filters,
    )
    if ld_r2 is not None:
        qc_mt = qc_mt.persist()
        unfiltered_qc_mt = qc_mt.unfilter_entries()
        pruned_ht = hl.ld_prune(unfiltered_qc_mt.GT, r2=ld_r2)
        qc_mt = qc_mt.filter_rows(hl.is_defined(pruned_ht[qc_mt.row_key]))
    qc_mt = qc_mt.annotate_globals(qc_mt_params=hl.struct(
        adj_only=adj_only,
        min_af=min_af if min_af is not None else hl.null(hl.tfloat32),
        min_callrate=min_callrate
        if min_callrate is not None else hl.null(hl.tfloat32),
        inbreeding_coeff_threshold=min_inbreeding_coeff_threshold
        if min_inbreeding_coeff_threshold is not None else hl.null(hl.tfloat32),
        min_hardy_weinberg_threshold=min_hardy_weinberg_threshold
        if min_hardy_weinberg_threshold is not None else hl.null(hl.tfloat32),
        apply_hard_filters=apply_hard_filters,
        ld_r2=ld_r2 if ld_r2 is not None else hl.null(hl.tfloat32),
        filter_exome_low_coverage_regions=filter_exome_low_coverage_regions,
        high_conf_regions=high_conf_regions
        if high_conf_regions is not None else hl.null(hl.tarray(hl.tstr)),
    ))
    return qc_mt.annotate_cols(
        sample_callrate=hl.agg.fraction(hl.is_defined(qc_mt.GT)))

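# Minimal usage sketch for get_qc_mt, assuming a raw MT at a hypothetical path;
# only parameters documented above are used.
mt = hl.read_matrix_table("gs://my-bucket/raw_genotypes.mt")  # hypothetical path
qc_mt = get_qc_mt(mt, min_af=0.01, ld_r2=0.1)
qc_mt.write("gs://my-bucket/qc_ready.mt", overwrite=True)  # hypothetical output
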
hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

bed_to_exclude_pca = hl.import_bed(f"{temp_dir}/1000g/price_high_ld.bed.txt",
                                   reference_genome='GRCh38')
project_mt = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
mt_vqc_filtered = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered.mt")

# LD pruning
logger.info("ld pruning and writing to disk")
# pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.1)
pruned_mt = mt_vqc_filtered.filter_rows(
    hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))
pruned_mt = pruned_mt.filter_rows(
    hl.is_defined(bed_to_exclude_pca[pruned_mt.locus]), keep=False)
pruned_mt.write(
    f"{tmp_dir}/ddd-elgh-ukbb/1000g_chr1_20_snps_filtered_ldpruned.mt",
    overwrite=True)

# Run PCA
logger.info("run pca")
pca_evals, pca_scores, loadings_ht = hl.hwe_normalized_pca(
    pruned_mt.GT, k=10, compute_loadings=True)

# filter 5% AF
onekg = hl.variant_qc(onekg)
onekg = onekg.filter_rows(onekg.variant_qc.AF > 0.05, keep=True)

# unphase
onekg2 = onekg.annotate_entries(
    GT=hl.call(onekg.GT[0], onekg.GT[1], phased=False))
onekg2.GT.phased.show()
onekg = onekg2

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld prune
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
onekg = hl.ld_prune(onekg, n_cores=800, r2=0.2)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write vds
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
onekg.write(onekg_ldpruned_file, overwrite=True)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write plink
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('export plink')
hl.export_plink(onekg, onekg_plink_prefix, fam_id=onekg.s, id=onekg.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pca

# possibleGT
# Show unique possible calls occurring in the entire dataset
from pprint import pprint

unique_allelecalls = mt.aggregate_rows(
    hl.struct(ref=hl.agg.collect_as_set(mt.alleles[0]),
              alt=hl.agg.collect_as_set(mt.alleles[1])))
pprint(unique_allelecalls)

######## 2. QUALITY CONTROL VARIANTS

######## 2.1 Optional: Pruning in linkage disequilibrium
# Function works only on biallelic data
biallelic_mt = mt.filter_rows(hl.len(mt.alleles) == 2)
# Prune with window size of 44 kb for Caucasian and 22 kb for African ancestry
pruned_t = hl.ld_prune(biallelic_mt.GT, r2=0.8, bp_window_size=44000,
                       memory_per_core=128)
mt = biallelic_mt.filter_rows(hl.is_defined(pruned_t[biallelic_mt.row_key]))

######### 2.2 Hardy-Weinberg equilibrium
# HWE separately for each ethnic group, on controls only
mt_NHW = mt.filter_cols(mt.Race == "Caucasian")
mt_NHW = mt_NHW.annotate_rows(hwe_ctrl=hl.agg.filter(
    mt_NHW.Affection == 'Control', hl.agg.hardy_weinberg_test(mt_NHW.GT)))
mt_NHW = mt_NHW.filter_rows(mt_NHW.hwe_ctrl.p_value > 10**-5)

mt_AA = mt.filter_cols(mt.Race == "African American")
mt_AA = mt_AA.annotate_rows(hwe_ctrl=hl.agg.filter(
    mt_AA.Affection == 'Control', hl.agg.hardy_weinberg_test(mt_AA.GT)))
mt_AA = mt_AA.filter_rows(mt_AA.hwe_ctrl.p_value > 10**-5)

# Merge of NHW and AA by columns (samples)
mt = mt_AA.union_cols(mt_NHW)

def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))
        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint MT of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note: writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True), args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10,
                                               compute_loadings=False)
        scores.write(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
                     args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no
        # pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[pruned_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=0.05,
                                      statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the
        # callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...')
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info('Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True)
                   .annotate(data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(
        qc_pop=hl.case(missing_false=True)
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1')
        .when(hl.is_defined(joint_ht.pop) & (joint_ht.batch == 2), 'est_b2')
        .default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes', adj=False, split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes', adj=False, split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...')
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples
        # are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(qc_temp_data_prefix('genomes') + '.sample_qc.ht')
    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...')
    exome_qc_metrics = ['n_snp', 'r_ti_tv', 'r_insertion_deletion',
                        'n_insertion', 'n_deletion', 'r_het_hom_var']
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = ['n_snp', 'r_ti_tv', 'r_insertion_deletion',
                         'n_insertion', 'n_deletion', 'r_het_hom_var']
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering')
    exome_ht.key_by(data_type='exomes', s=exome_ht.s).write(
        qc_ht_path('exomes', 'pop_platform'), args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering')
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)

mt_vqc_filtered = mt_vqc.filter_rows(
    (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
    & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
    & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
mt_vqc_filtered = mt_vqc_filtered.filter_rows(
    hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)

# overlap AKT dataset:
mt_1kg_chr1_chr20 = hl.read_matrix_table(
    f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
# mt_vqc_filtered1 = mt_vqc_filtered.key_rows_by("locus")
mt_1kg_chr1_chr20 = mt_1kg_chr1_chr20.key_rows_by("locus")
mt_vqc_filtered = mt_vqc_filtered.filter_rows(
    hl.is_defined(mt_1kg_chr1_chr20.rows()[mt_vqc_filtered.locus]))
logger.info("done filtering writing mt")

# ld pruning
pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
# pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
pruned_mt = mt_vqc_filtered.filter_rows(
    hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))

# remove pruned areas that need to be removed
# autosomes only:
pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())
pruned_mt.write(f"{tmp_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
                overwrite=True)

# pruned_mt = hl.read_matrix_table(
#     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
# related_samples_to_drop = hl.read_table(
#     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    ```

    The last step is to filter the matrix table again by the pruned variants list. For this, <font color='red'>is_defined</font> is useful:

    ```python
    mt = mt.filter_rows(hl.is_defined(pruned_variants[mt.row_key]))
    ```

    Be sure to take a look at how pruning changes the number of variants in your dataset using the <font color='red'>count</font> function.
    """

with herzog.Cell("python"):
    # We added code to help you monitor the time it takes for pruning.
    # We currently estimate over an hour.
    start_prune_write_time = time.time()
    pruned_variant_table = hl.ld_prune(mt.GT, r2=0.2, bp_window_size=500000,
                                       block_size=1024)
    elapsed_prune_write_time = time.time() - start_prune_write_time
    print(timedelta(seconds=elapsed_prune_write_time))

with herzog.Cell("python"):
    mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

with herzog.Cell("markdown"):
    """
    ## Principal Component Analysis

    In this next section, we'll cover a method for easily visualizing and adjusting for population structure in an association analysis: Principal Component Analysis (PCA).

    You run PCA using the function <font color='red'>hwe_normalized_pca</font>. For this analysis, we are mainly interested in the scores, and can disregard the eigenvalues and loadings. The `k` parameter determines the number of PCs to return -- as `k` grows, so does the computation time.
    """

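# A minimal PCA sketch of the step the markdown cell above describes, assuming
# `mt` is the LD-pruned MatrixTable from the earlier cells.
with herzog.Cell("python"):
    _, scores, _ = hl.hwe_normalized_pca(mt.GT, k=5)  # k = number of PCs returned
    mt = mt.annotate_cols(pca=scores[mt.s])
    mt.pca.scores.show(5)
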
def determine_pca_variants(
    autosomes_only: bool = True,
    snv_only: bool = True,
    bi_allelic_only: bool = False,
    adj_only: bool = True,
    min_gnomad_v3_ac: Optional[int] = None,
    high_qual_ccdg_exome_interval_only: bool = False,
    high_qual_ukbb_exome_interval_only: bool = False,
    pct_samples_ukbb_exome_interval: float = 0.8,
    min_joint_af: float = 0.0001,  # TODO: Konrad mentioned that he might want to lower this
    min_joint_callrate: float = 0.95,
    min_inbreeding_coeff_threshold: Optional[float] = -0.8,
    min_hardy_weinberg_threshold: Optional[float] = 1e-8,
    min_ccdg_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    min_ukbb_exome_callrate: float = 0.99,  # TODO: What parameter should this start with?
    filter_lcr: bool = True,
    filter_segdup: bool = True,
    ld_pruning: bool = True,
    ld_pruning_dataset: str = "ccdg_genomes",
    ld_r2: float = 0.1,
    read_per_dataset_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_ht_checkpoint_if_exists: bool = False,
    read_pre_ld_prune_mt_checkpoint_if_exists: bool = False,
    overwrite: bool = True,
    filter_washu: bool = False,
) -> None:
    """
    Determine a diverse set of variants for relatedness/ancestry PCA using CCDG, gnomAD v3, and UK Biobank.

    :param autosomes_only: Whether to filter to variants in autosomes
    :param snv_only: Whether to filter to SNVs
    :param bi_allelic_only: Whether to filter to variants that are bi-allelic in either CCDG or gnomAD v3
    :param adj_only: If set, only ADJ genotypes (QD >= 2, FS <= 60 and MQ >= 30) are kept. This filter is applied before the call rate and AF calculation
    :param min_gnomad_v3_ac: Optional lower bound of AC for variants in gnomAD v3 genomes
    :param high_qual_ccdg_exome_interval_only: Whether to filter to high quality intervals in CCDG exomes
    :param high_qual_ukbb_exome_interval_only: Whether to filter to high quality intervals in UKBB 455K exomes
    :param pct_samples_ukbb_exome_interval: Percent of samples with over 80% of bases having coverage of over 20x per interval
    :param min_joint_af: Lower bound for combined MAF computed from CCDG and gnomAD v3 genomes
    :param min_joint_callrate: Lower bound for combined callrate computed from CCDG and gnomAD v3 genomes
    :param min_inbreeding_coeff_threshold: Minimum site inbreeding coefficient to keep. Not applied if set to `None`
    :param min_hardy_weinberg_threshold: Minimum site HW test p-value to keep. Not applied if set to `None`
    :param min_ccdg_exome_callrate: Lower bound for CCDG exomes callrate
    :param min_ukbb_exome_callrate: Lower bound for UKBB exomes callrate
    :param filter_lcr: Whether to filter LCR regions
    :param filter_segdup: Whether to filter segdup regions
    :param ld_pruning: Whether to conduct LD pruning
    :param ld_pruning_dataset: Which dataset is used for LD pruning, 'ccdg_genomes' or 'gnomad_genomes'
    :param ld_r2: LD pruning cutoff
    :param read_per_dataset_checkpoint_if_exists: Whether to read the CCDG exome/genome pre-filtered HT if it exists.
        Each dataset is possibly filtered to: autosomes only, SNVs only, gnomAD v3.1.2 AC filter, CCDG high quality
        exome intervals, and UK Biobank high quality exome intervals
    :param read_pre_ld_prune_ht_checkpoint_if_exists: Whether to read in the PCA variant HT with no LD-pruning if it exists
    :param read_pre_ld_prune_mt_checkpoint_if_exists: Whether to read in the checkpointed MT filtered to variants in the PCA variant HT with no LD-pruning if it exists
    :param overwrite: Whether to overwrite the final variant HT
    :param filter_washu: Whether to filter out WashU samples
    :return: Table with desired variants for PCA
    """
    if not read_pre_ld_prune_ht_checkpoint_if_exists:
        logger.info(
            "Loading gnomAD v3.1.2 release HT and UK Biobank 455K release HT ..."
        )
        flag = "_without_washu" if filter_washu else ""
        gnomad_ht = gnomad_public_release("genomes").ht()
        gnomad_ht = gnomad_ht.select(
            gnomad_was_split=gnomad_ht.was_split,
            gnomad_AC=gnomad_ht.freq[0].AC,
            gnomad_AN=gnomad_ht.freq[0].AN,
            gnomad_genomes_site_inbreeding_coeff=gnomad_ht.info.InbreedingCoeff,
            gnomad_genomes_homozygote_count=gnomad_ht.freq[0].homozygote_count,
        )
        if min_hardy_weinberg_threshold is not None:
            gnomad_ht = gnomad_ht.annotate(
                gnomad_genomes_hwe=hl.hardy_weinberg_test(
                    hl.int32(
                        (gnomad_ht.gnomad_AN / 2)
                        - gnomad_ht.gnomad_genomes_homozygote_count
                        - (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num hom ref genotypes
                    hl.int32(
                        (
                            gnomad_ht.gnomad_AC
                            - (gnomad_ht.gnomad_genomes_homozygote_count * 2)
                        )
                    ),  # Num het genotypes
                    gnomad_ht.gnomad_genomes_homozygote_count,  # Num hom alt genotypes
                ),
            )

        ukbb_ht = hl.read_table(ukbb_release_ht_path("broad", 7))
        ukbb_ht = ukbb_ht.select(
            ukbb_AC=ukbb_ht.freq[0].AC,
            ukbb_AN=ukbb_ht.freq[0].AN,
        )
        ukbb_meta_ht = hl.read_table(ukbb_meta_ht_path("broad", 7))

        # Only count samples used in the UK Biobank exome frequency calculations
        ukbb_exome_count = ukbb_meta_ht.filter(
            ukbb_meta_ht.sample_filters.high_quality
            & hl.is_defined(ukbb_meta_ht.ukbb_meta.batch)
            & ~ukbb_meta_ht.sample_filters.related
        ).count()

        logger.info("Getting CCDG genome and exome sample counts...")
        ccdg_genome_count = get_ccdg_vds(
            "genomes", filter_washu=filter_washu
        ).variant_data.count_cols()
        logger.info(f"Number of CCDG genome samples: {ccdg_genome_count}...")
        ccdg_exome_count = get_ccdg_vds("exomes").variant_data.count_cols()
        logger.info(f"Number of CCDG exome samples: {ccdg_exome_count} ...")

        def _initial_filter(data_type):
            """
            Get Table of CCDG variants passing desired filters.

            Possible filters are:
                - Autosomes only
                - SNVs only
                - gnomAD v3.1.2 AC filter
                - CCDG high quality exome intervals
                - UK Biobank high quality exome intervals

            After densification of the VDS, rows are annotated with:
                - ccdg_{data_type}_was_split
                - ccdg_{data_type}_AC
                - ccdg_{data_type}_AN

            The filtered and annotated rows are returned as a Table and are also checkpointed

            :param data_type: Whether data is from genomes or exomes
            :return: Table of CCDG filtered variants
            """
            logger.info(
                "Loading CCDG %s VDS and splitting multi-allelics for initial filtering steps...",
                data_type,
            )
            vds = get_ccdg_vds(data_type, filter_washu=filter_washu)
            logger.info(
                f"{vds.variant_data.count_cols()} CCDG {data_type} samples loaded..."
            )
            vds = hl.vds.split_multi(vds)

            if autosomes_only:
                logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
                vds = hl.vds.filter_chromosomes(vds, keep_autosomes=True)

            ht = vds.variant_data.rows()
            variant_filter_expr = True
            if snv_only:
                logger.info("Filtering CCDG %s VDS to SNVs...", data_type)
                variant_filter_expr &= hl.is_snp(ht.alleles[0], ht.alleles[1])

            if min_gnomad_v3_ac:
                logger.info(
                    "Filtering CCDG %s VDS to gnomAD v3.1.2 variants with adj-filtered AC > %d...",
                    data_type,
                    min_gnomad_v3_ac,
                )
                variant_filter_expr &= gnomad_ht[ht.key].gnomad_AC > min_gnomad_v3_ac

            vds = hl.vds.filter_variants(vds, ht.filter(variant_filter_expr), keep=True)

            if high_qual_ccdg_exome_interval_only:
                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with %dX coverage) CCDG exome intervals...",
                    data_type,
                    INTERVAL_DP,
                )
                interval_qc_ht = hl.read_table(
                    get_ccdg_results_path(
                        data_type="exomes", result=f"intervals_{INTERVAL_DP}x"
                    )
                )
                interval_qc_ht = interval_qc_ht.filter(interval_qc_ht.to_keep)
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            if high_qual_ukbb_exome_interval_only:
                if not autosomes_only:
                    raise ValueError(
                        "UK Biobank interval QC filtering is only available for autosomes!"
                    )
                logger.info(
                    "Filtering CCDG %s VDS to high quality (>80%% of samples with 20X coverage) UK Biobank exome intervals...",
                    data_type,
                )
                interval_qc_ht = hl.read_table(
                    ukbb_interval_qc_path("broad", 7, "autosomes")
                )  # Note: freeze 7 is all included in gnomAD v4
                interval_qc_ht = interval_qc_ht.filter(
                    interval_qc_ht["pct_samples_20x"] > pct_samples_ukbb_exome_interval
                )
                vds = hl.vds.filter_intervals(
                    vds, intervals=interval_qc_ht.interval.collect(), keep=True
                )

            logger.info("Densifying filtered CCDG %s VDS...", data_type)
            mt = hl.vds.to_dense_mt(vds)
            if adj_only:
                mt = filter_to_adj(mt)

            annotation_expr = {
                f"ccdg_{data_type}_was_split": mt.was_split,
                f"ccdg_{data_type}_AC": hl.agg.sum(mt.GT.n_alt_alleles()),
                f"ccdg_{data_type}_AN": hl.agg.count_where(hl.is_defined(mt.GT)) * 2,
            }

            if min_inbreeding_coeff_threshold is not None:
                annotation_expr[
                    f"ccdg_{data_type}_site_inbreeding_coeff"
                ] = bi_allelic_site_inbreeding_expr(mt.GT)
            if min_hardy_weinberg_threshold is not None:
                annotation_expr[f"ccdg_{data_type}_hwe"] = hl.agg.hardy_weinberg_test(
                    mt.GT
                )

            mt = mt.annotate_rows(**annotation_expr)
            ht = mt.rows().checkpoint(
                get_ccdg_results_path(
                    data_type=data_type,
                    mt=False,
                    result=f"pre_filtered_variants_interval{INTERVAL_DP}x{flag}",
                ),
                overwrite=(not read_per_dataset_checkpoint_if_exists),
                _read_if_exists=read_per_dataset_checkpoint_if_exists,
            )
            return ht

        logger.info(
            "Creating Table with joint gnomAD v3.1.2 and CCDG genome allele frequencies and callrate...",
        )
        ccdg_genomes_ht = _initial_filter("genomes")
        ccdg_exomes_ht = _initial_filter("exomes")
        ht = ccdg_exomes_ht.join(ccdg_genomes_ht, how="inner")
        ht = ht.annotate(**gnomad_ht[ht.key], **ukbb_ht[ht.key])
        ht = ht.annotate(
            joint_biallelic=(~ht.ccdg_genomes_was_split) | (~ht.gnomad_was_split),
            joint_AC=ht.ccdg_genomes_AC + ht.gnomad_AC,
            joint_AN=ht.ccdg_genomes_AN + ht.gnomad_AN,
        )
        total_genome_an = hl.eval(
            (gnomad_ht.freq_sample_count[0] + ccdg_genome_count) * 2
        )
        ht = ht.annotate(
            joint_AF=ht.joint_AC / ht.joint_AN,
            joint_callrate=ht.joint_AN / total_genome_an,
        )
        ht = ht.checkpoint(
            f"{get_joint_pca_variants_ht_path(filter_washu=filter_washu)}",
            overwrite=(not read_pre_ld_prune_ht_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_ht_checkpoint_if_exists,
        )

        logger.info(
            "Filtering variants to combined gnomAD v3.1.2 and CCDG genome AF of %.3f and callrate of %.2f, "
            "CCDG exome callrate of %.2f, and UK Biobank exome callrate of %.2f....",
            min_joint_af,
            min_joint_callrate,
            min_ccdg_exome_callrate,
            min_ukbb_exome_callrate,
        )

        variant_filter_expr = True
        if bi_allelic_only:
            variant_filter_expr &= ht.joint_biallelic
        if min_inbreeding_coeff_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            ) & (
                ht.gnomad_genomes_site_inbreeding_coeff > min_inbreeding_coeff_threshold
            )
        if min_hardy_weinberg_threshold is not None:
            variant_filter_expr &= (
                ht.ccdg_genomes_hwe.p_value > min_hardy_weinberg_threshold
            ) & (ht.gnomad_genomes_hwe.p_value > min_hardy_weinberg_threshold)

        variant_filter_expr &= (
            (ht.joint_AF > min_joint_af)
            & (ht.joint_callrate > min_joint_callrate)
            & (ht.ccdg_exomes_AN / (ccdg_exome_count * 2) > min_ccdg_exome_callrate)
            & (ht.ukbb_AN / (ukbb_exome_count * 2) > min_ukbb_exome_callrate)
        )

        ht = ht.filter(variant_filter_expr)

        ht = ht.annotate_globals(
            autosomes_only=autosomes_only,
            snv_only=snv_only,
            adj_only=adj_only,
            bi_allelic_only=bi_allelic_only,
            min_gnomad_v3_ac=min_gnomad_v3_ac,
            high_qual_ccdg_exome_interval_only=high_qual_ccdg_exome_interval_only,
            high_qual_ukbb_exome_interval_only=high_qual_ukbb_exome_interval_only,
            filter_lcr=filter_lcr,
            filter_segdup=filter_segdup,
            min_af=min_joint_af,
            min_callrate=min_joint_callrate,
            min_ccdg_exome_callrate=min_ccdg_exome_callrate,
            min_ukbb_exome_callrate=min_ukbb_exome_callrate,
            min_inbreeding_coeff_threshold=min_inbreeding_coeff_threshold,
            min_hardy_weinberg_threshold=min_hardy_weinberg_threshold,
        )

        ht = filter_low_conf_regions(
            ht,
            filter_lcr=filter_lcr,
            filter_decoy=False,  # No decoy for GRCh38
            filter_segdup=filter_segdup,
        )

        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=False, filter_washu=filter_washu),
            overwrite=True,
        )
    else:
        ht = hl.read_table(
            get_pca_variants_path(
                ld_pruned=False, data=ld_pruning_dataset, filter_washu=filter_washu
            )
        )

    if ld_pruning:
        # Whether this is still required?
        logger.warning(
            "The LD-prune step of this function requires non-preemptible workers only!"
        )
        logger.info("Creating Table after LD pruning of %s...", ld_pruning_dataset)
        if ld_pruning_dataset == "ccdg_genomes":
            vds = get_ccdg_vds("genomes")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht, keep=True)
            mt = hl.vds.to_dense_mt(vds)
        elif ld_pruning_dataset == "gnomad_genomes":
            mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
            logger.info("Converting gnomAD v3.1 MatrixTable to VDS...")
            mt = mt.select_entries(
                "END", "LA", "LGT", adj=get_adj_expr(mt.LGT, mt.GQ, mt.DP, mt.LAD)
            )
            vds = hl.vds.VariantDataset.from_merged_representation(mt)

            logger.info("Performing split-multi and filtering variants...")
            vds = hl.vds.split_multi(vds, filter_changed_loci=True)
            vds = hl.vds.filter_variants(vds, ht)

            logger.info("Densifying data...")
            mt = hl.vds.to_dense_mt(vds)
        else:
            raise ValueError(
                "Only options for LD pruning are `ccdg_genomes` and `gnomad_genomes`"
            )

        hl._set_flags(no_whole_stage_codegen="1")
        mt = mt.checkpoint(
            get_pca_variants_path(ld_pruned=False, data=ld_pruning_dataset, mt=True),
            overwrite=(not read_pre_ld_prune_mt_checkpoint_if_exists),
            _read_if_exists=read_pre_ld_prune_mt_checkpoint_if_exists,
        )
        hl._set_flags(no_whole_stage_codegen=None)
        ht = hl.ld_prune(mt.GT, r2=ld_r2)
        ht = ht.annotate_globals(ld_r2=ld_r2, ld_pruning_dataset=ld_pruning_dataset)
        ht = ht.checkpoint(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset),
            overwrite=overwrite,
            _read_if_exists=(not overwrite),
        )
        mt = mt.filter_rows(hl.is_defined(ht[mt.row_key]))
        mt.naive_coalesce(1000).write(
            get_pca_variants_path(ld_pruned=True, data=ld_pruning_dataset, mt=True),
            overwrite=overwrite,
        )

vds = vds.filter_entries(
    ((vds.locus.contig != "chrX") & (vds.locus.contig != "chrY"))
    & (((vds.AD[0] + vds.AD[1]) / vds.DP < 0.9)
       | (vds.GT.is_hom_ref() & ((vds.AD[0] / vds.DP < 0.9) | (vds.GQ < 20)))
       | (vds.GT.is_het() & ((vds.AD[1] / vds.DP < 0.20) | (vds.PL[0] < 20)))
       | (vds.GT.is_hom_var() & ((vds.AD[1] / vds.DP < 0.9) | (vds.PL[0] < 20)))
       | (vds.DP > 200)),
    keep=False)

vds = hl.variant_qc(vds)
vds = vds.filter_rows(
    (vds.locus.contig == "chrX")
    | (vds.locus.contig == "chrY")
    | ((vds.info.QD > 4)
       & (vds.variant_qc.callRate > 0.99)
       & (vds.variant_qc.dpMean > 20)
       & (vds.variant_qc.AF > 0.05)
       & (vds.filters.size() == 0)
       & (vds.variant_qc.AF < 0.95)),
    keep=True)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# impute sex
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
vds = vds.filter_rows(hl.is_defined(par[vds.locus]), keep=False)
ct = hl.impute_sex(vds.GT, female_threshold=0.6, male_threshold=0.7)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# LD prune
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
vds_ldpruned = hl.ld_prune(vds, 40)

def main(args):
    hl.init(log='/subpops.log')

    if args.population == 'all':
        pcs = list(range(1, 7))
    elif args.population == 'eur':
        pcs = [1, 2, 3]
    elif args.population == 'eas':
        pcs = [1, 2]
    else:
        pcs = [1, 2]

    if not args.skip_filtering:
        pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
        exome_project_table = hl.read_table(
            qc_ht_path('exomes', 'hard_filters')).select('project_id')
        exome_platform_table = hl.read_table(
            qc_ht_path('exomes', 'platforms')).select('qc_platform')
        exome_table = exome_project_table.annotate(qc_platform=hl.str(
            exome_platform_table[exome_project_table.key].qc_platform))
        genome_table = hl.read_table(
            qc_ht_path('genomes', 'hard_filters')).select('project_id', 'qc_platform')
        joint_table = exome_table.union(genome_table)
        exome_pop_table = hl.read_table(
            qc_ht_path('exomes', 'pop_platform')).select('pop')
        genome_pop_table = hl.read_table(
            qc_ht_path('genomes', 'pop_platform')).select('pop')
        pop_table = exome_pop_table.union(genome_pop_table)
        pop_table = pop_table.annotate(
            project_id=joint_table[pop_table.key].project_id,
            qc_platform=joint_table[pop_table.key].qc_platform)
        pruned_mt = pruned_mt.annotate_cols(meta=pop_table[pruned_mt.col_key])
        variants, samples = pruned_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in original joint MT')

        if args.population == 'all':
            sample_criteria = True
        elif args.population == 'eur':
            sample_criteria = (pruned_mt.meta.pop == "nfe") | (pruned_mt.meta.pop == "fin")
        elif args.population == 'eas':
            sample_criteria = (pruned_mt.meta.pop == "eas") & (pruned_mt.data_type == "exomes")
        else:
            sample_criteria = pruned_mt.meta.pop == args.population

        pruned_mt = pruned_mt.filter_cols(sample_criteria)
        variants, samples = pruned_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in {args.population} in joint MT')

        pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

        # Filter variants by callrate on each platform
        pca_platforms_mt = pca_mt.group_cols_by(pca_mt.meta.qc_platform).aggregate(
            missing=hl.agg.count_where(hl.is_missing(pca_mt.GT)),
            total=hl.agg.count())
        # All variants must have a callrate of at least .999 in each platform,
        # or no more than 1 missing sample if platform <= 1000 samples
        pca_platforms_mt = pca_platforms_mt.annotate_entries(
            remove_variant=(hl.case().when(
                pca_platforms_mt.total > 1000,
                pca_platforms_mt.missing / pca_platforms_mt.total > 0.001).default(
                    pca_platforms_mt.missing > 1)))
        pca_platforms_mt = pca_platforms_mt.filter_rows(
            hl.agg.any(pca_platforms_mt.remove_variant), keep=False)
        pca_mt = pca_mt.filter_rows(
            (hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & hl.is_defined(pca_platforms_mt.rows()[pca_mt.row_key]))
        variants, samples = pca_mt.count()
        logger.info(
            f'{samples} samples, {variants} variants found in {args.population} in PCA MT '
            f'after filtering variants by AF and platform callrate')

        pca_pruned = hl.ld_prune(pca_mt.GT, r2=0.1)
        pca_mt = pca_mt.filter_rows(hl.is_defined(pca_pruned[pca_mt.row_key]))
        related_mt = related_mt.filter_rows(
            hl.is_defined(pca_mt.rows()[related_mt.row_key]))
        pca_mt.write(
            f"{qc_temp_data_prefix('joint')}.{args.population}.unrelated.filtered.mt",
            args.overwrite)
        related_mt.write(
            f"{qc_temp_data_prefix('joint')}.{args.population}.related.filtered.mt",
            args.overwrite)

    pca_mt = hl.read_matrix_table(
        f"{qc_temp_data_prefix('joint')}.{args.population}.unrelated.filtered.mt")
    related_mt = hl.read_matrix_table(
        f"{qc_temp_data_prefix('joint')}.{args.population}.related.filtered.mt")
    variants, samples = pca_mt.count()
    logger.info(
        f'{samples} samples after removing relateds, {variants} variants after filtering and LD pruning')

    if not args.skip_pop_pca:
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=10, compute_loadings=True)
        pca_mt = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2)
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_mt.rows()[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(args.population), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(args.population), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path(args.population))
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path(args.population))
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(f'Projecting population PCs for {samples} related samples...')
    related_ht = pc_project(related_mt, pca_loadings)
    related_mt = related_mt.annotate_cols(scores=related_ht[related_mt.col_key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    # Join MTs, then annotate with known pops, then select out columns we care
    # about, then assign pop PCs
    joint_ht = pca_mt.cols().union(related_mt.cols())
    joint_ht = get_known_populations(joint_ht, args.population)
    joint_ht = joint_ht.select(
        *pop_colnames, **{f"PC{i + 1}": joint_ht.scores[i] for i in range(10)})
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_ht,
        f'{qc_temp_data_prefix("joint")}.RF_pop_assignments.{args.population}.txt.bgz',
        f'{qc_temp_data_prefix("joint")}.RF_fit.{args.population}.pkl',
        pcs=pcs)
    joint_pca_ht = joint_pca_ht.annotate(
        pop=hl.cond(joint_pca_ht.pop == 'oth',
                    hl.literal(f'o{args.population[:2]}'),
                    joint_pca_ht.pop))
    joint_ht = joint_ht.select(
        **{f"subpop_{args.population}_PC{i}": joint_ht[f"PC{i}"]
           for i in range(1, 11)},
        subpop=joint_pca_ht[joint_ht.key].pop,
        known_subpop=joint_pca_ht[joint_ht.key].known_pop)
    joint_ht.write(subpop_ht_path(args.population), args.overwrite)

vdsx = vdsnopar.filter_rows((vdsnopar.locus.contig == "chrX")
                            & (vdsnopar.variant_qc.AF >= 0.05)
                            & (vdsnopar.variant_qc.AF <= 0.95))
ct = hl.impute_sex(vdsx.GT, female_threshold=0.6, male_threshold=0.7)
vdsct = vdsnopar.cols()
ct = ct.annotate(ydp=vdsct[ct.s].ydp)
(ct.select(ID=ct.s,
           sexFstat=ct.f_stat,
           isFemale=ct.is_female,
           ydp=ct.ydp).export(sample_sex_fstat_file))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld pruning
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("LD pruning...")
vds5_ldp = hl.ld_prune(vds5, n_cores=1600, r2=0.1)
# vds5_ldp = hl.ld_prune(vds5, n_cores=60, r2=0.2, window=1000000, memory_per_core=512)
print("writing LD pruned VDS...")
vds5_ldp.write(vds_ldpruned_common_file, overwrite=True)
hl.export_plink(vds5_ldp, vds_ldpruned_common_plink,
                fam_id=vds5_ldp.s, id=vds5_ldp.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IBD analysis
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# use king until pcrelate works

def main(args):
    bed_to_exclude_pca = hl.import_bed(locations_exclude_from_pca,
                                       reference_genome='GRCh38')
    cohorts_pop = hl.import_table(cohorts_populations, delimiter="\t").key_by('s')
    # Overlap AKT dataset
    overlap_1kg_AKT = hl.import_matrix_table(AKT_overlap)

    # Drop cohorts; annotate with cohorts and populations from s3 table;
    # save MatrixTable
    mt = hl.read_matrix_table(args.matrixtable)
    mt = mt.annotate_cols(cohort=cohorts_pop[mt.s].cohort)
    mt = mt.annotate_cols(original_pop=cohorts_pop[mt.s].known_population)
    mt = mt.annotate_cols(known_pop=cohorts_pop[mt.s].known_population_updated)
    # mt = mt.annotate_cols(superpopulation=cohorts_pop[mt.s].superpopulation)
    mt = mt.annotate_cols(gVCF=cohorts_pop[mt.s].gVCF_ID)
    mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/Sanger_chr1-20-XY_new_cohorts_split_multi_pops.mt",
        overwrite=True)
    logger.info("wrote mt ")

    # Filter mt
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_mnp(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_indel(mt.alleles[0], mt.alleles[1]))
    mt = mt.filter_rows(~hl.is_complex(mt.alleles[0], mt.alleles[1]))
    mt_vqc = hl.variant_qc(mt, name='variant_QC_Hail')
    # (mt_vqc.variant_QC_Hail.p_value_hwe >= 10 ** -6) & not to use this according to hcm.
    mt_vqc_filtered = mt_vqc.filter_rows(
        (mt_vqc.variant_QC_Hail.call_rate >= 0.99)
        & (mt_vqc.variant_QC_Hail.AF[1] >= 0.05)
        & (mt_vqc.variant_QC_Hail.AF[1] <= 0.95))
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(bed_to_exclude_pca[mt_vqc_filtered.locus]), keep=False)

    # Overlap AKT dataset:
    # mt_1kg_chr1_chr20 = hl.read_matrix_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ancestry_work/1000g_chr1_20_AKT_overlap.mt")
    overlap_1kg_AKT = overlap_1kg_AKT.key_rows_by("locus")
    mt_vqc_filtered = mt_vqc_filtered.filter_rows(
        hl.is_defined(overlap_1kg_AKT.rows()[mt_vqc_filtered.locus]))
    logger.info("done filtering writing mt")

    # LD pruning
    pruned_ht = hl.ld_prune(mt_vqc_filtered.GT, r2=0.2, bp_window_size=500000)
    # pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt_vqc_filtered.filter_rows(
        hl.is_defined(pruned_ht[mt_vqc_filtered.row_key]))

    # Remove pruned areas that need to be removed
    # Autosomes only:
    pruned_mt = pruned_mt.filter_rows(pruned_mt.locus.in_autosome())
    pruned_mt.write(
        f"{args.output_dir}/ddd-elgh-ukbb/chr1_chr20_ldpruned_updated.mt",
        overwrite=True)

    # pruned_mt = hl.read_matrix_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_ldpruned.mt")
    # related_samples_to_drop = hl.read_table(
    #     f"{temp_dir}/ddd-elgh-ukbb/relatedness_ancestry/ddd-elgh-ukbb/chr1_chr20_XY_related_samples_to_remove.ht")

    logger.info("run_pca_with_relateds")
    # pca_evals, pca_scores, pca_loadings = run_pca_with_relateds(
    #     pruned_mt, related_samples_to_drop, autosomes_only=True)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pruned_mt.GT, k=10, compute_loadings=True)
    pca_scores = pca_scores.annotate(
        known_pop=pruned_mt.cols()[pca_scores.s].known_pop)
    # mt = mt.annotate_cols(loadings=pca_loadings[mt_vqc_filtered.col_key].loadings)
    # mt = mt.annotate_cols(known_pop="unk")
    # pca_scores = pca_scores.annotate(known_pop="unk")
    pca_scores.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_scores_after_pruning.ht",
        overwrite=True)
    pca_loadings.write(
        f"{args.output_dir}/ddd-elgh-ukbb/pca_loadings_after_pruning.ht",
        overwrite=True)
    with open(f"{args.output_dir}/ddd-elgh-ukbb/pca_evals_after_pruning.txt", 'w') as f:
        for val in pca_evals:
            f.write(str(val))

    logger.info("assign population pcs")
    pop_ht, pop_clf = assign_population_pcs(pca_scores,
                                            pca_scores.scores,
                                            known_col="known_pop",
                                            n_estimators=100,
                                            prop_train=0.8,
                                            min_prob=0.5)
    pop_ht.write(f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.ht",
                 overwrite=True)
    pop_ht.export(
        f"{args.output_dir}/ddd-elgh-ukbb/pop_assignments_updated.tsv.gz")

def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")
        # Import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # Filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)...")
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = sample_qc_ht.filter(sample_qc_ht.pass_filters)
        mt = mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key]))

        logger.info("Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        mt = (mt.filter_rows(
            bi_allelic_expr(mt)
            & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
            overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants...")
        # Remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key]))

        logger.info("Writing filtered MT with ld-pruned variants...")
        mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                part='high_callrate_common_snp_biallelic',
                                split=True,
                                ld_pruned=True),
                 overwrite=args.overwrite)

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # Run PCA on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)
    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = pc_scores.annotate_globals(**{'eigenvalues': eigenvalues})

    # Annotate PC array as independent fields
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # Write as HT
    output_ht_path = args.output_ht
    pca_table = pca_table.checkpoint(output=output_ht_path,
                                     overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    # Stop Hail
    hl.stop()
    print("PCA pipeline finalised...")

def main(args):
    mt = hl.read_matrix_table(args.matrixtable)

    # LD pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))
    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT, k=10,
                                           compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht", overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i, pairs.j,
                                                           keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=False)
    related_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]), keep=True)

    variants, samples = pca_mt.count()
    print(f"{samples} samples after relatedness step.")

    # Population PCA
    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print('Projecting population PCs for {} related samples...'.format(samples))
    # related_scores = pc_project(related_mt, pca_loadings)
    # relateds = related_mt.cols()
    # relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA', xlabel='PC1', ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)

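# A short follow-up sketch, assuming the relatedness table written by main()
# above; the path is hypothetical. 0.125 is the expected kinship coefficient
# for second-degree relatives, matching the filter threshold used in main().
relatedness_ht = hl.read_table("output/mt_relatedness.ht")  # hypothetical path
related_pairs = relatedness_ht.filter(relatedness_ht.kin > 0.125)
print(f"{related_pairs.count()} sample pairs with kinship > 0.125")
related_pairs.show(5)
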
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1000 Genomes, keeping only loci defined in the interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Join datasets (inner join), keeping only the 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)).naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
                                       overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing joint MT filtered to LD-pruned variants...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
                        overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run PCA on the merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)
    logger.info(f"Eigenvalues: {eigenvalues}")
    # TODO: save eigenvalues?

    # Annotate the PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table.write(output=output_ht_path, overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Done!")
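# A possible answer to the "TODO: save eigenvalues?" above, mirroring the
# first pipeline in this collection: attach the eigenvalues to the scores
# table as a global field before writing, so they travel with the HT.
def attach_eigenvalues(pca_table: hl.Table, eigenvalues: list) -> hl.Table:
    return pca_table.annotate_globals(eigenvalues=eigenvalues)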
def test_ld_prune(self):
    ds = hl.split_multi_hts(hl.import_vcf(resource('sample.vcf')))
    # ld_prune takes a call expression (not the dataset) and an r2 threshold
    # in [0, 1]; it returns a Table of the variants to keep
    pruned_variant_table = hl.ld_prune(ds.GT, r2=0.2)
    pruned_variant_table.count()
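# The table returned by ld_prune is keyed like the dataset's rows, so the
# usual follow-up (seen throughout the pipelines above) is a semi-join on
# the row key; a small self-contained helper:
def apply_ld_prune(ds: hl.MatrixTable, r2: float = 0.2) -> hl.MatrixTable:
    pruned_variant_table = hl.ld_prune(ds.GT, r2=r2)
    return ds.filter_rows(hl.is_defined(pruned_variant_table[ds.row_key]))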
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_compute_pc_relate:
        if not args.skip_filter_data:
            # Read MatrixTable
            mt = hl.read_matrix_table(args.mt_input_path)

            # filter variants (bi-allelic, high-callrate, common SNPs)
            logger.info(
                f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..."
            )
            mt = (mt.filter_rows(
                (hl.len(mt.alleles) == 2)
                & hl.is_snp(mt.alleles[0], mt.alleles[1])
                & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold)
                & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)
                & ~mt.was_split).repartition(500, shuffle=False))

            # keep only the GT entry field and force evaluation of the expression
            (mt.select_entries(mt.GT).write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt',
                overwrite=args.overwrite))

        mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt'
        )

        if not args.skip_prune_ld:
            # LD pruning
            # Avoid filtering entries (genotypes) for missingness before running LD pruning.
            # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate"
            # mt = mt.unfilter_entries()

            # Prune variants in linkage disequilibrium.
            # Returns a table of nearly uncorrelated variants
            logger.info(
                f'Pruning variants in LD from MT with {mt.count_rows()} variants...'
            )
            pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2)

            # Keep LD-pruned variants
            pruned_mt = (mt.filter_rows(hl.is_defined(
                pruned_variant_table[mt.row_key]),
                                        keep=True))
            pruned_mt.write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt',
                overwrite=args.overwrite)

        pruned_mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt')
        v, s = pruned_mt.count()
        logger.info(f'{s} samples, {v} variants found in LD-pruned MT')

        pruned_mt = pruned_mt.select_entries(
            GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

        # run PCA, then the pc_relate method computing all statistics
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht',
            overwrite=args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht'
        )
        relatedness_ht = hl.pc_relate(
            call_expr=pruned_mt.GT,
            min_individual_maf=args.min_individual_maf,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=args.min_kinship,
            statistics='all')

        logger.info('Writing relatedness table...')
        # Write/export table to file
        relatedness_ht.write(
            output=f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht',
            overwrite=args.overwrite)

        # Write PCs table to file (if specified)
        # if args.write_to_file:
        #     # Export table to file
        #     relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz')

    # retrieve a maximal independent set of related samples
    logger.info('Getting optimal set of related samples to prune...')
    relatedness_ht = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht')
    relatedness_ht = (relatedness_ht.flatten().rename({
        'i.s': 'i',
        'j.s': 'j'
    }).repartition(100))

    # import trio info
    fam = import_fam_ht()
    mat_ids = hl.set(fam.mat_id.collect())
    fat_ids = hl.set(fam.pat_id.collect())

    # rank samples by retention priority (e.g. cases over controls)
    tb_rank = make_sample_rank_table(get_sample_meta_data())

    # apply the minimum kinship threshold to define related pairs
    relatedness_ht = relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP)

    # run maximal_independent_set stratified by pair group.
    # Note: running the method on all pairs at once removes most of the index
    # cases in trios; we want to keep them, since they are mostly affected
    # individuals rather than parents.

    # define pair groups
    # TODO: check groups with updated fam file
    relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when(
        relatedness_ht.kin > 0.40, 'twins_or_dups').when(
            mat_ids.contains(relatedness_ht.i)
            | mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when(
                fat_ids.contains(relatedness_ht.i)
                | fat_ids.contains(relatedness_ht.j),
                'pairs_child_fat').default('pairs_others')))

    groups = (relatedness_ht.aggregate(
        hl.agg.collect_as_set(relatedness_ht['pairs_group'])))
    tbs = []
    for pair_group in groups:
        pair_ht = relatedness_ht.filter(
            relatedness_ht.pairs_group == pair_group)
        tb = get_related_samples_to_drop(rank_table=tb_rank,
                                         relatedness_ht=pair_ht)
        tbs.append(tb)

    related_samples_to_remove = hl.Table.union(*tbs)
    related_samples_to_remove.describe()

    related_samples_to_remove = related_samples_to_remove.checkpoint(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht',
        overwrite=args.overwrite)

    if args.write_to_file:
        (related_samples_to_remove.flatten().export(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv'
        ))

    hl.stop()
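# The get_related_samples_to_drop helper is not defined in this snippet. A
# hypothetical sketch following the tie-breaker pattern from the Hail docs,
# assuming rank_table is keyed by sample id with an integer `rank` field
# where a lower rank means higher retention priority (e.g. cases ranked
# before controls):
def get_related_samples_to_drop(rank_table: hl.Table,
                                relatedness_ht: hl.Table) -> hl.Table:
    pairs = relatedness_ht.select(
        id1_rank=hl.struct(id=relatedness_ht.i,
                           rank=rank_table[relatedness_ht.i].rank),
        id2_rank=hl.struct(id=relatedness_ht.j,
                           rank=rank_table[relatedness_ht.j].rank))
    # keep the lower-ranked (higher-priority) node of each related pair,
    # returning the samples to drop
    return hl.maximal_independent_set(
        pairs.id1_rank,
        pairs.id2_rank,
        keep=False,
        tie_breaker=lambda l, r: l.rank - r.rank)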