def generate_allele_data(ht: hl.Table) -> hl.Table:
    """
    Returns bi-allelic sites HT with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param Table ht: Full unsplit HT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = ht.select()
    allele_data = hl.struct(
        nonsplit_alleles=ht.alleles,
        has_star=hl.any(lambda a: a == "*", ht.alleles))
    ht = ht.annotate(
        allele_data=allele_data.annotate(**add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    ht = ht.filter(hl.len(ht.alleles) > 1)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), "snv")
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), "ins")
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), "del")
                   .default("complex"))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == "mixed"))
    return ht
def generate_allele_data(mt: hl.MatrixTable) -> hl.Table:
    """
    Returns bi-allelic sites Table with the following annotations:
     - allele_data (nonsplit_alleles, has_star, variant_type, and n_alt_alleles)

    :param MatrixTable mt: Full unsplit MT
    :return: Table with allele data annotations
    :rtype: Table
    """
    ht = mt.rows().select()
    allele_data = hl.struct(
        nonsplit_alleles=ht.alleles,
        has_star=hl.any(lambda a: a == '*', ht.alleles))
    ht = ht.annotate(
        allele_data=allele_data.annotate(**add_variant_type(ht.alleles)))

    ht = hl.split_multi_hts(ht)
    allele_type = (hl.case()
                   .when(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv')
                   .when(hl.is_insertion(ht.alleles[0], ht.alleles[1]), 'ins')
                   .when(hl.is_deletion(ht.alleles[0], ht.alleles[1]), 'del')
                   .default('complex'))
    ht = ht.annotate(allele_data=ht.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=ht.allele_data.variant_type == 'mixed'))
    return ht
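# Usage sketch, not part of the original pipeline: assumes `add_variant_type`
# is in scope and that `tmp_dir` points at a working directory as elsewhere in
# this code; the MatrixTable path is hypothetical.
raw_mt = hl.read_matrix_table(f'{tmp_dir}/raw_unsplit.mt')
allele_data_ht = generate_allele_data(raw_mt)
allele_data_ht.describe()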
def main(args):
    print("main")
    run_hash = "91b132aa"
    ht = hl.read_table(
        f'{temp_dir}/ddd-elgh-ukbb/variant_qc/models/{run_hash}/{run_hash}_rf_result_ranked_denovo_ddd_comp.ht'
    )
    mt = hl.read_matrix_table(
        f'{temp_dir}/ddd-elgh-ukbb/Sanger_cohorts_chr1-7and20_split_sampleqc_filtered.mt'
    )
    mt = mt.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt.alleles[0], mt.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt.alleles[0], mt.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt.alleles[0], mt.alleles[1]), "INDEL",
                    "Other"))))

    mt = mt.annotate_rows(info=mt.info.annotate(
        rf_probability=ht[mt.row_key].rf_probability['TP']))
    mt = mt.annotate_rows(info=mt.info.annotate(score=ht[mt.row_key].score))

    filter_column_annotation = (
        hl.case()
        .when(((mt.Variant_Type == "SNP") &
               (mt.info.rf_probability <= 0.90)), "PASS")
        .when(((mt.Variant_Type == "INDEL") &
               (mt.info.rf_probability <= 0.80)), "PASS")
        .default(".")  # remove everything else
    )

    # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation)
    mt1 = mt.annotate_rows(filtercol=filter_column_annotation)
    mt_fail = mt1.filter_rows(mt1.filtercol == ".")
    print(mt_fail.count())

    mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol))
    mt_fail2 = mt2.filter_rows(mt2.filters.contains("."))
    mt_pass = mt2.filter_rows(mt2.filters.contains("PASS"))
    print(mt_fail2.count())
    print(mt_pass.count())

    mt2 = mt2.checkpoint(
        f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.mt',
        overwrite=True)
    hl.export_vcf(
        mt2,
        f'{tmp_dir}/Sanger_cohorts_chr1-7and20_after_RF_final.vcf.bgz',
        parallel='separate_header')
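# Minimal sketch of the set-based filter pattern used above, with made-up
# values: the per-variant verdict string is added to the VCF-derived `filters`
# set and membership is then tested with `contains`.
example_filters = hl.set(["VQSRTrancheSNP99.90to100.00"])
example_filters = example_filters.add("PASS")
print(hl.eval(example_filters.contains("PASS")))  # True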
def generate_split_alleles(mt: hl.MatrixTable) -> hl.MatrixTable:
    allele_data = hl.struct(
        nonsplit_alleles=mt.alleles,
        has_star=hl.any(lambda a: a == '*', mt.alleles))
    mt = mt.annotate_rows(
        allele_data=allele_data.annotate(**add_variant_type(mt.alleles)))

    mt = hl.split_multi_hts(mt, left_aligned=True)

    allele_type = (hl.case()
                   .when(hl.is_snp(mt.alleles[0], mt.alleles[1]), 'snv')
                   .when(hl.is_insertion(mt.alleles[0], mt.alleles[1]), 'ins')
                   .when(hl.is_deletion(mt.alleles[0], mt.alleles[1]), 'del')
                   .default('complex'))
    mt = mt.annotate_rows(allele_data=mt.allele_data.annotate(
        allele_type=allele_type,
        was_mixed=mt.allele_data.variant_type == 'mixed'))
    return mt
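# Usage sketch (assumptions: `raw_mt` is an unsplit MatrixTable whose variants
# are already left-aligned; split_multi_hts(left_aligned=True) above will
# raise at runtime if that assumption does not hold).
split_mt = generate_split_alleles(raw_mt)
snv_mt = split_mt.filter_rows(split_mt.allele_data.allele_type == 'snv')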
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(
        rank_id=ht.rank_bins.rank_id,
        bin=ht.rank_bins.bin
    )
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin
        )._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.sum(ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) &
                (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0,
                hl.agg.count_where(ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #     ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
            #     hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.ac_raw < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter(
                (ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") &
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1),
                hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) &
                (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
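import math

# Worked sketch of the bin-index formula used in `rank_bins` above, with
# made-up counts: given 45,000 ranked SNVs and n_bins = 100, each bin holds
# floor(45000 / 100) = 450 variants, so the variant with 0-based rank 12,345
# lands in bin ceil((12345 + 1) / 450).
print(math.ceil((12345 + 1) / math.floor(45000 / 100)))  # -> 28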
def create_binned_data(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic,
    etc.), contig, snv, bi_allelic and singleton containing the information
    needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """
    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) |
                          (gnomad_ht.info.FS > 60) |
                          (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]),
                                'snv', 'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))
    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1) &
                (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
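# Usage sketch, not from the original pipeline: the path and run hash are
# hypothetical, and the input must carry the *_rank, score, singleton, ac and
# info_ac annotations this function expects.
rank_ht = hl.read_table('gs://my-bucket/variant_qc/rf_9b8f1a2c_ranked.ht')
binned_ht = create_binned_data(rank_ht, data='9b8f1a2c', data_type='exomes', n_bins=100)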
def score_bin_agg(
    ht: hl.GroupedTable, fam_stats_ht: hl.Table
) -> Dict[str, hl.expr.Aggregation]:
    """
    Default aggregation function to add aggregations for min/max of score, number
    of ClinVar variants, number of truth variants (omni, mills, hapmap, and
    kgp_phase1), and family statistics.

    .. note::

        This function uses `ht._parent` to get the origin Table from the
        GroupedTable for the aggregation

    This can easily be combined with the GroupedTable returned by
    `compute_grouped_binned_ht`

    Example:

    .. code-block:: python

        binned_ht = create_binned_ht(...)
        grouped_binned_ht = compute_grouped_binned_ht(binned_ht)
        agg_ht = grouped_binned_ht.aggregate(**score_bin_agg(**grouped_binned_ht, ...))

    .. note::

        The following annotations should be present:

        In ht:
            - score
            - singleton
            - positive_train_site
            - negative_train_site
            - ac_raw - expected that this is the raw allele count before adj filtering
            - ac - expected that this is the allele count after adj filtering
            - ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC
            - info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters

        In truth_ht:
            - omni
            - mills
            - hapmap
            - kgp_phase1_hc

        In fam_stats_ht:
            - n_de_novos_adj
            - n_de_novos_raw
            - n_transmitted_raw
            - n_untransmitted_raw

    Automatic aggregations that will be done are:
        - `min_score` - minimum of score annotation per group
        - `max_score` - maximum of score annotation per group
        - `n` - count of variants per group
        - `n_ins` - count of insertions per group
        - `n_del` - count of deletions per group
        - `n_ti` - count of transitions per group
        - `n_tv` - count of transversions per group
        - `n_1bp_indel` - count of one base pair indels per group
        - `n_mod3bp_indel` - count of indels with a length divisible by three per group
        - `n_singleton` - count of singletons per group
        - `fail_hard_filters` - count of variants per group with QD < 2 | FS > 60 | MQ < 30
        - `n_pos_train` - count of variants that were a VQSR positive train site per group
        - `n_neg_train` - count of variants that were a VQSR negative train site per group
        - `n_clinvar` - count of clinvar variants
        - `n_de_novos_singleton_adj` - count of singleton de novo variants after adj filtration
        - `n_de_novo_singleton` - count of raw unfiltered singleton de novo variants
        - `n_de_novos_adj` - count of adj filtered de novo variants
        - `n_de_novo` - count of raw unfiltered de novo variants
        - `n_trans_singletons` - count of transmitted singletons
        - `n_untrans_singletons` - count of untransmitted singletons
        - `n_omni` - count of omni truth variants
        - `n_mills` - count of mills truth variants
        - `n_hapmap` - count of hapmap truth variants
        - `n_kgp_phase1_hc` - count of 1000 genomes phase 1 high confidence truth variants

    :param ht: Table that aggregation will be performed on
    :param fam_stats_ht: Family statistics HT
    :return: a dictionary containing aggregations to perform on ht
    """
    # Annotate binned table with the evaluation data
    ht = ht._parent
    indel_length = hl.abs(ht.alleles[0].length() - ht.alleles[1].length())
    # Load external evaluation data
    build = get_reference_genome(ht.locus).name
    clinvar = (
        grch37_resources.reference_data.clinvar
        if build == "GRCh37"
        else grch38_resources.reference_data.clinvar
    ).ht()[ht.key]
    truth_data = (
        grch37_resources.reference_data.get_truth_ht()
        if build == "GRCh37"
        else grch38_resources.reference_data.get_truth_ht()
    )[ht.key]
    fam = fam_stats_ht[ht.key]

    return dict(
        min_score=hl.agg.min(ht.score),
        max_score=hl.agg.max(ht.score),
        n=hl.agg.count(),
        n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
        n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
        n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
        n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
        n_1bp_indel=hl.agg.count_where(indel_length == 1),
        n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
        n_singleton=hl.agg.count_where(ht.singleton),
        fail_hard_filters=hl.agg.count_where(
            (ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
        ),
        n_pos_train=hl.agg.count_where(ht.positive_train_site),
        n_neg_train=hl.agg.count_where(ht.negative_train_site),
        n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
        n_de_novos_singleton_adj=hl.agg.filter(
            ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
        ),
        n_de_novo_singleton=hl.agg.filter(
            ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(
            ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_untrans_singletons=hl.agg.filter(
            (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        n_train_trans_singletons=hl.agg.filter(
            (ht.ac_raw == 2) & ht.positive_train_site,
            hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_omni=hl.agg.count_where(truth_data.omni),
        n_mills=hl.agg.count_where(truth_data.mills),
        n_hapmap=hl.agg.count_where(truth_data.hapmap),
        n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
    )
# 2. Remove samples that have not passed initial QC
print("2. Remove samples that have not passed initial QC:")
mt_result = mt.filter_cols(
    hl.is_defined(exclude_samples_table[mt.s]), keep=False)

# 3. Split multi
print("3. Split multi")
mt_split = hl.split_multi_hts(mt_result, keep_star=False)

# 4. Annotate SNPs and indels
print('4. Annotating rows with snp and indel info')
mt = mt_split.annotate_rows(Variant_Type=hl.cond(
    (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
    hl.cond(
        hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
        hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                "INDEL", "Other"))))

# 5. Sample QC and variant QC
print("5. Sample qc and variant qc")
mt_sampleqc = hl.sample_qc(mt, name='sample_QC_Hail')
mt2 = hl.variant_qc(mt_sampleqc, name='variant_QC_Hail')

# 6. Annotate COMMON AND RARE variants to apply separate filters
print("6. Annotate COMMON AND RARE VARIANTS to apply separate filters")
# mt_common = mt_filtered.filter_rows(mt_filtered.variant_qc.AF[1] > 0.05)
mt2 = mt2.annotate_rows(
    maf=hl.cond(mt2.variant_QC_Hail.AF[1] < 0.01, "< 1%",
                hl.cond(mt2.variant_QC_Hail.AF[1] < 0.05, "1%-5%", ">5%")))
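# Sketch of using the `maf` buckets to apply separate downstream filters
# (hypothetical split, not from the original script; `mt2` is the MatrixTable
# annotated just above).
mt_rare = mt2.filter_rows(mt2.maf == "< 1%")
mt_common = mt2.filter_rows((mt2.maf == "1%-5%") | (mt2.maf == ">5%"))
print(mt_rare.count_rows(), mt_common.count_rows())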
def main():
    print("main")
    run_hash = "91ba5f38"
    ht = hl.read_table(
        f'{lustre_dir}/variant_qc/models/{run_hash}_score_binning.ht')
    mt = hl.read_matrix_table(
        f'{lustre_dir}/MegaWESSanger_cohorts_sampleQC_filtered.mt')
    table_cohort = hl.import_table(
        f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv",
        delimiter="\t").key_by('s')

    mt = mt.annotate_cols(cohort=table_cohort[mt.s].cohort)
    df = pd.read_csv(
        f"{lustre_dir}/sanger_cohorts_corrected_ukbb_july_2020.tsv", sep="\t")
    cohorts_array = df.cohort.unique()

    mt = mt.annotate_rows(
        MAF_cohorts=hl.agg.group_by(
            mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AF))
    )
    mt = mt.annotate_rows(
        AN_cohorts=hl.agg.group_by(
            mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AN))
    )
    mt = mt.annotate_rows(
        AC_cohorts=hl.agg.group_by(
            mt.cohort, hl.min(hl.agg.call_stats(mt.GT, mt.alleles).AC))
    )
    mt = mt.annotate_rows(
        missingness_cohorts=hl.agg.group_by(
            mt.cohort,
            hl.min((hl.agg.count_where(hl.is_missing(mt['GT']))) /
                   mt.count_rows() * 2))
    )

    mt = mt.annotate_rows(
        info=mt.info.annotate(cohort_names=mt.MAF_cohorts.keys()))
    mt = mt.annotate_rows(
        info=mt.info.annotate(MAF_cohorts_values=mt.MAF_cohorts.values()))
    mt = mt.annotate_rows(
        info=mt.info.annotate(AN_cohorts_values=mt.AN_cohorts.values()))
    mt = mt.annotate_rows(
        info=mt.info.annotate(AC_cohorts=mt.AC_cohorts.values()))
    mt = mt.annotate_rows(
        info=mt.info.annotate(
            missingness_cohorts_values=mt.missingness_cohorts.values()))

    mt = mt.annotate_rows(
        Variant_Type=hl.cond(
            (hl.is_snp(mt.alleles[0], mt.alleles[1])), "SNP",
            hl.cond(
                hl.is_insertion(mt.alleles[0], mt.alleles[1]), "INDEL",
                hl.cond(hl.is_deletion(mt.alleles[0], mt.alleles[1]),
                        "INDEL", "Other"))))

    mt = mt.annotate_rows(
        info=mt.info.annotate(
            rf_probability=ht[mt.row_key].rf_probability['TP']))
    mt = mt.annotate_rows(info=mt.info.annotate(score=ht[mt.row_key].score))
    mt = mt.annotate_rows(info=mt.info.annotate(bin=ht[mt.row_key].bin))

    filter_column_annotation = (
        hl.case()
        .when(((mt.Variant_Type == "SNP") & (mt.info.bin <= SNV_PASS_BIN)),
              "PASS")
        .when(((mt.Variant_Type == "INDEL") & (mt.info.bin <= INDEL_PASS_BIN)),
              "PASS")
        .default(".")  # not pass for rest
    )

    # mt_annotated = mt.annotate_rows(mt.filters=filter_column_annotation)
    mt1 = mt.annotate_rows(filtercol=filter_column_annotation)
    mt_fail = mt1.filter_rows(mt1.filtercol == ".")
    print(mt_fail.count())

    mt2 = mt1.annotate_rows(filters=mt1.filters.add(mt1.filtercol))
    mt_fail2 = mt2.filter_rows(mt2.filters.contains("."))
    mt_pass = mt2.filter_rows(mt2.filters.contains("PASS"))
    print(f'Failed:{mt_fail2.count()}')
    print(f'Passed:{mt_pass.count()}')

    mt2 = mt2.checkpoint(
        f'{lustre_dir}/variant_qc/megaWES_final_after_RF_{run_hash}.mt',
        overwrite=True)

    # Remove genotype entries and samples
    mt1 = mt2.select_entries()
    mt_fin = mt2.filter_cols(mt2['s'] == 'sample')

    chroms = [*range(1, 23), "X", "Y"]
    chromosomes = ["chr" + str(chrom) for chrom in chroms]
    for chromosome in chromosomes:
        print(chromosome)
        mt = mt_fin.filter_rows(mt_fin.locus.contig == chromosome)
        mt.write(
            f'{lustre_dir}/final_matrixtables_VCFs/{chromosome}_after_RF_{run_hash}_NOSAMPLES_GT.mt',
            overwrite=True)
        hl.export_vcf(
            mt,
            f'{lustre_dir}/final_matrixtables_VCFs/VCFs/{chromosome}_after_RF_{run_hash}_LOCI_only',
            parallel='separate_header')
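# Hypothetical entry point, not in the original script: this main() takes no
# arguments, so a standard guard is enough to run the pipeline (hl.init is
# assumed to have been called during module setup).
if __name__ == "__main__":
    main()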