def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.rbind(
            old_entry.LGT,
            lambda lgt: hl.if_else(
                lgt.is_non_ref(),
                hl.downcode(lgt, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                lgt))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.rbind(
            old_entry.LPGT,
            lambda lpgt: hl.if_else(
                lpgt.is_non_ref(),
                hl.downcode(lpgt, hl.or_else(local_a_index, hl.len(old_entry.LA))),
                lpgt))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        non_ref_ad = hl.or_else(old_entry.LAD[local_a_index], 0)  # zeroed if not in LAD
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return (hl.case()
            .when(hl.len(ds.alleles) == 1,
                  old_entry.annotate(
                      **{f[1:]: old_entry[f]
                         for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields}
                  ).drop(*dropped_fields))
            .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                  old_entry.annotate(
                      **{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                         for f, e in new_exprs.items()}
                  ).drop(*dropped_fields))
            .default(old_entry.annotate(**new_exprs).drop(*dropped_fields)))
def load_covid_data(all_samples_ht: hl.Table, covid_data_path: str, wave: str = '01'): print(f'Loading COVID wave {wave}...') covid_ht = hl.import_table(covid_data_path, delimiter='\t', missing='', impute=True, key='eid') covid_ht = covid_ht.group_by('eid').aggregate( origin=hl.agg.any(covid_ht.origin == 1), result=hl.agg.any(covid_ht.result == 1), inpatient=hl.agg.any(covid_ht.reqorg == 1), ) # TODO: add aoo parse to separate trait_type (covid_quantitative?) # dob = load_dob_ht(pre_phesant_tsv_path)[ht.key].date_of_birth # ht = ht.annotate(aoo=hl.or_missing(ht.result == 1, hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT') - dob), # specdate=hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')).drop('specdate') ht = all_samples_ht.annotate(**covid_ht[all_samples_ht.key]) centers = hl.literal(ENGLAND_RECRUITMENT_CENTERS) analyses = { 'B1_v2': hl.or_missing(ht.result, ht.inpatient), # fka ANA2 'B1_v2_origin': hl.or_missing(ht.result, ht.origin), # fka ANA2 'C2_v2': hl.or_else(ht.result, False), # fka ANA5 'C2_v2_england_controls': hl.or_missing(centers.contains(ht.recruitment_center), # fka ANA5_england_controls hl.or_else(ht.result, False)), 'C1_v2': ht.result, # fka ANA5_strict 'B2_v2': hl.or_else(ht.result & ht.inpatient, False), # fka ANA6 'B2_v2_origin': hl.or_else(ht.result & ht.origin, False) # fka ANA6 } analysis_names = { 'B1_v2': 'Hospitalized vs non-hospitalized (among COVID-19 positive)', # fka ANA2 'B1_v2_origin': 'Hospitalized vs non-hospitalized (among COVID-19 positive; old definition using "origin" field)', # fka ANA2 'C2_v2': 'COVID-19 positive (controls include untested)', # fka ANA5 'C2_v2_england_controls': 'COVID-19 positive (controls include untested), only patients from centers in England', # fka ANA5_england_controls 'C1_v2': 'COVID-19 positive (controls only COVID-19 negative)', # fka ANA5_strict 'B2_v2': 'Hospitalized vs non-hospitalized (controls include untested)', # ANA6 'B2_v2_origin': 'Hospitalized vs non-hospitalized (controls include untested; old definition using "origin" field)' # ANA6 } assert set(analyses.keys()) == set(analysis_names.keys()) ht = ht.select(**analyses) mt = filter_and_annotate_ukb_data(ht, lambda k, v: True, annotate_with_showcase=False, format_col_name=lambda x: x) mt = mt.key_cols_by(trait_type='categorical', phenocode='COVID19', pheno_sex='both_sexes', coding=mt.phenocode, modifier=wave) mt = mt.annotate_cols(description=hl.literal(analysis_names)[mt.coding]) mt.annotate_cols( n_cases=hl.agg.count_where(mt.value == 1.0), n_controls=hl.agg.count_where(mt.value == 0.0) ).cols().show() return mt
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("gencode")
    parser.add_argument("canonical_transcripts")
    parser.add_argument("hgnc")
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Load genes from GTF file
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(args.canonical_transcripts, min_partitions=args.min_partitions)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.drop("transcripts")

    # Annotate genes with information from HGNC
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(genes.previous_symbols)
            .extend(genes.alias_symbols)
            .append(genes.gencode_gene_symbol)
            .map(lambda s: s.upper())
        ),
    )

    genes.describe()
    genes.write(args.output, overwrite=True)
def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self._assert_eq(a1, a4)
def test_from_entry_expr(self):
    mt = get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self._assert_eq(a1, a2)
    self._assert_eq(a1, a3)

    with hl.TemporaryDirectory(ensure_exists=False) as path:
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self._assert_eq(a1, a4)
def annotate_related_pairs(related_pairs: hl.Table, index_col: str) -> hl.Table:
    related_pairs = related_pairs.key_by(**related_pairs[index_col])
    related_pairs = related_pairs.filter(hl.is_missing(case_parents[related_pairs.key]))
    return related_pairs.annotate(
        **{
            index_col: related_pairs[index_col].annotate(
                case_rank=hl.or_else(hl.int(meta_ht[related_pairs.key].is_case), -1),
                dp_mean=hl.or_else(sample_qc_ht[related_pairs.key].sample_qc.dp_stats.mean, -1.0),
            )
        }
    ).key_by()
def test_from_entry_expr(self):
    mt = self.get_dataset()
    mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

    a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
    a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
    a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

    self.assertTrue(np.array_equal(a1, a2))
    self.assertTrue(np.array_equal(a1, a3))

    path = new_temp_file()
    BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
    a4 = BlockMatrix.read(path).to_numpy()
    self.assertTrue(np.array_equal(a1, a4))
def annotate_variants_gnomad_mismatch(mt, gnomad_mismatch_ht):
    """
    Imports a list of 'bad' variants that have significantly different frequencies between gnomad exomes and
    genomes in NFE population.

    :param mt: matrix table to annotate
    :param gnomad_mismatch_ht: string with file location + name of gnomad mismatch variant file to load
    :return: returns annotated matrix table
    """
    gnomad_mismatch_list = hl.read_table(gnomad_mismatch_ht)

    # gnomad_mismatch True/False boolean annotation
    gnomad_mismatch_list = gnomad_mismatch_list.annotate(gnomad_mismatch=True)

    mt = mt.annotate_rows(
        gnomad_mismatch_pvalue=gnomad_mismatch_list.index(mt.row_key).p_value,
        gnomad_mismatch_variantid=gnomad_mismatch_list.index(mt.row_key).variant,
        gnomad_mismatch=gnomad_mismatch_list.index(mt.row_key).gnomad_mismatch)

    # Fill in empty values for gnomad mismatch with False
    mt = mt.annotate_rows(gnomad_mismatch=hl.or_else(mt.gnomad_mismatch, False))

    mt = mt.annotate_globals(gnomad_mismatch_file=gnomad_mismatch_ht)

    return mt
def merge_alleles(alleles): from hail.expr.functions import _num_allele_type, _allele_ints return hl.rbind( alleles.map(lambda a: hl.or_else(a[0], '')) .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''), lambda ref: hl.rbind( alleles.map( lambda al: hl.rbind( al[0], lambda r: hl.array([ref]).extend( al[1:].map( lambda a: hl.rbind( _num_allele_type(r, a), lambda at: hl.cond( (_allele_ints['SNP'] == at) | (_allele_ints['Insertion'] == at) | (_allele_ints['Deletion'] == at) | (_allele_ints['MNP'] == at) | (_allele_ints['Complex'] == at), a + ref[hl.len(r):], a)))))), lambda lal: hl.struct( globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))), local=lal)))
def make_sumstats_bm(sumstats_bm_path, high_quality):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    clump_mt = hl.read_matrix_table(
        get_clumping_results_path(high_quality_only=high_quality)).rename({'pop': 'clump_pops'})
    mt = all_axis_join(meta_mt, clump_mt)

    mt = separate_results_mt_by_pop(mt, 'clump_pops', 'plink_clump', skip_drop=True)
    mt = separate_results_mt_by_pop(mt, 'meta_analysis_data', 'meta_analysis', skip_drop=True)
    mt = mt.filter_cols(mt.meta_analysis_data.pop == mt.clump_pops)
    mt = explode_by_p_threshold(mt).unfilter_entries()

    mt = mt.filter_cols((mt.description == 'Type 2 diabetes') & (mt.p_threshold == 1))
    BlockMatrix.write_from_entry_expr(
        hl.or_else(
            mt.meta_analysis.BETA
            * hl.is_defined(mt.plink_clump.TOTAL)
            * hl.int(mt.meta_analysis.Pvalue < mt.p_threshold),
            0.0),
        sumstats_bm_path,
        overwrite=True)
def pull_out_worst_from_tx_annotate(mt):
    csq_order = []
    for loftee_filter in ["HC", "LC"]:
        for no_flag in [True, False]:
            for consequence in CSQ_CODING_HIGH_IMPACT:
                csq_order.append((loftee_filter, no_flag, consequence))

    # prioritization of mis and syn variant on protein coding transcripts
    csq_order.extend([(hl.null(hl.tstr), True, x)
                      for x in CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT])

    # Any variant on a non protein coding transcript (ie. where LOF = None)
    csq_order.extend([(hl.null(hl.tstr), True, x)
                      for x in CSQ_CODING_HIGH_IMPACT + CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT])

    csq_order = hl.literal({x: i for i, x in enumerate(csq_order)})

    mt = mt.annotate_rows(**hl.sorted(
        mt.tx_annotation,
        key=lambda x: csq_order[(x.lof, hl.or_else(hl.is_missing(x.lof_flag), False), x.csq)])[0])

    return mt
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError(
            "hwe_normalize: found 0 variants after filtering out monomorphic sites.")
    info(f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites.")

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(
        __hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt * (2 - mt.__mean_gt) * n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else((mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
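# Usage sketch (not from the snippet above; paths and the downstream use are assumptions).
# hwe_normalize returns an entry expression in which missing genotypes are mean-imputed to 0.0,
# which is what PCA-style linear algebra expects.
from hail.linalg import BlockMatrix

mt = hl.read_matrix_table('data/example.mt')      # assumed input with a GT call field
normalized = hwe_normalize(mt.GT)
bm = BlockMatrix.from_entry_expr(normalized)      # variants x samples
# One possible downstream use: a sample-by-sample relatedness-style matrix.
rel = (bm.T @ bm) / bm.shape[0]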
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([s for trio in merged_ped.trios for s in [trio.s, trio.pat_id, trio.mat_id]]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
def combine_pheno_files(pheno_file_dict: dict): full_mt: hl.MatrixTable = None for data_type, mt in pheno_file_dict.items(): if 'pheno' in list(mt.col_key): mt = mt.key_cols_by(pheno=hl.str(mt.pheno), coding=mt.coding) criteria = mt.value if data_type == 'categorical' else hl.is_defined( mt.value) mt = mt.annotate_cols(n_cases=hl.agg.count_where(criteria)) mt = mt.select_entries(value=hl.float64(mt.value)) elif 'icd_code' in list(mt.col_key): mt = mt.key_cols_by(pheno=mt.icd_code, coding=mt.icd_version) mt = mt.filter_cols(mt.truncated) mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.any_codes)) mt = mt.select_entries(value=hl.float64(mt.any_codes)) elif 'phecode' in list(mt.col_key): mt = mt.key_cols_by(pheno=mt.phecode, coding=mt.phecode_sex) mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.case_control)) mt = mt.select_entries(value=hl.float64(mt.case_control)) elif 'Generic_Name' in list(mt.col_key): mt = mt.select_entries( value=hl.float64(hl.or_else(hl.len(mt.values) > 0, False))) mt2 = mt.group_cols_by( pheno=mt.Drug_Category_and_Indication, coding=mt.Drug_Category_and_Indication).aggregate( value=hl.float64(hl.agg.any(mt.value > 0))) mt = mt.key_cols_by( pheno=mt.Generic_Name, coding=mt.Drug_Category_and_Indication).select_cols() mt = mt.union_cols(mt2) mt = mt.annotate_cols(n_cases=hl.int64(hl.agg.sum(mt.value))) else: raise ValueError( 'pheno or icd_code not in column key. New data type?') mt = mt.select_cols('n_cases', data_type=data_type, n_defined=hl.agg.count_where( hl.is_defined(mt.value))) if full_mt is None: full_mt = mt else: full_mt = full_mt.union_cols(mt, row_join_type='outer' if data_type == 'prescriptions' else 'inner') full_mt = full_mt.unfilter_entries() return full_mt.select_entries(value=hl.cond( full_mt.data_type == 'prescriptions', hl.or_else(full_mt.value, hl.float64(0.0)), full_mt.value))
def prepare_variant_results(): results_path = pipeline_config.get("SCHEMA", "variant_results_path") annotations_path = pipeline_config.get("SCHEMA", "variant_annotations_path") results = hl.read_table(results_path) results = results.drop("v", "af_case", "af_ctrl") # Add n_denovos to AC_case results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) + hl.or_else(results.n_denovos, 0)) results = results.annotate( source=hl.delimit(hl.sorted(hl.array(results.source)), ", ")) results = results.group_by( "locus", "alleles").aggregate(group_results=hl.agg.collect(results.row_value)) results = results.annotate(group_results=hl.dict( results.group_results.map(lambda group_result: (group_result.analysis_group, group_result.drop("analysis_group"))))) variants = hl.read_table(annotations_path) variants = variants.select( gene_id=variants.gene_id, consequence=hl.case().when( (variants.canonical_term == "missense_variant") & (variants.mpc >= 3), "missense_variant_mpc_>=3").when( (variants.canonical_term == "missense_variant") & (variants.mpc >= 2), "missense_variant_mpc_2-3").when( variants.canonical_term == "missense_variant", "missense_variant_mpc_<2").default( variants.canonical_term), hgvsc=variants.hgvsc_canonical.split(":")[-1], hgvsp=variants.hgvsp_canonical.split(":")[-1], info=hl.struct(cadd=variants.cadd, mpc=variants.mpc, polyphen=variants.polyphen), ) variants = variants.annotate(**results[variants.key]) variants = variants.filter(hl.is_defined(variants.group_results)) return variants
def impute_missing_gp(mt, location: str = 'GP', mean_impute: bool = True):
    mt = mt.annotate_entries(_gp=mt[location])
    if mean_impute:
        mt = mt.annotate_rows(_mean_gp=hl.agg.array_agg(lambda x: hl.agg.mean(x), mt._gp))
        gp_expr = mt._mean_gp
    else:
        gp_expr = [1.0, 0.0, 0.0]
    return mt.annotate_entries(**{location: hl.or_else(mt._gp, gp_expr)}).drop('_gp')
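# Usage sketch (hypothetical paths): mean-impute missing genotype probabilities before a BGEN
# export, which cannot represent missing GP entries. This mirrors how impute_missing_gp is called
# in the export pipeline elsewhere in this collection.
mt = hl.read_matrix_table('data/example.mt')      # assumed to carry a GP array field
mt = impute_missing_gp(mt, location='GP', mean_impute=True)
hl.export_bgen(mt, 'output/example', gp=mt.GP)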
def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.downcode(
            old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(
            old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [old_entry.LAD[0],
             hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return (hl.case()
            .when(hl.len(ds.alleles) == 1,
                  old_entry.annotate(
                      **{f[1:]: old_entry[f]
                         for f in ['LGT', 'LPGT', 'LAD', 'LPL'] if f in fields}
                  ).drop(*dropped_fields))
            .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                  old_entry.annotate(
                      **{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                         for f, e in new_exprs.items()}
                  ).drop(*dropped_fields))
            .default(old_entry.annotate(**new_exprs).drop(*dropped_fields)))
def generate_final_rf_ht( ht: hl.Table, snp_cutoff: Union[int, float], indel_cutoff: Union[int, float], inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF, determine_cutoff_from_bin: bool = False, aggregated_bin_ht: Optional[hl.Table] = None, bin_id: Optional[hl.expr.Int32Expression] = None, ) -> hl.Table: """ Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering. If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF probabilities to use as cutoffs from an aggregated quantile bin Table like one created by `compute_grouped_binned_ht` in combination with `score_bin_agg`. :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table :param ac0_filter_expr: Expression that indicates if a variant should be filtered as allele count 0 (AC0) :param ts_ac_filter_expr: Expression in `ht` that indicates if a variant is a transmitted singleton :param mono_allelic_fiter_expr: Expression indicating if a variant is mono-allelic :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht` :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff :return: Finalized random forest Table annotated with variant filters """ # Determine SNP and indel RF cutoffs if given bin instead of RF probability snp_cutoff_global = hl.struct(min_score=snp_cutoff) indel_cutoff_global = hl.struct(min_score=indel_cutoff) # Add filters to RF HT filters = dict() if ht.any(hl.is_missing(ht.rf_probability["TP"])): raise ValueError("Missing RF probability!") filters["RF"] = ( hl.is_snp(ht.alleles[0], ht.alleles[1]) & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | ( ~hl.is_snp(ht.alleles[0], ht.alleles[1]) & (ht.rf_probability["TP"] < indel_cutoff_global.min_score)) # Fix annotations for release annotations_expr = { "rf_positive_label": hl.or_else(ht.tp, False), "rf_negative_label": ht.fail_hard_filters, "rf_probability": ht.rf_probability["TP"], } ht = ht.transmute(filters=add_filters_expr(filters=filters), **annotations_expr) ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global, rf_indel_cutoff=indel_cutoff_global) return ht
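# Hedged sketch only: the docstring above describes deriving the RF probability cutoff from a
# quantile bin when `determine_cutoff_from_bin` is True, but that branch is not shown in the body.
# One plausible shape, assuming `aggregated_bin_ht` carries `bin_id`, `snv` (bool), `bin`, and
# `min_score` fields; these names are assumptions, not taken from the source.
def cutoff_from_bin(aggregated_bin_ht, bin_id, bin_cutoff, snv=True):
    ht = aggregated_bin_ht.filter(
        (aggregated_bin_ht.bin_id == bin_id)
        & (aggregated_bin_ht.snv == snv)
        & (aggregated_bin_ht.bin == bin_cutoff))
    # The probability cutoff is the lowest RF score that still falls in the requested bin.
    return ht.aggregate(hl.agg.min(ht.min_score))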
def prepare_gene_models(): genes_grch37 = prepare_gene_models_helper("GRCh37") genes_grch38 = prepare_gene_models_helper("GRCh38") genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value) genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value) genes = genes_grch37.join(genes_grch38, how="outer") # Annotate genes with information from HGNC hgnc_path = pipeline_config.get("reference_data", "hgnc_path") hgnc = load_hgnc(hgnc_path) genes = genes.annotate(**hgnc[genes.gene_id]) genes = genes.annotate( symbol=hl.or_else(genes.symbol, hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol)), ) # Collect all fields that can be used to search by gene symbol genes = genes.annotate( search_terms=hl.set( hl.empty_array(hl.tstr) .append(genes.symbol) .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr))) .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr))) .append(genes.GRCh38.gencode_gene_symbol) .append(genes.GRCh37.gencode_gene_symbol) .filter(hl.is_defined) .map(lambda s: s.upper()) ), ) gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path") gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path) genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id]) exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path") exac_constraint = prepare_exac_constraint(exac_constraint_path) genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id]) staging_path = pipeline_config.get("output", "staging_path") genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
def compute_last_ref_block_end(mt: hl.MatrixTable) -> hl.Table: """ This function takes a sparse MT and computes for each row the genomic position of the most upstream reference block overlapping that row. Note that since reference blocks do not extend beyond contig boundaries, only the position is kept. This function returns a Table with that annotation. (`last_END_position`). :param mt: Input MatrixTable :return: Output Table with `last_END_position` annotation """ mt = mt.select_entries("END") # Localize entries, so that they can be viewed as an array and scanned over using hl.scan.array_agg ht = mt._localize_entries("__entries", "__cols") # Compute the position by using hl.scan._prev_nonnull. # This was inspired by hl.experimental.densify # _prev_non_null is an aggregator that keeps the previous record in memory # and updates it with the given value at the row if it's not null (missing) # The following code computes the following annotation for each row: # 1. Keep a scan of the entries using _prev_nonnull, keeping the start (ht.locus) and end (entry.END) of each ref block (1.1) # 2. For the current row locus, record the start of the block that starts the furthest away, # that is the minimum position in the current scan for any block that overlaps the current locus (2.1) ht = ht.select( last_END_position=hl.or_else( hl.min( # 2. For the current row locus, record the start of the block that starts the furthest away hl.scan.array_agg( lambda entry: hl.scan._prev_nonnull( # 1. Keep a scan of the entries using _prev_nonnull hl.or_missing( hl.is_defined( entry.END ), # Update the scan whenever a new ref block is encountered hl.tuple( [ # 1.1 keep the start (ht.locus) and end (entry.END) of each ref block ht.locus, entry.END, ] ), ) ), ht.__entries, ).map( lambda x: hl.or_missing( # 2.1 get the start position of blocks that overlap the current locus (x[1] >= ht.locus.position) & (x[0].contig == ht.locus.contig), x[0].position, ) ) ), ht.locus.position, ) ) return ht.select_globals()
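# Usage sketch (assumed path; assumes the sparse MT and the returned table share the same row key).
# Joining last_END_position back onto the MT tells you, at each row, how far upstream a reference
# block could start and still overlap that position.
mt = hl.read_matrix_table('data/sparse.mt')       # assumed sparse MT with END entries
last_end_ht = compute_last_ref_block_end(mt)
mt = mt.annotate_rows(last_END_position=last_end_ht[mt.row_key].last_END_position)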
def load_hgnc(hgnc_path, min_partitions=8):
    hgnc = hl.import_table(hgnc_path, min_partitions=min_partitions, missing="")
    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"].split(",").map(lambda s: s.strip()),
        alias_symbols=hgnc["Alias symbols"].split(",").map(lambda s: s.strip()),
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"], hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")
    return hgnc
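# Usage sketch (hypothetical paths/fields): join the HGNC table onto a gene table keyed by Ensembl
# gene id, falling back to the GENCODE symbol when HGNC has no record. `gencode_gene_symbol` is an
# assumed field on the gene table, as in the gene-model pipelines above.
hgnc = load_hgnc('data/hgnc.tsv')
genes = hl.read_table('data/genes.ht')            # assumed to be keyed by gene_id
genes = genes.annotate(**hgnc[genes.gene_id])
genes = genes.annotate(symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol))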
def test(self): schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr, f=hl.tarray(hl.tint32), g=hl.tarray( hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)), h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr), i=hl.tbool, j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)) rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5, 'e': "hello", 'f': [1, 2, 3], 'g': [hl.Struct(x=1, y=5, z='banana')], 'h': hl.Struct(a=5, b=3, c='winter'), 'i': True, 'j': hl.Struct(x=3, y=2, z='summer')}] kt = hl.Table.parallelize(rows, schema) result = convert_struct_to_dict(kt.annotate( chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d), ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5), dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])), dpois=hl.dpois(4, kt.a), drop=kt.h.drop('b', 'c'), exp=hl.exp(kt.c), fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d), hwe=hl.hardy_weinberg_p(1, 2, 1), index=hl.index(kt.g, 'z'), is_defined=hl.is_defined(kt.i), is_missing=hl.is_missing(kt.i), is_nan=hl.is_nan(hl.float64(kt.a)), json=hl.json(kt.g), log=hl.log(kt.a, kt.b), log10=hl.log10(kt.c), or_else=hl.or_else(kt.a, 5), or_missing=hl.or_missing(kt.i, kt.j), pchisqtail=hl.pchisqtail(kt.a, kt.b), pcoin=hl.rand_bool(0.5), pnorm=hl.pnorm(0.2), pow=2.0 ** kt.b, ppois=hl.ppois(kt.a, kt.b), qchisqtail=hl.qchisqtail(kt.a, kt.b), range=hl.range(0, 5, kt.b), rnorm=hl.rand_norm(0.0, kt.b), rpois=hl.rand_pois(kt.a), runif=hl.rand_unif(kt.b, kt.a), select=kt.h.select('c', 'b'), sqrt=hl.sqrt(kt.a), to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)], where=hl.cond(kt.i, 5, 10) ).take(1)[0])
def calculate_new_intervals(ht, n, reference_genome): """takes a table, keyed by ['locus', ...] and produces a list of intervals suitable for repartitioning a combiner matrix table Parameters ---------- ht : :class:`.Table` Table / Rows Table to compute new intervals for n : :obj:`int` Number of rows each partition should have, (last partition may be smaller) reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional Reference genome to use. Returns ------- :obj:`List[Interval]` """ assert list(ht.key) == ['locus'] assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome) end = hl.Locus(reference_genome.contigs[-1], reference_genome.lengths[reference_genome.contigs[-1]], reference_genome=reference_genome) n_rows = ht.count() if n_rows == 0: raise ValueError('empty table!') ht = ht.select() ht = ht.annotate(x=hl.scan.count()) ht = ht.annotate(y=ht.x + 1) ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1))) ht = ht.select() ht = ht.annotate(start=hl.or_else( hl.scan._prev_nonnull( hl.locus_from_global_position(ht.locus.global_position() + 1, reference_genome=reference_genome)), hl.locus_from_global_position(0, reference_genome=reference_genome))) ht = ht.key_by() ht = ht.select( interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True)) intervals = ht.aggregate(hl.agg.collect(ht.interval)) last_st = hl.eval( hl.locus_from_global_position( hl.literal(intervals[-1].end).global_position() + 1, reference_genome=reference_genome)) interval = hl.Interval(start=last_st, end=end, includes_end=True) intervals.append(interval) return intervals
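# Usage sketch (assumed path; `_intervals` is an underscored, internal argument of
# hl.read_matrix_table, so treat this as illustrative rather than a stable API): compute intervals
# of roughly 10,000 rows from the rows table, then re-read the dataset partitioned on them.
mt_path = 'data/combined.mt'
ht = hl.read_matrix_table(mt_path).rows()
ht = ht.key_by('locus')                           # the function expects key == ['locus']
intervals = calculate_new_intervals(ht, n=10000, reference_genome=hl.get_reference('GRCh38'))
mt = hl.read_matrix_table(mt_path, _intervals=intervals)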
def main(args): hl.init(master=f'local[{args.n_threads}]', log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'), default_reference=args.reference) sys.path.append('/') add_args = [] if args.additional_args is not None: add_args = args.additional_args.split(',') load_module = importlib.import_module(args.load_module) mt = getattr(load_module, args.load_mt_function)(*add_args) if args.gene_map_ht_path is None: interval = [hl.parse_locus_interval(args.interval)] else: gene_ht = hl.read_table(args.gene_map_ht_path) if args.gene is not None: gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene) interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False) else: interval = [hl.parse_locus_interval(args.interval)] gene_ht = hl.filter_intervals(gene_ht, interval) gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation)) gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation, variant=hl.delimit(gene_ht.variants, '\t') ).key_by().drop('start').export(args.group_output_file, header=False) # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants` if not args.no_adj: mt = mt.filter_entries(mt.adj) mt = hl.filter_intervals(mt, interval) if not args.input_bgen: mt = mt.select_entries('GT') mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0) mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1]) if args.callrate_filter: mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter) if args.export_bgen: if not args.input_bgen: mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT)) mt = gt_to_gp(mt) mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing) hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid) else: mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0))) # Note: no mean-imputation for VCF hl.export_vcf(mt, args.output_file)
def with_pl(pl):
    new_exprs = {}
    dropped_fields = ['LA']
    if 'LGT' in fields:
        new_exprs['GT'] = hl.downcode(old_entry.LGT,
                                      hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LGT')
    if 'LPGT' in fields:
        new_exprs['PGT'] = hl.downcode(old_entry.LPGT,
                                       hl.or_else(local_a_index, hl.len(old_entry.LA)))
        dropped_fields.append('LPGT')
    if 'LAD' in fields:
        new_exprs['AD'] = hl.or_missing(
            hl.is_defined(old_entry.LAD),
            [old_entry.LAD[0],
             hl.or_else(old_entry.LAD[local_a_index], 0)])  # second entry zeroed for lack of non-ref AD
        dropped_fields.append('LAD')
    if 'LPL' in fields:
        new_exprs['PL'] = pl
        if 'GQ' in fields:
            new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)
        dropped_fields.append('LPL')

    return hl.cond(hl.len(ds.alleles) == 1,
                   old_entry.annotate(**{f[1:]: old_entry[f]
                                         for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                         if f in fields}).drop(*dropped_fields),
                   old_entry.annotate(**new_exprs).drop(*dropped_fields))
def project_pcs_relateds(mt_ldpruned, mt, covar_pc_num):
    """
    Takes an LD-pruned matrix table, calculates PCs, and projects those PCs back to related individuals included in mt.

    :param mt_ldpruned: matrix table with relatives removed, maf and ld pruned
    :param mt: matrix table with relatives included
    :param covar_pc_num: Number of principal components as covariates to calculate
    :return: returns matrix table with relatives, with PCs annotated
    """
    logging.info('Calculating principal components, annotating main dataset.')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_ldpruned.GT, k=covar_pc_num, compute_loadings=True)

    # Project PCs to related individuals
    # mt of related individuals only, not pop outliers or failing samples QC
    related_mt = mt.filter_cols(
        (mt.related_to_remove == True) & (mt.pop_outlier_sample == False)
        & (hl.len(mt.failing_samples_qc) == 0), keep=True)

    mt_ldpruned = mt_ldpruned.annotate_rows(
        pca_af=hl.agg.mean(mt_ldpruned.GT.n_alt_alleles()) / 2)
    mtrows = mt_ldpruned.rows()
    loadings = loadings.annotate(pca_af=mtrows[loadings.locus, loadings.alleles].pca_af)
    related_scores = pc_project(related_mt, loadings)

    # Add pcs as annotations to main table
    # Explanation: for k principal components in range 0 to covar_pc_num-1,
    # make pc k+1 (to start at pc1 instead of pc0) be the corresponding score (keyed by mt.s) from the table scores
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): scores[mt.s].scores[k] for k in range(covar_pc_num)
    })

    # Add pcs for related individuals
    # Explanation: for k principal components in range from 0 to (covar_pc_num-1)
    # give either the existing pcX, or if missing give the corresponding score (keyed by mt.s)
    # from the table related_scores
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): hl.or_else(mt['pc' + str(k + 1)], related_scores[mt.s].scores[k])
        for k in range(covar_pc_num)
    })
    return mt
def _collect_scatter_plot_data(x: hl.expr.NumericExpression, y: hl.expr.NumericExpression, fields: Dict[str, hl.expr.Expression] = None, n_divisions: int = None, missing_label: str = 'NA') -> pd.DataFrame: expressions = dict() if fields is not None: expressions.update({ k: hl.or_else(v, missing_label) if isinstance( v, hl.expr.StringExpression) else v for k, v in fields.items() }) if n_divisions is None: collect_expr = hl.struct(_x=x, _y=y, **expressions) plot_data = [ point for point in collect_expr.collect() if point._x is not None and point._y is not None ] source_pd = pd.DataFrame(plot_data) else: if not all( isinstance(v, hl.expr.StringExpression) for v in expressions.values()): print( "WARN: only string expressions are supported with `n_divisions` options at this time. Converting to String" ) expressions = { k: hl.str(v) if not isinstance(v, hl.expr.StringExpression) else v for k, v in expressions.items() } agg_f = x._aggregation_method() res = agg_f( hl.agg.downsample( x, y, label=list(expressions.values()) if expressions else None, n_divisions=n_divisions)) source_pd = pd.DataFrame([ dict(_x=point[0], _y=point[1], **dict(zip(expressions, point[2]))) for point in res ]) return source_pd
def get_expr_for_vep_gene_ids_set(vep_transcript_consequences_root, only_coding_genes=False):
    """Expression to compute the set of gene ids in VEP annotations for this variant.

    Args:
        vep_transcript_consequences_root (ArrayExpression): VEP transcript_consequences root in the struct
        only_coding_genes (bool): If set to True, non-coding genes will be excluded.
    Return:
        SetExpression: expression
    """
    expr = vep_transcript_consequences_root
    if only_coding_genes:
        expr = expr.filter(lambda c: hl.or_else(c.biotype, "") == "protein_coding")
    return hl.set(expr.map(lambda c: c.gene_id))
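# Usage sketch (assumes a table carrying a standard VEP `vep` struct): collect the set of
# protein-coding gene ids touched by each variant.
ds = hl.read_table('data/vep_annotated.ht')
ds = ds.annotate(
    coding_gene_ids=get_expr_for_vep_gene_ids_set(
        ds.vep.transcript_consequences, only_coding_genes=True))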
def run_infer_families() -> hl.Pedigree:
    logger.info("Inferring families")
    ped = infer_families(get_relatedness_annotated_ht(), sex.ht(), duplicates.ht())

    # Remove all trios containing any QC-filtered sample
    meta_ht = meta.ht()
    filtered_samples = meta_ht.aggregate(
        hl.agg.filter(
            (hl.len(meta_ht.qc_metrics_filters) > 0)
            | hl.or_else(hl.len(meta_ht.hard_filters) > 0, False),
            hl.agg.collect_as_set(meta_ht.s),
        ))

    return hl.Pedigree(trios=[
        trio for trio in ped.trios
        if trio.s not in filtered_samples
        and trio.pat_id not in filtered_samples
        and trio.mat_id not in filtered_samples
    ])
def all_and_leave_one_out(x, pop_array, all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each of leave-one-out populations.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each of leave-one-out populations. It takes an index of leave-one-out
                  population and the input array, and returns a new value.
    :return: Array of new values for all populations and for each of leave-one-out populations.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x), hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
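# Worked sketch: with three populations and per-population allele counts [10, 2, missing], the
# result is the overall total followed by the total leaving out each population in turn; a missing
# count contributes 0 to the leave-one-out sums.
pops = hl.literal(['afr', 'eas', 'nfe'])
ac = hl.array([hl.int32(10), hl.int32(2), hl.null(hl.tint32)])
hl.eval(all_and_leave_one_out(ac, pops))  # [12, 2, 10, 12]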
def annotate_relateds(mt, relateds_to_remove_file):
    """
    Annotates a matrix table, given a list of individuals (single column, no header) that are related and should be
    removed from subsequent analyses. Does not remove the individuals; simply marks them as 'True' in the column
    field 'related_to_remove'.

    :param mt: matrix table to annotate
    :param relateds_to_remove_file: file containing IDs of individuals to remove from analyses needing independent cases
    :return: returns annotated matrix table with new column variable 'related_to_remove'
    """
    # Import list of related individuals to remove, generated by python code/networkx
    relatives = hl.import_table(relateds_to_remove_file, no_header=True)
    relatives = relatives.annotate(related_to_remove=True)
    relatives = relatives.key_by('f0')

    # Annotate matrix table with relatives to remove
    mt = mt.annotate_cols(related_to_remove=relatives[mt.s].related_to_remove)
    mt = mt.annotate_cols(related_to_remove=hl.or_else(mt.related_to_remove, False))

    return mt
def liftover_annotations(gnomad_37_path, gnomad_38_path, annotated_gnomad_38_path):
    """
    The 38 liftover of gnomAD is stripped of all global and row annotations. This function annotates the 38 liftover
    with the original 37 annotations as the combined reference data script needs them.

    :param gnomad_37_path: path to 37 version of gnomAD for data type
    :param gnomad_38_path: path to 38 version of gnomAD for data type
    :param annotated_gnomad_38_path: path to annotated 38 version of gnomAD for data type
    :return:
    """
    ht_37 = hl.read_table(gnomad_37_path)
    ht_38 = hl.read_table(gnomad_38_path)

    ht_38 = ht_38.annotate(original_alleles=hl.or_else(ht_38.original_alleles, ht_38.alleles))
    ht_38 = ht_38.key_by('original_locus', 'original_alleles')
    ht_38 = ht_38.annotate(**ht_37[ht_38.key])
    ht_38 = ht_38.annotate_globals(**ht_37.index_globals())
    ht_38 = ht_38.key_by('locus', 'alleles')

    ht_38.write(annotated_gnomad_38_path, overwrite=True)
    return ht_38
def _collect_scatter_plot_data( x: Tuple[str, NumericExpression], y: Tuple[str, NumericExpression], fields: Dict[str, Expression] = None, n_divisions: int = None, missing_label: str = 'NA' ) -> pd.DataFrame: expressions = dict() if fields is not None: expressions.update({k: hail.or_else(v, missing_label) if isinstance(v, StringExpression) else v for k, v in fields.items()}) if n_divisions is None: collect_expr = hail.struct(**dict((k,v) for k,v in (x,y)), **expressions) plot_data = [point for point in collect_expr.collect() if point[x[0]] is not None and point[y[0]] is not None] source_pd = pd.DataFrame(plot_data) else: # FIXME: remove the type conversion logic if/when downsample supports continuous values for labels # Save all numeric types to cast in DataFrame numeric_expr = {k: 'int32' for k,v in expressions.items() if isinstance(v, Int32Expression)} numeric_expr.update({k: 'int64' for k,v in expressions.items() if isinstance(v, Int64Expression)}) numeric_expr.update({k: 'float32' for k, v in expressions.items() if isinstance(v, Float32Expression)}) numeric_expr.update({k: 'float64' for k, v in expressions.items() if isinstance(v, Float64Expression)}) # Cast non-string types to string expressions = {k: hail.str(v) if not isinstance(v, StringExpression) else v for k,v in expressions.items()} agg_f = x[1]._aggregation_method() res = agg_f(hail.agg.downsample(x[1], y[1], label=list(expressions.values()) if expressions else None, n_divisions=n_divisions)) source_pd = pd.DataFrame([ dict( **{x[0]: point[0], y[0]: point[1]}, **(dict(zip(expressions, point[2])) if point[2] is not None else {}) ) for point in res ]) source_pd = source_pd.astype(numeric_expr, copy=False) return source_pd
def reannotate(mt, gatk_ht, summ_ht): """Re-annotate a sparse MT with annotations from certain GATK tools `gatk_ht` should be a table from the rows of a VCF, with `info` having at least the following fields. Be aware that fields not present in this list will be dropped. ``` struct { AC: array<int32>, AF: array<float64>, AN: int32, BaseQRankSum: float64, ClippingRankSum: float64, DP: int32, FS: float64, MQ: float64, MQRankSum: float64, MQ_DP: int32, NEGATIVE_TRAIN_SITE: bool, POSITIVE_TRAIN_SITE: bool, QD: float64, QUALapprox: int32, RAW_MQ: float64, ReadPosRankSum: float64, SB_TABLE: array<int32>, SOR: float64, VQSLOD: float64, VarDP: int32, culprit: str } ``` `summarize_ht` should be the output of :func:`.summarize` as a rows table. Note ---- You will not be able to run :func:`.combine_gvcfs` with the output of this function. """ def check(ht): keys = list(ht.key) if keys[0] != 'locus': raise TypeError(f'table inputs must have first key "locus", found {keys}') if keys != ['locus']: return hl.Table(TableKeyBy(ht._tir, ['locus'], is_sorted=True)) return ht gatk_ht, summ_ht = [check(ht) for ht in (gatk_ht, summ_ht)] return mt.annotate_rows( info=hl.rbind( gatk_ht[mt.locus].info, summ_ht[mt.locus].info, lambda ginfo, hinfo: hl.struct( AC=hl.or_else(hinfo.AC, ginfo.AC), AF=hl.or_else(hinfo.AF, ginfo.AF), AN=hl.or_else(hinfo.AN, ginfo.AN), BaseQRankSum=hl.or_else(hinfo.BaseQRankSum, ginfo.BaseQRankSum), ClippingRankSum=hl.or_else(hinfo.ClippingRankSum, ginfo.ClippingRankSum), DP=hl.or_else(hinfo.DP, ginfo.DP), FS=ginfo.FS, MQ=hl.or_else(hinfo.MQ, ginfo.MQ), MQRankSum=hl.or_else(hinfo.MQRankSum, ginfo.MQRankSum), MQ_DP=hl.or_else(hinfo.MQ_DP, ginfo.MQ_DP), NEGATIVE_TRAIN_SITE=ginfo.NEGATIVE_TRAIN_SITE, POSITIVE_TRAIN_SITE=ginfo.POSITIVE_TRAIN_SITE, QD=ginfo.QD, QUALapprox=hl.or_else(hinfo.QUALapprox, ginfo.QUALapprox), RAW_MQ=hl.or_else(hinfo.RAW_MQ, ginfo.RAW_MQ), ReadPosRankSum=hl.or_else(hinfo.ReadPosRankSum, ginfo.ReadPosRankSum), SB_TABLE=hl.or_else(hinfo.SB_TABLE, ginfo.SB_TABLE), SOR=ginfo.SOR, VQSLOD=ginfo.VQSLOD, VarDP=hl.or_else(hinfo.VarDP, ginfo.VarDP), culprit=ginfo.culprit, )), qual=gatk_ht[mt.locus].qual, filters=gatk_ht[mt.locus].filters, )
def ld_score_regression(weight_expr, ld_score_expr, chi_sq_exprs, n_samples_exprs, n_blocks=200, two_step_threshold=30, n_reference_panel_variants=None) -> Table: r"""Estimate SNP-heritability and level of confounding biases from GWAS summary statistics. Given a set or multiple sets of genome-wide association study (GWAS) summary statistics, :func:`.ld_score_regression` estimates the heritability of a trait or set of traits and the level of confounding biases present in the underlying studies by regressing chi-squared statistics on LD scores, leveraging the model: .. math:: \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j * :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic for variant :math:`j` resulting from a test of association between variant :math:`j` and a trait. * :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant :math:`j`, calculated as the sum of squared correlation coefficients between variant :math:`j` and nearby variants. See :func:`ld_score` for further details. * :math:`a` captures the contribution of confounding biases, such as cryptic relatedness and uncontrolled population structure, to the association test statistic. * :math:`h_g^2` is the SNP-heritability, or the proportion of variation in the trait explained by the effects of variants included in the regression model above. * :math:`M` is the number of variants used to estimate :math:`h_g^2`. * :math:`N` is the number of samples in the underlying association study. For more details on the method implemented in this function, see: * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__ Examples -------- Run the method on a matrix table of summary statistics, where the rows are variants and the columns are different phenotypes: >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=mt_gwas['ld_score'], ... ld_score_expr=mt_gwas['ld_score'], ... chi_sq_exprs=mt_gwas['chi_squared'], ... n_samples_exprs=mt_gwas['n']) Run the method on a table with summary statistics for a single phenotype: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=ht_gwas['chi_squared_50_irnt'], ... n_samples_exprs=ht_gwas['n_50_irnt']) Run the method on a table with summary statistics for multiple phenotypes: >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht') >>> ht_results = hl.experimental.ld_score_regression( ... weight_expr=ht_gwas['ld_score'], ... ld_score_expr=ht_gwas['ld_score'], ... chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'], ... ht_gwas['chi_squared_20160']], ... n_samples_exprs=[ht_gwas['n_50_irnt'], ... ht_gwas['n_20160']]) Notes ----- The ``exprs`` provided as arguments to :func:`.ld_score_regression` must all be from the same object, either a :class:`Table` or a :class:`MatrixTable`. **If the arguments originate from a table:** * The table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and ``n_samples_exprs`` are must be row-indexed fields. * The number of expressions passed to ``n_samples_exprs`` must be equal to one or the number of expressions passed to ``chi_sq_exprs``. 
If just one expression is passed to ``n_samples_exprs``, that sample size expression is assumed to apply to all sets of statistics passed to ``chi_sq_exprs``. Otherwise, the expressions passed to ``chi_sq_exprs`` and ``n_samples_exprs`` are matched by index. * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have generic :obj:`int` values ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc. expressions passed to the ``chi_sq_exprs`` argument. **If the arguments originate from a matrix table:** * The dimensions of the matrix table must be variants (rows) by phenotypes (columns). * The rows of the matrix table must be keyed by fields ``locus`` of type :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of :py:data:`.tstr` elements. * The columns of the matrix table must be keyed by a field of type :py:data:`.tstr` that uniquely identifies phenotypes represented in the matrix table. The column key must be a single expression; compound keys are not accepted. * ``weight_expr`` and ``ld_score_expr`` must be row-indexed fields. * ``chi_sq_exprs`` must be a single entry-indexed field (not a list of fields). * ``n_samples_exprs`` must be a single entry-indexed field (not a list of fields). * The ``phenotype`` field that keys the table returned by :func:`.ld_score_regression` will have values corresponding to the column keys of the input matrix table. This function returns a :class:`Table` with one row per set of summary statistics passed to the ``chi_sq_exprs`` argument. The following row-indexed fields are included in the table: * **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The returned table is keyed by this field. See the notes below for details on the possible values of this field. * **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared test statistic for the given phenotype. * **intercept** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the intercept :math:`1 + Na`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. * **snp_heritability** (`Struct`) -- Contains fields: - **estimate** (:py:data:`.tfloat64`) -- A point estimate of the SNP-heritability :math:`h_g^2`. - **standard_error** (:py:data:`.tfloat64`) -- An estimate of the standard error of this point estimate. Warning ------- :func:`.ld_score_regression` considers only the rows for which both row fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing values in either field are removed prior to fitting the LD score regression model. Parameters ---------- weight_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used to derive variant weights in the model. ld_score_expr : :class:`.Float64Expression` Row-indexed expression for the LD scores used as covariates in the model. chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of :class:`.Float64Expression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions for chi-squared statistics resulting from genome-wide association studies. n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression` One or more row-indexed (if table) or entry-indexed (if matrix table) expressions indicating the number of samples used in the studies that generated the test statistics supplied to ``chi_sq_exprs``. n_blocks : :obj:`int` The number of blocks used in the jackknife approach to estimating standard errors. 
two_step_threshold : :obj:`int` Variants with chi-squared statistics greater than this value are excluded in the first step of the two-step procedure used to fit the model. n_reference_panel_variants : :obj:`int`, optional Number of variants used to estimate the SNP-heritability :math:`h_g^2`. Returns ------- :class:`.Table` Table keyed by ``phenotype`` with intercept and heritability estimates for each phenotype passed to the function.""" chi_sq_exprs = wrap_to_list(chi_sq_exprs) n_samples_exprs = wrap_to_list(n_samples_exprs) assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or (len(n_samples_exprs) == 1)) __k = 2 # number of covariates, including intercept ds = chi_sq_exprs[0]._indices.source analyze('ld_score_regression/weight_expr', weight_expr, ds._row_indices) analyze('ld_score_regression/ld_score_expr', ld_score_expr, ds._row_indices) # format input dataset if isinstance(ds, MatrixTable): if len(chi_sq_exprs) != 1: raise ValueError("""Only one chi_sq_expr allowed if originating from a matrix table.""") if len(n_samples_exprs) != 1: raise ValueError("""Only one n_samples_expr allowed if originating from a matrix table.""") col_key = list(ds.col_key) if len(col_key) != 1: raise ValueError("""Matrix table must be keyed by a single phenotype field.""") analyze('ld_score_regression/chi_squared_expr', chi_sq_exprs[0], ds._entry_indices) analyze('ld_score_regression/n_samples_expr', n_samples_exprs[0], ds._entry_indices) ds = ds._select_all(row_exprs={'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__w_initial_floor': hl.max(weight_expr, 1.0), '__x': ld_score_expr, '__x_floor': hl.max(ld_score_expr, 1.0)}, row_key=['__locus', '__alleles'], col_exprs={'__y_name': ds[col_key[0]]}, col_key=['__y_name'], entry_exprs={'__y': chi_sq_exprs[0], '__n': n_samples_exprs[0]}) ds = ds.annotate_entries(**{'__w': ds.__w_initial}) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) else: assert isinstance(ds, Table) for y in chi_sq_exprs: analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices) for n in n_samples_exprs: analyze('ld_score_regression/n_samples_expr', n, ds._row_indices) ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)] ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)] ds = ds.select(**dict(**{'__locus': ds.locus, '__alleles': ds.alleles, '__w_initial': weight_expr, '__x': ld_score_expr}, **{y: chi_sq_exprs[i] for i, y in enumerate(ys)}, **{w: weight_expr for w in ws}, **{n: n_samples_exprs[i] for i, n in enumerate(ns)})) ds = ds.key_by(ds.__locus, ds.__alleles) table_tmp_file = new_temp_file() ds.write(table_tmp_file) ds = hl.read_table(table_tmp_file) hts = [ds.select(**{'__w_initial': ds.__w_initial, '__w_initial_floor': hl.max(ds.__w_initial, 1.0), '__x': ds.__x, '__x_floor': hl.max(ds.__x, 1.0), '__y_name': i, '__y': ds[ys[i]], '__w': ds[ws[i]], '__n': hl.int(ds[ns[i]])}) for i, y in enumerate(ys)] mts = [ht.to_matrix_table(row_key=['__locus', '__alleles'], col_key=['__y_name'], row_fields=['__w_initial', '__w_initial_floor', '__x', '__x_floor']) for ht in hts] ds = mts[0] for i in range(1, len(ys)): ds = ds.union_cols(mts[i]) ds = ds.filter_rows(hl.is_defined(ds.__locus) & hl.is_defined(ds.__alleles) & hl.is_defined(ds.__w_initial) & hl.is_defined(ds.__x)) mt_tmp_file1 = new_temp_file() ds.write(mt_tmp_file1) mt = hl.read_matrix_table(mt_tmp_file1) if not 
n_reference_panel_variants: M = mt.count_rows() else: M = n_reference_panel_variants # block variants for each phenotype n_phenotypes = mt.count_cols() mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) & (mt.__y < two_step_threshold)), __in_step2=hl.is_defined(mt.__y)) mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()), __m_step1=hl.agg.count_where(mt.__in_step1), __m_step2=hl.agg.count_where(mt.__in_step2)) col_keys = list(mt.col_key) ht = mt.localize_entries(entries_array_field_name='__entries', columns_array_field_name='__cols') ht = ht.annotate(__entries=hl.rbind( hl.scan.array_agg( lambda entry: hl.scan.count_where(entry.__in_step1), ht.__entries), lambda step1_indices: hl.map( lambda i: hl.rbind( hl.int(hl.or_else(step1_indices[i], 0)), ht.__cols[i].__m_step1, ht.__entries[i], lambda step1_idx, m_step1, entry: hl.rbind( hl.map( lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))), hl.range(0, n_blocks + 1)), lambda step1_separators: hl.rbind( hl.set(step1_separators).contains(step1_idx), hl.sum( hl.map( lambda s1: step1_idx >= s1, step1_separators)) - 1, lambda is_separator, step1_block: entry.annotate( __step1_block=step1_block, __step2_block=hl.cond(~entry.__in_step1 & is_separator, step1_block - 1, step1_block))))), hl.range(0, hl.len(ht.__entries))))) mt = ht._unlocalize_entries('__entries', '__cols', col_keys) mt_tmp_file2 = new_temp_file() mt.write(mt_tmp_file2) mt = hl.read_matrix_table(mt_tmp_file2) # initial coefficient estimates mt = mt.annotate_cols(__initial_betas=[ 1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)]) mt = mt.annotate_cols(__step1_betas=mt.__initial_betas, __step2_betas=mt.__initial_betas) # step 1 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step1, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] + mt.__step1_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step1_betas=hl.agg.filter( mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta)) mt = mt.annotate_cols(__step1_h2=hl.max(hl.min( mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step1_betas=[ mt.__step1_betas[0], mt.__step1_h2 * hl.agg.mean(mt.__n) / M]) # step 1 block jackknife mt = mt.annotate_cols(__step1_block_betas=[ hl.agg.filter((mt.__step1_block != i) & mt.__in_step1, hl.agg.linreg(y=mt.__y, x=[1.0, mt.__x], weight=mt.__w).beta) for i in range(n_blocks)]) mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x, mt.__step1_block_betas)) mt = mt.annotate_cols( __step1_jackknife_mean=hl.map( lambda i: hl.mean( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected)), hl.range(0, __k)), __step1_jackknife_variance=hl.map( lambda i: (hl.sum( hl.map(lambda x: x[i]**2, mt.__step1_block_betas_bias_corrected)) - hl.sum( hl.map(lambda x: x[i], mt.__step1_block_betas_bias_corrected))**2 / n_blocks) / (n_blocks - 1) / n_blocks, hl.range(0, __k))) # step 2 iteratively reweighted least squares for i in range(3): mt = mt.annotate_entries(__w=hl.cond( mt.__in_step2, 1.0/(mt.__w_initial_floor * 2.0 * (mt.__step2_betas[0] + mt.__step2_betas[1] * mt.__x_floor)**2), 0.0)) mt = mt.annotate_cols(__step2_betas=[ mt.__step1_betas[0], hl.agg.filter(mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0])]) mt = mt.annotate_cols(__step2_h2=hl.max(hl.min( mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0)) mt = mt.annotate_cols(__step2_betas=[ 
mt.__step1_betas[0], mt.__step2_h2 * hl.agg.mean(mt.__n)/M]) # step 2 block jackknife mt = mt.annotate_cols(__step2_block_betas=[ hl.agg.filter((mt.__step2_block != i) & mt.__in_step2, hl.agg.linreg(y=mt.__y - mt.__step1_betas[0], x=[mt.__x], weight=mt.__w).beta[0]) for i in range(n_blocks)]) mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map( lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x, mt.__step2_block_betas)) mt = mt.annotate_cols( __step2_jackknife_mean=hl.mean( mt.__step2_block_betas_bias_corrected), __step2_jackknife_variance=( hl.sum(mt.__step2_block_betas_bias_corrected**2) - hl.sum(mt.__step2_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks) # combine step 1 and step 2 block jackknifes mt = mt.annotate_entries( __step2_initial_w=1.0/(mt.__w_initial_floor * 2.0 * (mt.__initial_betas[0] + mt.__initial_betas[1] * mt.__x_floor)**2)) mt = mt.annotate_cols( __final_betas=[ mt.__step1_betas[0], mt.__step2_betas[1]], __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) / hl.agg.sum(mt.__step2_initial_w * mt.__x**2))) mt = mt.annotate_cols(__final_block_betas=hl.map( lambda i: (mt.__step2_block_betas[i] - mt.__c * (mt.__step1_block_betas[i][0] - mt.__final_betas[0])), hl.range(0, n_blocks))) mt = mt.annotate_cols( __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] - (n_blocks - 1) * mt.__final_block_betas)) mt = mt.annotate_cols( __final_jackknife_mean=[ mt.__step1_jackknife_mean[0], hl.mean(mt.__final_block_betas_bias_corrected)], __final_jackknife_variance=[ mt.__step1_jackknife_variance[0], (hl.sum(mt.__final_block_betas_bias_corrected**2) - hl.sum(mt.__final_block_betas_bias_corrected)**2 / n_blocks) / (n_blocks - 1) / n_blocks]) # convert coefficient to heritability estimate mt = mt.annotate_cols( phenotype=mt.__y_name, mean_chi_sq=hl.agg.mean(mt.__y), intercept=hl.struct( estimate=mt.__final_betas[0], standard_error=hl.sqrt(mt.__final_jackknife_variance[0])), snp_heritability=hl.struct( estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1], standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 * mt.__final_jackknife_variance[1]))) # format and return results ht = mt.cols() ht = ht.key_by(ht.phenotype) ht = ht.select(ht.mean_chi_sq, ht.intercept, ht.snp_heritability) ht_tmp_file = new_temp_file() ht.write(ht_tmp_file) ht = hl.read_table(ht_tmp_file) return ht
def mendel_errors(call, pedigree) -> Tuple[Table, Table, Table, Table]:
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

    Export all mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

    - `locus` (:class:`.tlocus`) -- Variant locus, key field.
    - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
    - (column key of `dataset`) (:py:data:`.tstr`) -- Proband ID, key field.
    - `fam_id` (:py:data:`.tstr`) -- Family ID.
    - `mendel_code` (:py:data:`.tint32`) -- Mendel error code, see below.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents.

    - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
    - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
    - `fam_id` (:py:data:`.tstr`) -- Family ID.
    - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
    - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this nuclear family.
    - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors at SNPs in this
      nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` column in the table below.

    - (column key of `dataset`) (:py:data:`.tstr`) -- Sample ID (key field).
    - `fam_id` (:py:data:`.tstr`) -- Family ID.
    - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this individual.
    - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
      individual at SNPs.

    **Fourth table:** errors per variant.

    - `locus` (:class:`.tlocus`) -- Variant locus, key field.
    - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
    - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex).

    The code of each Mendel error is determined by the table below, extending
    the `Plink classification
    <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.
    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ of X and Y
    defined by the reference genome, and the autosome is defined by
    :meth:`~hail.genetics.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
    denotes complement in this set.

    +------+---------+---------+--------+------------+---------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    See Also
    --------
    :func:`.mendel_error_code`

    Parameters
    ----------
    call : :class:`.CallExpression`
    pedigree : :class:`.Pedigree`

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
    """
    source = call._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError(
            "'mendel_errors': expected 'call' to be an expression of 'MatrixTable', found {}".format(
                "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    source = source.select_entries(__GT=call)
    dataset = require_biallelic(source, 'mendel_errors')
    tm = trio_matrix(dataset, pedigree, complete_trios=True)
    tm = tm.select_entries(mendel_code=hl.mendel_error_code(
        tm.locus,
        tm.is_female,
        tm.father_entry['__GT'],
        tm.mother_entry['__GT'],
        tm.proband_entry['__GT']
    ))
    ck_name = next(iter(source.col_key))
    tm = tm.filter_entries(hl.is_defined(tm.mendel_code))
    tm = tm.rename({'id': ck_name})

    entries = tm.entries()

    table1 = entries.select('fam_id', 'mendel_code')

    fam_counts = (
        entries
        .group_by(pat_id=entries.father[ck_name],
                  mat_id=entries.mother[ck_name])
        .partition_hint(min(entries.n_partitions(), 8))
        .aggregate(children=hl.len(hl.agg.collect_as_set(entries[ck_name])),
                   errors=hl.agg.count_where(hl.is_defined(entries.mendel_code)),
                   snp_errors=hl.agg.count_where(hl.is_snp(entries.alleles[0], entries.alleles[1])
                                                 & hl.is_defined(entries.mendel_code)))
    )
    table2 = tm.key_cols_by().cols()
    table2 = table2.select(pat_id=table2.father[ck_name],
                           mat_id=table2.mother[ck_name],
                           fam_id=table2.fam_id,
                           **fam_counts[table2.father[ck_name], table2.mother[ck_name]])
    table2 = table2.key_by('pat_id', 'mat_id').distinct()
    table2 = table2.annotate(errors=hl.or_else(table2.errors, hl.int64(0)),
                             snp_errors=hl.or_else(table2.snp_errors, hl.int64(0)))

    # in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
    implicated = hl.literal([
        [0, 0, 0],  # dummy
        [1, 1, 1],
        [1, 1, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [0, 1, 1],
        [0, 1, 1],
        [1, 0, 1],
        [1, 0, 1],
    ], dtype=hl.tarray(hl.tarray(hl.tint64)))

    table3 = tm.annotate_cols(all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]),
                                                    [0, 0, 0]),
                              snp_errors=hl.or_else(
                                  hl.agg.filter(hl.is_snp(tm.alleles[0], tm.alleles[1]),
                                                hl.agg.array_sum(implicated[tm.mendel_code])),
                                  [0, 0, 0])).key_cols_by().cols()

    table3 = table3.select(xs=[
        hl.struct(**{ck_name: table3.father[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[0],
                     'snp_errors': table3.snp_errors[0]}),
        hl.struct(**{ck_name: table3.mother[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[1],
                     'snp_errors': table3.snp_errors[1]}),
        hl.struct(**{ck_name: table3.proband[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[2],
                     'snp_errors': table3.snp_errors[2]}),
    ])
    table3 = table3.explode('xs')
    table3 = table3.select(**table3.xs)
    table3 = (table3.group_by(ck_name, 'fam_id')
              .aggregate(errors=hl.agg.sum(table3.errors),
                         snp_errors=hl.agg.sum(table3.snp_errors))
              .key_by(ck_name))

    table4 = tm.select_rows(errors=hl.agg.count_where(hl.is_defined(tm.mendel_code))).rows()

    return table1, table2, table3, table4
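

# --- Illustrative check of the Mendel error-code table, not library code ----
# hl.mendel_error_code, used by mendel_errors above, maps a trio's genotypes
# and copy state to the codes in the docstring table. A minimal sketch under
# the assumption that literal genotypes and a made-up autosomal GRCh37 locus
# are passed directly: HomRef x HomRef parents with a Het child should map to
# code 2.
def _mendel_code_example():
    import hail as hl  # redundant if hail is already imported at module level
    locus = hl.locus('1', 10000, reference_genome='GRCh37')  # hypothetical autosomal site
    code = hl.mendel_error_code(
        locus,
        hl.bool(True),   # proband is female, so the copy state here is Auto
        hl.call(0, 0),   # father: HomRef
        hl.call(0, 0),   # mother: HomRef
        hl.call(0, 1))   # child: Het
    return hl.eval(code)  # expected: 2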