def run_mendel_errors() -> hl.Table:
    """
    Compute Mendel errors on chr20 for the raw release trios plus 100 fake trios.

    The fake trios are built by `create_fake_pedigree` from a random ~1% draw
    (`hl.rand_bool(0.01)`) of the samples in the metadata table that have an empty
    `qc_metrics_filters` and an empty `hard_filters`. The real and fake trios are
    merged into one pedigree, the gnomAD v3 MatrixTable is restricted to the samples
    of that pedigree and to chr20, split, densified and reduced to rows with exactly
    two alleles before `hl.mendel_errors` is run.

    :return: The first table returned by `hl.mendel_errors` (one row per error).
    """
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    # Candidate samples for fake trios: random draw among samples passing both filter sets.
    randomly_drawn = hl.rand_bool(0.01)
    no_qc_metrics_filters = hl.len(meta_ht.qc_metrics_filters) == 0
    # A missing comparison on hard_filters counts as "not passing".
    no_hard_filters = hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)
    candidate_samples = meta_ht.aggregate(
        hl.agg.filter(
            randomly_drawn & (no_qc_metrics_filters & no_hard_filters),
            hl.agg.collect_as_set(meta_ht.s),
        )
    )
    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(candidate_samples),
        real_pedigree=ped,
    )

    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)
    # Every sample ID (child, father, mother) appearing in any real or fake trio.
    ped_samples = hl.literal(
        {
            sample_id
            for trio in merged_ped.trios
            for sample_id in (trio.s, trio.pat_id, trio.mat_id)
        }
    )

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    chr20 = hl.parse_locus_interval("chr20", reference_genome='GRCh38')
    mt = hl.filter_intervals(mt, [chr20])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    # Only GT and END are carried through densification.
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
def test_mendel_errors(self):
    """Smoke-test `hl.mendel_errors` on the shared dataset with the `sample.fam` pedigree."""
    dataset = self.get_dataset()
    pedigree = hl.Pedigree.read(resource('sample.fam'))
    per_error, per_family, per_individual, per_variant = hl.mendel_errors(dataset, pedigree)

    # Field selection on the first two tables; the results are discarded.
    per_error.select('fam_id', 's', 'code')
    per_family.select('pat_id', 'children')

    # Key fields of the per-individual and per-variant tables.
    self.assertEqual(list(per_individual.key), ['s'])
    self.assertEqual(list(per_variant.key), ['locus', 'alleles'])

    # The per-variant table can be joined back onto the dataset rows.
    annotated = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])
    annotated.count_rows()
def family_stats(
    mt: hl.MatrixTable, ped: hl.Pedigree, group_name: str
) -> Tuple[hl.expr.StructExpression, hl.Table]:
    """
    Build a per-row struct of family-based statistics and return it with the per-sample Mendel table.

    :param mt: MatrixTable with a `GT` entry field and an `unrelated_sample` field
    :param ped: Pedigree passed to both the TDT and the Mendel error computation
    :param group_name: Label stored under the `group` key of the struct's `meta` field
    :return: Tuple of (struct expression with fields `mendel`, `tdt`,
        `unrelated_qc_callstats`, `meta`; third table returned by `hl.mendel_errors`)
    """
    tdt_table = hl.transmission_disequilibrium_test(mt, ped)
    _, _, per_sample, per_variant = hl.mendel_errors(mt.GT, ped)

    # Both result tables are looked up by the MatrixTable's row key.
    mendel_by_row = per_variant[mt.row_key]
    tdt_by_row = tdt_table[mt.row_key]
    # Call statistics aggregated only over columns where `unrelated_sample` is true.
    unrelated_callstats = hl.agg.filter(
        mt.unrelated_sample, hl.agg.call_stats(mt.GT, mt.alleles)
    )

    family_stats_struct = hl.struct(
        mendel=mendel_by_row,
        tdt=tdt_by_row,
        unrelated_qc_callstats=unrelated_callstats,
        meta={'group': group_name},
    )
    return family_stats_struct, per_sample
def test_mendel_errors(self):
    """Check schemas, row counts and error tallies of `hl.mendel_errors` on `mendel.vcf`."""
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))
    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    locus_t = mt.locus.dtype
    alleles_t = hl.tarray(hl.tstr)

    # --- Schemas of the four returned tables ---
    self.assertEqual(
        men.key.dtype,
        hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr))
    self.assertEqual(
        men.row.dtype,
        hl.tstruct(locus=locus_t, alleles=alleles_t, s=hl.tstr,
                   fam_id=hl.tstr, mendel_code=hl.tint))
    self.assertEqual(
        fam.key.dtype,
        hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr))
    self.assertEqual(
        fam.row.dtype,
        hl.tstruct(pat_id=hl.tstr, mat_id=hl.tstr, fam_id=hl.tstr,
                   children=hl.tint, errors=hl.tint64, snp_errors=hl.tint64))
    self.assertEqual(
        ind.key.dtype,
        hl.tstruct(s=hl.tstr))
    self.assertEqual(
        ind.row.dtype,
        hl.tstruct(s=hl.tstr, fam_id=hl.tstr, errors=hl.tint64, snp_errors=hl.tint64))
    self.assertEqual(
        var.key.dtype,
        hl.tstruct(locus=locus_t, alleles=alleles_t))
    self.assertEqual(
        var.row.dtype,
        hl.tstruct(locus=locus_t, alleles=alleles_t, errors=hl.tint64))

    # --- Row counts ---
    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    # --- Per-family tallies: (pat_id, mat_id, errors, snp_errors) ---
    family_tallies = [
        ('Dad1', 'Mom1', 41, 39),
        ('Dad2', 'Mom2', 0, 0),
    ]
    expected_fam = {
        hl.utils.Struct(pat_id=pat, mat_id=mat, errors=n_err, snp_errors=n_snp)
        for pat, mat, n_err, n_snp in family_tallies
    }
    self.assertEqual(set(fam.select('errors', 'snp_errors').collect()), expected_fam)

    # --- Per-individual tallies: (s, errors, snp_errors) ---
    individual_tallies = [
        ('Son1', 23, 22),
        ('Dtr1', 18, 17),
        ('Dad1', 19, 18),
        ('Mom1', 22, 21),
        ('Dad2', 0, 0),
        ('Mom2', 0, 0),
        ('Son2', 0, 0),
    ]
    expected_ind = {
        hl.utils.Struct(s=sample, errors=n_err, snp_errors=n_snp)
        for sample, n_err, n_snp in individual_tallies
    }
    self.assertEqual(set(ind.select('errors', 'snp_errors').collect()), expected_ind)

    # --- Per-variant spot checks: (contig, position, alleles, errors), in locus order ---
    variant_checks = [
        ("1", 1, ['C', 'CT'], 2),
        ("1", 2, ['C', 'T'], 1),
        ("X", 1, ['C', 'T'], 2),
        ("X", 3, ['C', 'T'], 1),
        ("Y", 1, ['C', 'T'], 1),
        ("Y", 3, ['C', 'T'], 1),
    ]
    to_keep = hl.set([
        (hl.Locus(contig, pos), alleles)
        for contig, pos, alleles, _ in variant_checks
    ])
    observed_var = (
        var.filter(to_keep.contains((var.locus, var.alleles)))
        .order_by('locus')
        .select('locus', 'alleles', 'errors')
        .collect()
    )
    expected_var = [
        hl.utils.Struct(locus=hl.Locus(contig, pos), alleles=alleles, errors=n_err)
        for contig, pos, alleles, n_err in variant_checks
    ]
    self.assertEqual(observed_var, expected_var)

    # Errors reported for 'Dtr1' must be the same when the pedigree is read
    # from 'mendelWithMissingSex.fam' instead.
    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)
    dtr1_with_ped2 = men2.filter(men2.s == 'Dtr1')
    dtr1_with_ped = men.filter(men.s == 'Dtr1')
    self.assertTrue(dtr1_with_ped2._same(dtr1_with_ped))
def test_mendel_errors(self):
    """Check schemas, counts, children and error tallies of `hl.mendel_errors` on `mendel.vcf`."""
    mt = hl.import_vcf(resource('mendel.vcf'))
    ped = hl.Pedigree.read(resource('mendel.fam'))
    men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

    # Field groups shared by several of the expected schemas.
    variant_fields = dict(locus=mt.locus.dtype, alleles=hl.tarray(hl.tstr))
    parent_fields = dict(pat_id=hl.tstr, mat_id=hl.tstr)
    tally_fields = dict(errors=hl.tint64, snp_errors=hl.tint64)

    # (actual dtype, expected dtype) for the key and row of each returned table.
    schema_checks = [
        (men.key.dtype, hl.tstruct(**variant_fields, s=hl.tstr)),
        (men.row.dtype, hl.tstruct(**variant_fields, s=hl.tstr,
                                   fam_id=hl.tstr, mendel_code=hl.tint)),
        (fam.key.dtype, hl.tstruct(**parent_fields)),
        (fam.row.dtype, hl.tstruct(**parent_fields, fam_id=hl.tstr,
                                   children=hl.tint, **tally_fields)),
        (ind.key.dtype, hl.tstruct(s=hl.tstr)),
        (ind.row.dtype, hl.tstruct(s=hl.tstr, fam_id=hl.tstr, **tally_fields)),
        (var.key.dtype, hl.tstruct(**variant_fields)),
        (var.row.dtype, hl.tstruct(**variant_fields, errors=hl.tint64)),
    ]
    for actual_dtype, expected_dtype in schema_checks:
        self.assertEqual(actual_dtype, expected_dtype)

    # Row counts of each table.
    self.assertEqual(men.count(), 41)
    self.assertEqual(fam.count(), 2)
    self.assertEqual(ind.count(), 7)
    self.assertEqual(var.count(), mt.count_rows())

    # Per-family rows: (pat_id, mat_id, children, errors, snp_errors).
    family_rows = [
        ('Dad1', 'Mom1', 2, 41, 39),
        ('Dad2', 'Mom2', 1, 0, 0),
    ]
    self.assertEqual(
        set(fam.select('children', 'errors', 'snp_errors').collect()),
        {
            hl.utils.Struct(pat_id=pat, mat_id=mat, children=n_children,
                            errors=n_err, snp_errors=n_snp)
            for pat, mat, n_children, n_err, n_snp in family_rows
        })

    # Per-individual rows: (s, errors, snp_errors).
    individual_rows = [
        ('Son1', 23, 22),
        ('Dtr1', 18, 17),
        ('Dad1', 19, 18),
        ('Mom1', 22, 21),
        ('Dad2', 0, 0),
        ('Mom2', 0, 0),
        ('Son2', 0, 0),
    ]
    self.assertEqual(
        set(ind.select('errors', 'snp_errors').collect()),
        {
            hl.utils.Struct(s=sample, errors=n_err, snp_errors=n_snp)
            for sample, n_err, n_snp in individual_rows
        })

    # Per-variant spot checks, listed in locus order: (contig, position, alleles, errors).
    variant_rows = [
        ("1", 1, ['C', 'CT'], 2),
        ("1", 2, ['C', 'T'], 1),
        ("X", 1, ['C', 'T'], 2),
        ("X", 3, ['C', 'T'], 1),
        ("Y", 1, ['C', 'T'], 1),
        ("Y", 3, ['C', 'T'], 1),
    ]
    to_keep = hl.set([
        (hl.Locus(contig, pos), alleles)
        for contig, pos, alleles, _ in variant_rows
    ])
    selected = var.filter(to_keep.contains((var.locus, var.alleles)))
    selected = selected.order_by('locus').select('locus', 'alleles', 'errors')
    self.assertEqual(
        selected.collect(),
        [
            hl.utils.Struct(locus=hl.Locus(contig, pos), alleles=alleles, errors=n_err)
            for contig, pos, alleles, n_err in variant_rows
        ])

    # Re-run with the pedigree from 'mendelWithMissingSex.fam': the error rows
    # for sample 'Dtr1' must match those from the original pedigree.
    ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
    men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)
    self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float, old_version: str, overwrite: bool) -> None:
    """
    Build and write the metadata used to evaluate the trios of the raw pedigree file.

    Three pedigrees are merged and evaluated together:

        1) 'new': the raw pedigree
        2) 'old': the pedigree of a previous iteration (`old_version`)
        3) 'fake': a freshly generated pedigree of fake trios, written to disk and read back

    Mendel errors are computed over the merged pedigree and the all-errors, per-family and
    per-sample tables are written out. The merged pedigree dataframe is then annotated by
    `add_pedigree_meta` and written as a Hail Table.

    :param GnomADRelatedData related_data: Input data (data type, sample metadata, kinship table, duplicates mapping)
    :param float fake_fam_prop: Number of fake trios to generate, as a proportion of the number of complete trios in the raw pedigree
    :param str old_version: Version of the previous pedigree iteration to load
    :param bool overwrite: Whether to overwrite previously written tables
    :return: Nothing
    :rtype: None
    """
    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type), delimiter="\\t")

    # Size the fake pedigree relative to the number of complete trios in the raw one.
    n_complete_trios = len(raw_ped.complete_trios())
    n_fake_trios = int(fake_fam_prop * n_complete_trios)
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    all_samples = list(related_data.meta_pd.s)
    fake_fams = create_fake_pedigree(n_fake_trios, all_samples, raw_ped)
    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    old_ped = hl.Pedigree.read(
        fam_path(related_data.data_type, version=old_version), delimiter="\\t"
    )
    # The fake pedigree is re-read from the file written above.
    fake_ped = hl.Pedigree.read(fake_fam_path(related_data.data_type), delimiter="\\t")
    named_frames = []
    for label, pedigree in (('new', raw_ped), ('old', old_ped), ('fake', fake_ped)):
        named_frames.append((label, ped_to_pandas(pedigree)))
    ped_pd = merge_pedigree_pandas(named_frames, related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    # Restrict the data to samples that appear in at least one trio of the merged pedigree.
    trio_sample_ids = {
        sample_id
        for trio in all_ped.trios
        for sample_id in (trio.s, trio.mat_id, trio.pat_id)
    }
    fam_samples = hl.literal(trio_sample_ids)
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))

    all_errors, per_fam, per_sample, _ = hl.mendel_errors(gnomad['GT'], all_ped)
    outputs = (
        ("all_errors", all_errors),
        ("per_fam", per_fam),
        ("per_sample", per_sample),
    )
    for table_name, table in outputs:
        table.write(
            sample_qc_mendel_ht_path(related_data.data_type, table_name),
            overwrite=overwrite,
        )

    # Merge all metadata
    ped_pd = add_pedigree_meta(
        ped_pd=ped_pd,
        meta_pd=related_data.meta_pd,
        kin_ht=related_data.kin_ht,
        mendel_per_sample_ht=per_sample,
    )

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    merged_spark_df = sql_context.createDataFrame(ped_pd)
    merged_ht = hl.Table.from_spark(merged_spark_df)
    merged_ht.write(merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
def compute_mendel_denovos(mt, pedigree):
    """
    Return the Mendel error rows whose `mendel_code` is 1, 2, 5 or 8.

    :param mt: MatrixTable with a `GT` entry field
    :param pedigree: Pedigree passed to `hl.mendel_errors`
    :return: The first table returned by `hl.mendel_errors`, filtered on `mendel_code`
    """
    # Only the first of the four returned tables is needed here.
    errors_ht, _, _, _ = hl.mendel_errors(mt.GT, pedigree)

    # NOTE(review): codes 1, 2, 5 and 8 are presumably the configurations in which
    # both parents are homozygous for the same allele - confirm against the Hail
    # Mendel code table.
    de_novo_codes = hl.literal({1, 2, 5, 8})
    return errors_ht.filter(de_novo_codes.contains(errors_ht.mendel_code))