Beispiel #1
0
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
Beispiel #2
0
 def test_mendel_errors(self):
     dataset = self.get_dataset()
     men, fam, ind, var = hl.mendel_errors(dataset, hl.Pedigree.read(resource('sample.fam')))
     men.select('fam_id', 's', 'code')
     fam.select('pat_id', 'children')
     self.assertEqual(list(ind.key), ['s'])
     self.assertEqual(list(var.key), ['locus', 'alleles'])
     dataset.annotate_rows(mendel=var[dataset.locus, dataset.alleles]).count_rows()
Beispiel #3
0
def family_stats(mt: hl.MatrixTable, ped: hl.Pedigree, group_name: str) -> Tuple[hl.expr.StructExpression, hl.Table]:
    tdt_table = hl.transmission_disequilibrium_test(mt, ped)
    _, _, per_sample, per_variant = hl.mendel_errors(mt.GT, ped)
    family_stats_struct = hl.struct(mendel=per_variant[mt.row_key],
                                    tdt=tdt_table[mt.row_key],
                                    unrelated_qc_callstats=hl.agg.filter(mt.unrelated_sample,
                                                                         hl.agg.call_stats(mt.GT, mt.alleles)),
                                    meta={'group': group_name})
    return family_stats_struct, per_sample
Beispiel #4
0
    def test_mendel_errors(self):
        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr))
        self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   mendel_code=hl.tint))
        self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr))
        self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   children=hl.tint,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
        self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr)))
        self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   errors=hl.tint64))

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        self.assertEqual(set(fam.select('errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', errors=41, snp_errors=39),
                             hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', errors=0, snp_errors=0)
                         })

        self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                             hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                             hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                             hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                             hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                         })

        to_keep = hl.set([
            (hl.Locus("1", 1), ['C', 'CT']),
            (hl.Locus("1", 2), ['C', 'T']),
            (hl.Locus("X", 1), ['C', 'T']),
            (hl.Locus("X", 3), ['C', 'T']),
            (hl.Locus("Y", 1), ['C', 'T']),
            (hl.Locus("Y", 3), ['C', 'T'])
        ])
        self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                         .order_by('locus')
                         .select('locus', 'alleles', 'errors').collect(),
                         [
                             hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                         ])

        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
Beispiel #5
0
    def test_mendel_errors(self):
        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr))
        self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   mendel_code=hl.tint))
        self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr))
        self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   children=hl.tint,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
        self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr)))
        self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   errors=hl.tint64))

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        self.assertEqual(set(fam.select('children', 'errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', children=2,
                                             errors=41, snp_errors=39),
                             hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', children=1,
                                             errors=0, snp_errors=0)
                         })

        self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                             hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                             hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                             hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                             hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                         })

        to_keep = hl.set([
            (hl.Locus("1", 1), ['C', 'CT']),
            (hl.Locus("1", 2), ['C', 'T']),
            (hl.Locus("X", 1), ['C', 'T']),
            (hl.Locus("X", 3), ['C', 'T']),
            (hl.Locus("Y", 1), ['C', 'T']),
            (hl.Locus("Y", 3), ['C', 'T'])
        ])
        self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                         .order_by('locus')
                         .select('locus', 'alleles', 'errors').collect(),
                         [
                             hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                         ])

        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))
Beispiel #6
0
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float,
                old_version: str, overwrite: bool) -> None:
    """
    Creates and writes a dataframe with metadata to evaluate gnomAD trios from the raw ped file.
    In order to compare the raw ped, metadata is also generated for:
    1) A number of fake families are generated
    2) The previous iteration of the ped file (old_version)

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate as a proportion of the number of real families in the data
    :param str old_version: Version of previous iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """

    raw_ped = hl.Pedigree.read(raw_fam_path(related_data.data_type),
                               delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    fake_fams = create_fake_pedigree(n_fake_trios,
                                     list(related_data.meta_pd.s), raw_ped)

    fake_fams.write(fake_fam_path(related_data.data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Run mendel errors on families made of random samples to establish expectation in non-trios:
    pedigrees = [('new', raw_ped),
                 ('old',
                  hl.Pedigree.read(fam_path(related_data.data_type,
                                            version=old_version),
                                   delimiter="\\t")),
                 ('fake',
                  hl.Pedigree.read(fake_fam_path(related_data.data_type),
                                   delimiter="\\t"))]

    ped_pd = merge_pedigree_pandas([(name, ped_to_pandas(ped))
                                    for name, ped in pedigrees],
                                   related_data.sample_to_dups, True)

    # Run mendel_errors
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(related_data.data_type)
    fam_samples = hl.literal({
        s
        for trio in all_ped.trios for s in [trio.s, trio.mat_id, trio.pat_id]
    })
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(
        gnomad['GT'], all_ped)

    all_errors.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "all_errors"),
                     overwrite=overwrite)
    per_fam.write(sample_qc_mendel_ht_path(related_data.data_type, "per_fam"),
                  overwrite=overwrite)
    per_sample.write(sample_qc_mendel_ht_path(related_data.data_type,
                                              "per_sample"),
                     overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT
    sql_context = SQLContext(hl.spark_context())
    hl.Table.from_spark(sql_context.createDataFrame(ped_pd)).write(
        merged_pedigrees_ht_path(related_data.data_type), overwrite=overwrite)
Beispiel #7
0
def compute_mendel_denovos(mt, pedigree):
    all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(mt.GT, pedigree)
    mendel_de_novos = all_errors.filter(hl.literal({1,2,5,8}).contains(all_errors.mendel_code))
    return mendel_de_novos