def get_pbt_trio_ht(data_type: str):
    """
    Builds a table of trio members from the fam file, restricted to trios made of high quality samples.

    Only fam rows whose proband, father and mother are all high quality samples
    (excluding samples from `BAD_THAI_TRIOS_PROJECT_ID`) are kept.
    A single row is then kept per father and per mother, and each remaining row is
    exploded into one row per trio member.

    :param str data_type: Data type passed to `get_gnomad_meta` and `fam_path`
    :return: Table keyed by (`s`, `id`), where `s` is a trio member and `id` is the proband of that trio
    :rtype: Table
    """
    sample_meta = get_gnomad_meta(data_type)
    keep_sample = sample_meta.high_quality & (
        sample_meta.project_id != BAD_THAI_TRIOS_PROJECT_ID)
    hq_samples = hl.literal(
        sample_meta.aggregate(
            hl.agg.filter(keep_sample, hl.agg.collect(sample_meta.s))))

    trios = hl.import_fam(fam_path(data_type), delimiter='\\t')
    all_members_hq = (hq_samples.contains(trios.id)
                      & hq_samples.contains(trios.pat_id)
                      & hq_samples.contains(trios.mat_id))
    trios = trios.filter(all_members_hq)

    # Keep a single proband from each family with > 1 proband:
    # de-duplicate on the father first, then on the mother.
    for parent_field in ('pat_id', 'mat_id'):
        trios = trios.key_by(parent_field).distinct()

    # One row per trio member (proband, father, mother), keyed by member and proband.
    trios = trios.annotate(s=[trios.id, trios.pat_id, trios.mat_id])
    trios = trios.explode('s')
    return trios.key_by('s', 'id')
Example #2
0
def create_ped(related_data: GnomADRelatedData,
               new_version: str,
               max_mv_z: int = 3):
    """
    Writes the final gnomAD ped file, obtained by removing trios with too many Mendelian violations from the raw ped.

    The cutoff is computed on the rows of the merged pedigrees table whose `ped_name` contains 'new':
    a trio is removed when its proband has more than mean + `max_mv_z` * std errors.

    :param GnomADRelatedData related_data: Input data
    :param str new_version: String containing the new version name to write the data to
    :param int max_mv_z: Max number of std devs above the mean number of MVs in inferred trios to keep trio.
    :return: Nothing
    :rtype: None
    """
    data_type = related_data.data_type

    raw_ped = hl.Pedigree.read(raw_fam_path(data_type), delimiter="\\t")
    logger.info(
        f"Loaded raw {related_data.data_type} pedigree containing {len(raw_ped.trios)} trios"
    )

    # Restrict the merged pedigree metadata to the rows coming from the new pedigree
    merged_meta = hl.read_table(merged_pedigrees_ht_path(data_type)).to_pandas()
    new_meta = merged_meta[merged_meta.ped_name.str.contains('new')]

    # Samples whose number of errors exceeds the cutoff are filtered out
    errors_cutoff = (np.mean(new_meta.errors) +
                     max_mv_z * np.std(new_meta.errors))
    filtered_s = set(new_meta[new_meta.errors > errors_cutoff].s)

    # Write final fam file
    kept_trios = [trio for trio in raw_ped.trios if trio.s not in filtered_s]
    final_ped = hl.Pedigree(kept_trios)
    final_ped.write(fam_path(data_type, version=new_version))

    logger.info(
        f"Wrote final {related_data.data_type} pedigree with {len(final_ped.trios)} trios."
    )
Example #3
0
def create_meta(related_data: GnomADRelatedData, fake_fam_prop: float,
                old_version: str, overwrite: bool) -> None:
    """
    Computes and writes metadata used to evaluate the gnomAD trios found in the raw ped file.

    To put the raw ped in context, the same metadata is computed for:
    1) A fake pedigree made of generated trios
    2) The previous iteration of the ped file (old_version)

    The fake pedigree, the three `hl.mendel_errors` tables written here (all_errors, per_fam, per_sample)
    and the merged pedigrees table are all written to disk.

    :param GnomADRelatedData related_data: Input data
    :param float fake_fam_prop: Number of fake trios to generate as a proportion of the number of real families in the data
    :param str old_version: Version of previous iteration to load
    :param bool overwrite: Whether to overwrite previous data
    :return: Nothing
    :rtype: None
    """
    data_type = related_data.data_type

    raw_ped = hl.Pedigree.read(raw_fam_path(data_type), delimiter="\\t")

    n_fake_trios = int(fake_fam_prop * len(raw_ped.complete_trios()))
    logger.info(
        f"Generating fake pedigree with {n_fake_trios} trios for {related_data.data_type}"
    )
    generated_ped = create_fake_pedigree(n_fake_trios,
                                         list(related_data.meta_pd.s),
                                         raw_ped)
    generated_ped.write(fake_fam_path(data_type))

    logger.info(f"Running mendel_errors on {related_data.data_type}")

    # Families made of random samples establish the expectation in non-trios.
    # The fake pedigree is loaded back from the file written above.
    old_ped = hl.Pedigree.read(fam_path(data_type, version=old_version),
                               delimiter="\\t")
    fake_ped = hl.Pedigree.read(fake_fam_path(data_type), delimiter="\\t")
    named_peds = [('new', raw_ped), ('old', old_ped), ('fake', fake_ped)]

    ped_pd = merge_pedigree_pandas(
        [(ped_name, ped_to_pandas(ped)) for ped_name, ped in named_peds],
        related_data.sample_to_dups, True)

    # Run mendel_errors on the samples that belong to any of the merged trios
    all_ped = pandas_to_ped(ped_pd)
    gnomad = get_gnomad_data(data_type)
    trio_members = set()
    for trio in all_ped.trios:
        trio_members.update((trio.s, trio.mat_id, trio.pat_id))
    fam_samples = hl.literal(trio_members)
    gnomad = gnomad.filter_cols(fam_samples.contains(gnomad.s))
    all_errors, per_fam, per_sample, _ = hl.mendel_errors(
        gnomad['GT'], all_ped)

    mendel_tables = (("all_errors", all_errors),
                     ("per_fam", per_fam),
                     ("per_sample", per_sample))
    for table_name, mendel_ht in mendel_tables:
        mendel_ht.write(sample_qc_mendel_ht_path(data_type, table_name),
                        overwrite=overwrite)

    # Merge all metadata
    ped_pd = add_pedigree_meta(ped_pd=ped_pd,
                               meta_pd=related_data.meta_pd,
                               kin_ht=related_data.kin_ht,
                               mendel_per_sample_ht=per_sample)

    # Write merged pedigrees as HT (pandas -> Spark DataFrame -> Hail Table)
    sql_context = SQLContext(hl.spark_context())
    merged_ht = hl.Table.from_spark(sql_context.createDataFrame(ped_pd))
    merged_ht.write(merged_pedigrees_ht_path(data_type), overwrite=overwrite)
def main(args):
    """
    Runs the requested phase-by-transmission (PBT) steps on gnomAD exomes or genomes.

    Each step is enabled by a flag on `args`; a step that is not requested is skipped entirely.
    Later steps read the MatrixTables written by earlier ones:

    1. `pbt_tm`: builds the trio matrix for complete trios made of high quality samples,
       phases it by transmission and writes it.
    2. `pbt_explode`: explodes the phased trio matrix into one column per trio member,
       writes it unsplit, then splits multi-allelic sites and writes the split version.
    3. `phase_multi_families`: groups the columns of the split MatrixTable by sample and
       computes a consensus genotype and QC metrics per sample.

    :param args: Parsed arguments; the attributes read are `exomes`, `pbt_tm`, `pbt_explode`,
        `phase_multi_families` and `overwrite` (`overwrite` is only read by the steps that run)
    :return: Nothing
    :rtype: None
    """
    data_type = 'exomes' if args.exomes else 'genomes'

    if args.pbt_tm:
        mt = get_gnomad_data(data_type, split=False)
        meta = mt.cols()
        hq_samples = meta.aggregate(
            hl.agg.filter(meta.meta.high_quality, hl.agg.collect(meta.s)))
        ped = hl.Pedigree.read(fam_path(data_type),
                               delimiter='\\t').filter_to(hq_samples)
        ped_samples = hl.literal({
            s
            for trio in ped.complete_trios()
            for s in (trio.s, trio.pat_id, trio.mat_id)
        })

        # Keep only trio members, drop all non-key row/col fields and
        # keep only the rows where at least one of these samples is non-ref
        mt = mt.filter_cols(ped_samples.contains(mt.s))
        mt = mt.select_cols().select_rows()
        mt = mt.filter_rows(hl.agg.any(mt.GT.is_non_ref()))

        tm = hl.trio_matrix(mt, ped, complete_trios=True)
        tm = hl.experimental.phase_trio_matrix_by_transmission(tm)
        tm.write(pbt_phased_trios_mt_path(data_type,
                                          split=False,
                                          trio_matrix=True),
                 overwrite=args.overwrite)

    if args.pbt_explode:
        tm = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False, trio_matrix=True))

        # A trio entry is adj only if all three members are adj
        tm = tm.annotate_entries(trio_adj=tm.proband_entry.adj
                                 & tm.father_entry.adj & tm.mother_entry.adj)
        pmt = explode_trio_matrix(tm, keep_trio_entries=True)
        pmt = pmt.transmute_entries(trio_adj=pmt.source_trio_entry.trio_adj)
        pmt.write(pbt_phased_trios_mt_path(data_type, split=False),
                  overwrite=args.overwrite)

        # Split multi-allelic sites starting from the MatrixTable written above
        pmt = hl.read_matrix_table(
            pbt_phased_trios_mt_path(data_type, split=False))
        pmt = pmt.rename({'PBT_GT':
                          'PGT'})  # ugly but supported by hl.split_multi_hts
        pmt = hl.split_multi_hts(pmt)
        pmt = pmt.rename({'PGT': 'PBT_GT'})
        pmt.write(pbt_phased_trios_mt_path(data_type),
                  overwrite=args.overwrite)

    if args.phase_multi_families:
        pbt = hl.read_matrix_table(pbt_phased_trios_mt_path(data_type))
        # Remove samples that appear in several trios whose parents are not the same in all of them
        # (there are only two exceptions to this, so best to ignore these and focus on parents/multi-offspring families)
        # NOTE(review): samples with a single entry also pass this filter, although the original
        # comment stated that only samples with more than one entry should be kept -- confirm intent.
        nt_samples = pbt.cols()
        nt_samples = nt_samples.group_by('s').aggregate(
            trios=hl.agg.collect(nt_samples.source_trio))
        nt_samples = nt_samples.filter(
            (hl.len(nt_samples.trios) > 1) &
            nt_samples.trios[1:].any(lambda x: (x.mother.s != nt_samples.trios[
                0].mother.s) | (x.father.s != nt_samples.trios[0].father.s)),
            keep=False)
        pbt = pbt.filter_cols(hl.is_defined(nt_samples[pbt.col_key]))

        # Group cols for these samples, keeping all GTs in an array
        # Compute the consensus GT (incl. phase) + QC metrics based on (a) phased genotypes have priority, (b) genotypes with most votes
        # NOTE(review): the collected field is `GT` although the result is named `PBT_GTs` -- confirm
        # that `GT` (and not `PBT_GT`) is the intended field.
        pbt = pbt.group_cols_by('s').aggregate(PBT_GTs=hl.agg.filter(
            hl.is_defined(pbt.GT), hl.agg.collect(pbt.GT)))
        # (genotype, count) pairs, sorted so that phased genotypes come first and,
        # within the same phase status, the most frequent genotype comes first
        gt_counter = hl.sorted(hl.array(
            pbt.PBT_GTs.group_by(lambda x: x).map_values(lambda x: hl.len(x))),
                               key=lambda x: x[0].phased * 100 + x[1],
                               reverse=True)
        phased_gt_counts = gt_counter.filter(lambda x: x[0].phased).map(
            lambda x: x[1])
        pbt = pbt.annotate_entries(
            # `find(lambda x: True)` takes the first element of the sorted array
            consensus_gt=gt_counter.map(lambda x: x[0]).find(lambda x: True),
            # Count of the top-ranked phased genotype over the count of all phased genotypes
            phase_concordance=phased_gt_counts.find(lambda x: True) /
            hl.sum(phased_gt_counts),
            # More than one distinct genotype once phased calls are rebuilt with `hl.call`
            discordant_gts=hl.len(
                hl.set(
                    pbt.PBT_GTs.map(lambda x: hl.cond(
                        x.phased, hl.call(x[0], x[1]), x)))) > 1)
        # Honor the overwrite flag like every other write in this function
        pbt.write('gs://gnomad/projects/compound_hets/pbt_multi_families.mt',
                  overwrite=args.overwrite)