コード例 #1
0
ファイル: sample_qc.py プロジェクト: Nealelab/ccdg_qc
def get_qc_vds(
    data_type: str = "genomes",
    split: bool = False,
    autosome_only: bool = False,
    interval_qc: bool = False,
) -> hl.vds.VariantDataset:
    """
    Load the CCDG VDS and apply the requested filters.

    Filters are applied in a fixed order: interval filtering (exomes only),
    multi-allelic splitting, then autosome filtering.

    :param data_type: Whether data is from genomes or exomes, default is genomes
    :param split: Split multi-allelic sites in the VDS, default is False
    :param autosome_only: Keep only autosomal variants, default is False
    :param interval_qc: Keep only the intervals flagged `to_keep` in the interval table.
        Only takes effect when `data_type` is "exomes". Default is False
    :return: CCDG VDS with the chosen filters applied
    """
    vds = get_ccdg_vds(data_type)

    apply_interval_filter = interval_qc and data_type == "exomes"
    if apply_interval_filter:
        logger.info("Filtering CCDG exomes VDS to high quality intervals...")
        intervals_path = get_ccdg_results_path(
            data_type=data_type, result=f"intervals_{INTERVAL_DP}x"
        )
        interval_ht = hl.read_table(intervals_path)
        kept_ht = interval_ht.filter(interval_ht['to_keep'])
        # The interval list is collected locally before being handed to Hail.
        kept_intervals = kept_ht.interval.collect()
        vds = hl.vds.filter_intervals(vds, intervals=kept_intervals, keep=True)

    if split:
        logger.info("Splitting multi-allelic sites in CCDG %s VDS...", data_type)
        vds = hl.vds.split_multi(vds, filter_changed_loci=True)

    if autosome_only:
        logger.info("Filtering CCDG %s VDS to autosomes...", data_type)
        # Filter the variant data first, then use its rows to subset the VDS.
        autosomal_mt = filter_to_autosomes(vds.variant_data)
        autosomal_rows_ht = autosomal_mt.rows()
        vds = hl.vds.filter_variants(vds, autosomal_rows_ht, keep=True)

    return vds
コード例 #2
0
def generate_trio_stats(
    mt: hl.MatrixTable, autosomes_only: bool = True, bi_allelic_only: bool = True
) -> hl.Table:
    """
    Run `generate_trio_stats_expr` with the default `raw` and `adj` strata.

    .. note::
        Expects `mt` to be a trio matrix table annotated with `adj`. A sparse MT must be
        densified with `hl.experimental.densify` before calling this function.
        By default `mt` is filtered to autosomes and bi-allelic sites.

    :param mt: A Trio Matrix Table returned from `hl.trio_matrix`. Must be dense
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: Table with trio stats
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    logger.info(f"Generating trio stats using {mt.count_cols()} trios.")

    # A trio counts as adj only when proband, father and mother entries are all adj.
    all_members_adj = (
        mt.proband_entry.adj & mt.father_entry.adj & mt.mother_entry.adj
    )

    def _raw_and_adj():
        # Fresh dict per stratum argument, mirroring three separate literals.
        return {"raw": True, "adj": all_members_adj}

    stats_expr = generate_trio_stats_expr(
        mt,
        transmitted_strata=_raw_and_adj(),
        de_novo_strata=_raw_and_adj(),
        ac_strata=_raw_and_adj(),
    )
    return mt.select_rows(**stats_expr).rows()
コード例 #3
0
ファイル: pipeline.py プロジェクト: ksamocha/gnomad_methods
def generate_sib_stats(
    mt: hl.MatrixTable,
    relatedness_ht: hl.Table,
    i_col: str = "i",
    j_col: str = "j",
    relationship_col: str = "relationship",
    autosomes_only: bool = True,
    bi_allelic_only: bool = True,
) -> hl.Table:
    """
    Default wrapper around `generate_sib_stats_expr`.

    Returns a Hail Table with counts of variants shared by the sibling pairs found in
    `relatedness_ht`.

    `relatedness_ht` holds one row per pair of individuals i, j (unrelated pairs may be
    present too). Only rows whose `relationship_col` equals `SIBLINGS` are used, and `mt`
    is restricted to the samples that appear in at least one such pair. If `mt` has no
    `adj` entry field, it is added with `annotate_adj`.

    .. note::

        By default this pipeline function will filter `mt` to only autosomes and bi-allelic sites.

    :param mt: Input Matrix table
    :param relatedness_ht: Input relationship table
    :param i_col: Column containing the 1st sample of the pair in the relationship table
    :param j_col: Column containing the 2nd sample of the pair in the relationship table
    :param relationship_col: Column containing the relationship for the sample pair
    :param autosomes_only: If set, only autosomal intervals are used.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :return: A Table with the sibling shared variant counts
    """
    if autosomes_only:
        mt = filter_to_autosomes(mt)
    if bi_allelic_only:
        mt = mt.filter_rows(bi_allelic_expr(mt))

    is_sibling_pair = relatedness_ht[relationship_col] == SIBLINGS
    sib_ht = relatedness_ht.filter(is_sibling_pair)

    # Build the set of sample IDs seen on either side of a sibling pair. The result is
    # kept as a Hail expression (`_localize=False`) rather than a Python set.
    pair_sample_ids = [sib_ht[i_col].s, sib_ht[j_col].s]
    s_to_keep = sib_ht.aggregate(
        hl.agg.explode(
            lambda sample_id: hl.agg.collect_as_set(sample_id), pair_sample_ids
        ),
        _localize=False,
    )
    mt = mt.filter_cols(s_to_keep.contains(mt.s))

    if "adj" not in mt.entry:
        mt = annotate_adj(mt)

    sib_stats_expr = generate_sib_stats_expr(
        mt,
        sib_ht,
        i_col=i_col,
        j_col=j_col,
        strata={"raw": True, "adj": mt.adj},
    )
    return mt.select_rows(**sib_stats_expr).rows()
コード例 #4
0
def compute_callrate_mt(
    mt: hl.MatrixTable,
    intervals_ht: hl.Table,
    bi_allelic_only: bool = True,
    autosomes_only: bool = True,
    match: bool = True,
) -> hl.MatrixTable:
    """
    Compute a sample/interval MT with each entry containing the call rate for that sample/interval.

    This can be used as input for imputing exome sequencing platforms.

    .. note::

        The input interval HT should have a key of type Interval.
        The resulting table will have a key of the same type as the `intervals_ht` table and
        contain an `interval_info` field containing all non-key fields of the `intervals_ht`.

    :param mt: Input MT
    :param intervals_ht: Table containing the intervals. It is expected to be keyed by a single
        field of type Interval; a warning is logged otherwise.
    :param bi_allelic_only: If set, only bi-allelic sites are used for the computation
    :param autosomes_only: If set, only autosomal intervals are used.
    :param match: If set, returns all intervals in intervals_ht that overlap the locus in the input MT.
    :return: Callrate MT
    """
    logger.info("Computing call rate MatrixTable")

    if len(intervals_ht.key) != 1 or not isinstance(
            intervals_ht.key[0], hl.expr.IntervalExpression):
        logger.warning(
            "Call rate matrix computation expects `intervals_ht` with a key of type Interval. Found: %s",
            intervals_ht.key,
        )

    # Always bind `callrate_mt`: previously it was only assigned when
    # `autosomes_only` was True, so `autosomes_only=False` raised UnboundLocalError.
    callrate_mt = filter_to_autosomes(mt) if autosomes_only else mt

    if bi_allelic_only:
        callrate_mt = callrate_mt.filter_rows(bi_allelic_expr(callrate_mt))

    # Stash the interval key as a regular field so it survives the join below.
    intervals_ht = intervals_ht.annotate(_interval_key=intervals_ht.key)
    callrate_mt = callrate_mt.annotate_rows(_interval_key=intervals_ht.index(
        callrate_mt.locus, all_matches=match)._interval_key)

    if match:
        # With all_matches=True the annotation is an array: one row per overlapping interval.
        callrate_mt = callrate_mt.explode_rows("_interval_key")

    # Drop loci that did not fall in any interval.
    callrate_mt = callrate_mt.filter_rows(
        hl.is_defined(callrate_mt._interval_key.interval))
    # Only definedness of GT matters for the call rate, so replace it with an empty struct.
    callrate_mt = callrate_mt.select_entries(
        GT=hl.or_missing(hl.is_defined(callrate_mt.GT), hl.struct()))
    callrate_mt = callrate_mt.group_rows_by(
        **callrate_mt._interval_key).aggregate(
            callrate=hl.agg.fraction(hl.is_defined(callrate_mt.GT)))
    intervals_ht = intervals_ht.drop("_interval_key")
    callrate_mt = callrate_mt.annotate_rows(interval_info=hl.struct(
        **intervals_ht[callrate_mt.row_key]))
    return callrate_mt
コード例 #5
0
def get_qc_samples_filtered_gnomad_data(data_type: str,
                                        autosomes_only: bool = True
                                        ) -> hl.MatrixTable:
    """
    Load gnomAD data restricted to high-quality samples with minimal annotations.

    Columns are filtered on `meta.high_quality` and keep only `meta.qc_platform`;
    rows keep only `a_index` and `was_split`.

    :param data_type: Data type passed through to `get_gnomad_data`
    :param autosomes_only: If set, the result is filtered to autosomes
    :return: Filtered and slimmed-down MatrixTable
    """
    full_mt = get_gnomad_data(data_type)
    hq_mt = full_mt.filter_cols(full_mt.meta.high_quality)
    hq_mt = hq_mt.select_cols(meta=hq_mt.meta.select('qc_platform'))
    hq_mt = hq_mt.select_rows(a_index=hq_mt.a_index, was_split=hq_mt.was_split)

    if not autosomes_only:
        return hq_mt
    return filter_to_autosomes(hq_mt)
コード例 #6
0
def create_shared_sites_table(data_type: str, overwrite: bool) -> None:
    """
    Count, per sample, heterozygous low-allele-count sites and how many are also seen in TOPMed.

    The gnomAD release samples (non-ref entries only) are filtered to autosomes, optionally to
    sites with no RF filters, and to SNPs whose methylation `MEAN` is below 0.6 or missing.
    Per-sample counts are then written to `get_topmed_shared_sites_ht_path(data_type)`.

    :param data_type: Data type used to locate the gnomAD data and annotation tables
    :param overwrite: Whether to overwrite an existing output table
    :return: None; the result is written to disk
    """
    freq_ht = hl.read_table(annotations_ht_path(data_type, 'frequencies'))
    topmed_ht = hl.read_matrix_table(
        'gs://gnomad-public/resources/hail-0.2/topmed.b37.mt').rows()

    methylation_ht = hl.read_table(methylation_sites_ht_path())

    gnomad = get_gnomad_data(data_type,
                             non_refs_only=True,
                             release_samples=True)
    gnomad = gnomad.select_cols(**gnomad.meta)
    gnomad = filter_to_autosomes(gnomad)

    rf_path = annotations_ht_path(data_type, 'rf')
    if hl.hadoop_exists(rf_path):
        logger.info(f"Filtering sites based on {rf_path}")
        filter_ht = hl.read_table(rf_path)
        gnomad = gnomad.filter_rows(
            hl.len(filter_ht[gnomad.row_key].filters) == 0)
    else:
        # `Logger.warn` is a deprecated alias (removed in Python 3.13); use `warning`.
        logger.warning(
            f"Could not find filtering table {rf_path}. Not filtering poor quality sites leads to lower performance."
        )

    # Keep SNPs only; sites absent from the methylation table are kept (or_else -> True).
    gnomad = gnomad.filter_rows(
        hl.is_snp(gnomad.alleles[0], gnomad.alleles[1])
        & hl.or_else(methylation_ht[gnomad.locus].MEAN < 0.6, True))

    gnomad = gnomad.annotate_rows(
        topmed_ac=topmed_ht[gnomad.row_key].info.AC[0],
        gnomad_ac=freq_ht[gnomad.row_key].freq[0].AC[1])

    is_het = gnomad.GT.is_het()
    in_topmed = hl.is_defined(gnomad.topmed_ac)
    # NOTE: the "doubleton"/"tripleton" categories are cumulative (AC < 3, AC < 4),
    # so they also include sites with lower allele counts.
    het_singleton = (gnomad.gnomad_ac == 1) & is_het
    het_up_to_doubleton = (gnomad.gnomad_ac < 3) & is_het
    het_up_to_tripleton = (gnomad.gnomad_ac < 4) & is_het

    sample_counts_ht = gnomad.annotate_cols(
        n_singletons=hl.agg.count_where(het_singleton),
        n_doubletons=hl.agg.count_where(het_up_to_doubleton),
        n_tripletons=hl.agg.count_where(het_up_to_tripleton),
        n_topmed_singletons=hl.agg.count_where(het_singleton & in_topmed),
        n_topmed_doubletons=hl.agg.count_where(het_up_to_doubleton & in_topmed),
        n_topmed_tripletons=hl.agg.count_where(het_up_to_tripleton & in_topmed),
        n_both_singletons=hl.agg.count_where(
            het_singleton & (gnomad.topmed_ac == 1)),
    ).cols()
    sample_counts_ht.write(get_topmed_shared_sites_ht_path(data_type),
                           overwrite=overwrite)
コード例 #7
0
def read_and_pre_process_data(mt_path: str, ht_path: str) -> hl.MatrixTable:
    """
    Read an MT, annotate its samples from an HT and prepare it for joining.

    Columns are annotated with `data_type`, `hard_filters` and `perm_filters` from the HT
    (joined on `s`), re-keyed by (`data_type`, `s`), and samples with any hard filter are
    removed. The result is restricted to autosomes and keeps only the `GT` entry field.

    :param str mt_path: Path to MT to be formatted for joining
    :param str ht_path: Path to HT used to annotate MT
    :return: MatrixTable keyed by (`data_type`, `s`) that retains the permission and hard filter annotations
    :rtype: MatrixTable
    """
    annotation_ht = hl.read_table(ht_path)
    annotation_ht = annotation_ht.select(
        'data_type', 's', 'hard_filters', 'perm_filters')
    annotation_ht = annotation_ht.key_by('s')

    mt = hl.read_matrix_table(mt_path)
    mt = mt.annotate_cols(**annotation_ht[mt.s])
    mt = mt.key_cols_by('data_type', 's')

    passes_hard_filters = hl.len(mt.hard_filters) == 0
    mt = filter_to_autosomes(mt.filter_cols(passes_hard_filters))
    return mt.select_entries('GT')
コード例 #8
0
  def run_hail_sample_qc(mt: hl.MatrixTable, data_type: str) -> hl.Table:
    """
    Run Hail's built-in sample QC and flag stratified metric outliers.

    The MatrixTable is reduced to `GT`, filtered to autosomes and split with
    `hl.split_multi_hts` before `hl.sample_qc` is run. An inbreeding coefficient
    (`f_inbreeding`, computed from `mt.info.AF[0]`) is added to the `sample_qc` struct.
    Outlier filters are then computed with `compute_stratified_metrics_filter`,
    stratified by `qc_pop` and `qc_platform`.

    :param MatrixTable mt: QC MatrixTable
    :param str data_type: WGS or WES; `call_rate` is only included as a metric for WGS
    :return: Table of per-sample stratified metric filters, annotated with the `sample_qc` struct
    :rtype: Table
    """
    mt = mt.select_entries(mt.GT)
    mt = filter_to_autosomes(mt)
    mt = hl.split_multi_hts(mt)
    mt = hl.sample_qc(mt)
    mt = mt.annotate_cols(
        sample_qc=mt.sample_qc.annotate(
            f_inbreeding=hl.agg.inbreeding(mt.GT, mt.info.AF[0])
        )
    )
    mt = mt.annotate_cols(idx=mt.qc_pop + "_" + hl.str(mt.qc_platform))

    sample_qc = [
        "n_snp",
        "r_ti_tv",
        "r_insertion_deletion",
        "n_insertion",
        "n_deletion",
        "r_het_hom_var",
    ]
    if data_type == "WGS":
        sample_qc = sample_qc + ["call_rate"]

    strat_ht = mt.cols()
    qc_metrics = {metric: strat_ht.sample_qc[metric] for metric in sample_qc}
    strata = {"qc_pop": strat_ht.qc_pop, "qc_platform": strat_ht.qc_platform}

    metric_ht = compute_stratified_metrics_filter(strat_ht, qc_metrics, strata)

    # Log how many samples pass / fail (a sample passes when it has no filters).
    checkpoint_pass = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) == 0)
    )
    logger.info(
        "%i samples found passing pop/platform-specific filtering", checkpoint_pass
    )
    checkpoint_fail = metric_ht.aggregate(
        hl.agg.count_where(hl.len(metric_ht.qc_metrics_filters) != 0)
    )
    logger.info(
        "%i samples found failing pop/platform-specific filtering", checkpoint_fail
    )

    # Reuse the column table built above rather than calling `mt.cols()` again.
    metric_ht = metric_ht.annotate(sample_qc=strat_ht[metric_ht.key].sample_qc)
    return metric_ht
コード例 #9
0
def run_pca_with_relateds(
    qc_mt: hl.MatrixTable,
    related_samples_to_drop: Optional[hl.Table],
    n_pcs: int = 10,
    autosomes_only: bool = True,
) -> Tuple[List[float], hl.Table, hl.Table]:
    """
    First runs PCA excluding the given related samples,
    then projects these samples in the PC space to return scores for all samples.

    The `related_samples_to_drop` Table has to be keyed by the sample ID and all samples present in this
    table will be excluded from the PCA.

    The loadings Table returned also contains a `pca_af` annotation which is the allele frequency
    used for PCA. This is useful to project other samples in the PC space.

    :param qc_mt: Input QC MT
    :param related_samples_to_drop: Optional table of related samples to drop
    :param n_pcs: Number of PCs to compute
    :param autosomes_only: Whether to run the analysis on autosomes only
    :return: eigenvalues, scores and loadings
    """
    # Compare against None explicitly instead of relying on the truthiness of a Table.
    has_relateds = related_samples_to_drop is not None

    unrelated_mt = qc_mt.persist()

    if autosomes_only:
        unrelated_mt = filter_to_autosomes(unrelated_mt)

    if has_relateds:
        # Filter the MT built so far. Filtering `qc_mt` here (as was done previously)
        # silently discarded the autosome filter and the persisted MT.
        unrelated_mt = unrelated_mt.filter_cols(
            hl.is_missing(related_samples_to_drop[unrelated_mt.col_key]))

    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        unrelated_mt.GT, k=n_pcs, compute_loadings=True)
    pca_af_ht = unrelated_mt.annotate_rows(
        pca_af=hl.agg.mean(unrelated_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af
    )  # TODO: Evaluate if needed to write results at this point if relateds or not

    if not has_relateds:
        return pca_evals, pca_scores, pca_loadings

    # Project the held-out related samples onto the PCs and append their scores.
    pca_loadings = pca_loadings.persist()
    pca_scores = pca_scores.persist()
    related_mt = qc_mt.filter_cols(
        hl.is_defined(related_samples_to_drop[qc_mt.col_key]))
    related_scores = pc_project(related_mt, pca_loadings)
    pca_scores = pca_scores.union(related_scores)
    return pca_evals, pca_scores, pca_loadings
コード例 #10
0
def generate_fam_stats(
        mt: hl.MatrixTable,
        fam_file: str
) -> hl.Table:
    """
    Calculate transmission and de novo mutation statistics using trios in the dataset.

    The MT is restricted to samples listed in the pedigree file and to autosomes, annotated
    with adj, densified, filtered to sites with exactly two alleles and turned into a trio
    matrix of complete trios. Rows with no raw de novo, transmitted or untransmitted
    counts are dropped from the result.

    :param mt: Input MatrixTable
    :param fam_file: path to tab-delimited text file containing trio pedigree
    :return: Table containing trio stats
    """
    ped = hl.Pedigree.read(fam_file, delimiter="\t")

    # One row per distinct sample ID that appears as a child, father or mother.
    members_ht = hl.import_fam(fam_file, delimiter="\t")
    members_ht = members_ht.annotate(
        fam_members=[members_ht.id, members_ht.pat_id, members_ht.mat_id]
    )
    members_ht = members_ht.explode('fam_members', name='s')
    members_ht = members_ht.key_by('s').select().distinct()

    mt = mt.filter_cols(hl.is_defined(members_ht[mt.col_key]))
    logger.info(f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios.")

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)

    trio_mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)
    # A trio is adj only when all three members' entries are adj.
    all_members_adj = (
        trio_mt.proband_entry.adj
        & trio_mt.father_entry.adj
        & trio_mt.mother_entry.adj
    )

    stats_expr = generate_trio_stats_expr(
        trio_mt,
        transmitted_strata={'raw': True, 'adj': all_members_adj},
        de_novo_strata={'raw': True, 'adj': all_members_adj},
        proband_is_female_expr=trio_mt.is_female,
    )
    ht = trio_mt.select_rows(**stats_expr).rows()

    n_informative = (
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    )
    return ht.filter(n_informative > 0)
コード例 #11
0
def generate_fam_stats(mt: hl.MatrixTable, fam_file: str) -> hl.Table:
    """
    Calculate transmission and de novo statistics using trios in the dataset.

    Besides the `raw` and `adj` strata, de novo counts get an `hq` stratum requiring an adj
    trio whose parents both have GQ >= 30, a summed AD[0] + AD[1] above 20 and AD[1] == 0.
    Rows with no raw de novo, transmitted or untransmitted counts are dropped.

    :param mt: Input MatrixTable
    :param fam_file: Path to tab-delimited text file containing trio pedigree
    :return: Table containing trio stats
    """
    ped = hl.Pedigree.read(fam_file, delimiter="\t")

    # One row per distinct sample ID that appears as a child, father or mother.
    members_ht = hl.import_fam(fam_file, delimiter="\t")
    members_ht = members_ht.annotate(
        fam_members=[members_ht.id, members_ht.pat_id, members_ht.mat_id])
    members_ht = members_ht.explode('fam_members', name='s')
    members_ht = members_ht.key_by('s').select().distinct()

    mt = mt.filter_cols(hl.is_defined(members_ht[mt.col_key]))
    logger.info(
        f"Generating family stats using {mt.count_cols()} samples from {len(ped.trios)} trios."
    )

    mt = filter_to_autosomes(mt)
    mt = annotate_adj(mt)
    mt = mt.select_entries('GT', 'GQ', 'AD', 'END', 'adj')
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mt = hl.trio_matrix(mt, pedigree=ped, complete_trios=True)

    mother = mt.mother_entry
    father = mt.father_entry

    trio_adj = mt.proband_entry.adj & father.adj & mother.adj
    parents_no_alt = (mother.AD[1] == 0) & (father.AD[1] == 0)
    parents_high_depth = (
        (mother.AD[0] + mother.AD[1] > 20)
        & (father.AD[0] + father.AD[1] > 20)
    )
    parents_high_gq = (mother.GQ >= 30) & (father.GQ >= 30)

    transmitted_strata = {'raw': None, 'adj': trio_adj}
    de_novo_strata = {
        'raw': None,
        'adj': trio_adj,
        'hq': trio_adj & parents_high_gq & parents_high_depth & parents_no_alt,
    }

    stats_expr = generate_trio_stats_expr(
        mt,
        transmitted_strata=transmitted_strata,
        de_novo_strata=de_novo_strata,
        proband_is_female_expr=mt.is_female,
    )
    ht = mt.select_rows(**stats_expr).rows()

    n_informative = (
        ht.n_de_novos_raw + ht.n_transmitted_raw + ht.n_untransmitted_raw
    )
    return ht.filter(n_informative > 0)
コード例 #12
0
def filter_ht_for_plink(ht: hl.Table,
                        n_samples: int,
                        min_call_rate: float = 0.95,
                        variants_per_mac_category: int = 2000,
                        variants_per_maf_category: int = 10000):
    """
    Filter a sites Table to autosomes and randomly downsample it per allele-count category.

    Sites must have `call_stats.AN` of at least `n_samples * 2 * min_call_rate` and
    `call_stats.AC > 0`. Each remaining site is kept with probability
    (target count for its category) / (observed count for its category).

    :param ht: Input Table with a `call_stats` field
    :param n_samples: Number of samples, used for the minimum AN threshold
    :param min_call_rate: Minimum call rate used for the minimum AN threshold
    :param variants_per_mac_category: Target count for categories with `mac_category >= 1`
    :param variants_per_maf_category: Target count for all other categories
    :return: Filtered, downsampled Table with a `category_counter` global
    """
    from gnomad.utils.filtering import filter_to_autosomes

    ht = filter_to_autosomes(ht)

    min_an = n_samples * 2 * min_call_rate
    has_enough_calls = ht.call_stats.AN >= min_an
    is_variant = ht.call_stats.AC > 0
    ht = ht.filter(has_enough_calls & is_variant)

    ht = ht.annotate(mac_category=mac_category_case_builder(ht.call_stats))
    category_counter = ht.aggregate(hl.agg.counter(ht.mac_category))
    print(category_counter)
    ht = ht.annotate_globals(category_counter=category_counter)

    target_count = hl.cond(ht.mac_category >= 1,
                           variants_per_mac_category,
                           variants_per_maf_category)
    keep_probability = target_count / ht.category_counter[ht.mac_category]
    return ht.filter(hl.rand_unif(0, 1) < keep_probability)
コード例 #13
0
def gnomad_sample_qc(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Remove low-confidence regions, filter to autosomal bi-allelic sites and run sample QC.

    Adds a row annotation `variant_qc.af` (mean alt allele count / 2) and extends the
    `sample_qc` column annotation with `f_inbreeding` computed from that frequency.
    # TODO: consider reinstating inbreeding coefficient filter

    :param MatrixTable mt: MT on which sample QC metrics need to be computed
    :return: MT filtered to autosomes and high-confidence regions, with computed sample QC column annotations
    :rtype: MatrixTable
    """
    high_conf_mt = filter_low_conf_regions(mt)
    mt = filter_to_autosomes(high_conf_mt)

    # NOTE: this does not work on a split VDS!
    is_bi_allelic = hl.len(mt.alleles) == 2
    mt = mt.filter_rows(is_bi_allelic)

    mt = hl.sample_qc(mt)

    alt_allele_freq = hl.agg.mean(mt.GT.n_alt_alleles()) / 2
    mt = mt.annotate_rows(variant_qc=hl.struct(af=alt_allele_freq))

    inbreeding = hl.agg.inbreeding(mt.GT, mt.variant_qc.af)
    return mt.annotate_cols(
        sample_qc=mt.sample_qc.annotate(f_inbreeding=inbreeding))
コード例 #14
0
def prepare_mt_for_plink(mt: hl.MatrixTable,
                         n_samples: int,
                         min_call_rate: float = 0.95,
                         variants_per_mac_category: int = 2000,
                         variants_per_maf_category: int = 10000):
    """
    Filter an MT to autosomes and randomly downsample its rows per allele-count category.

    Rows must have `call_stats.AN` of at least `n_samples * 2 * min_call_rate` and
    `call_stats.AC[1] > 0`. Each remaining row is kept with probability
    (target count for its category) / (observed count for its category).

    :param mt: Input MatrixTable with a `call_stats` row field
    :param n_samples: Number of samples, used for the minimum AN threshold
    :param min_call_rate: Minimum call rate used for the minimum AN threshold
    :param variants_per_mac_category: Target count for categories with `mac_category >= 1`
    :param variants_per_maf_category: Target count for all other categories
    :return: Filtered, downsampled MatrixTable with a `category_counter` global
    """
    from gnomad.utils.filtering import filter_to_autosomes

    mt = filter_to_autosomes(mt)

    min_an = n_samples * 2 * min_call_rate
    has_enough_calls = mt.call_stats.AN >= min_an
    is_variant = mt.call_stats.AC[1] > 0
    mt = mt.filter_rows(has_enough_calls & is_variant)

    mt = mt.annotate_rows(
        mac_category=mac_category_case_builder(mt.call_stats))
    category_counter = mt.aggregate_rows(hl.agg.counter(mt.mac_category))
    print(category_counter)
    mt = mt.annotate_globals(category_counter=category_counter)

    target_count = hl.cond(mt.mac_category >= 1,
                           variants_per_mac_category,
                           variants_per_maf_category)
    keep_probability = target_count / mt.category_counter[mt.mac_category]
    return mt.filter_rows(hl.rand_unif(0, 1) < keep_probability)
コード例 #15
0
def main(args):
    """
    Run the exome platform PCA pipeline.

    Steps (the first two can be skipped through `args`):

    1. Build a per-interval call rate MT from the exomes data and write it.
    2. Run PCA on the binarized, centered call rates and write the scores.
    3. Assign platforms from the PC scores and write the result.

    :param args: Parsed arguments. Reads `skip_prepare_data_for_platform_pca`,
        `skip_run_platform_pca`, `pc_scores_in_separate_fields` and `overwrite`.
    """
    hl.init(log='/platform_pca.log')

    if not args.skip_prepare_data_for_platform_pca:
        # ~1 hour on 800 cores (3/8/18)
        logger.info('Preparing data for platform PCA...')
        mt = get_gnomad_data('exomes', adj=True, raw=False, meta_root=None, fam_root=None, split=False)
        mt = filter_to_autosomes(mt)
        intervals = hl.import_locus_intervals(evaluation_intervals_path)
        mt = mt.annotate_rows(interval=intervals[mt.locus].target)
        # Keep only bi-allelic sites that fall inside an evaluation interval.
        mt = mt.filter_rows(hl.is_defined(mt.interval) & (hl.len(mt.alleles) == 2))
        # Only definedness of GT matters for the call rate, so replace it with an empty struct.
        mt = mt.select_entries(GT=hl.or_missing(hl.is_defined(mt.GT), hl.struct()))
        callrate_mt = mt.group_rows_by(mt.interval).aggregate(callrate=hl.agg.fraction(hl.is_defined(mt.GT)))
        callrate_mt.write(exome_callrate_mt_path, args.overwrite)

    if not args.skip_run_platform_pca:
        logger.info('Running platform PCA...')
        qc_ht = hl.read_table(qc_ht_path('exomes', 'hard_filters')).key_by('s')
        callrate_mt = hl.read_matrix_table(exome_callrate_mt_path)
        callrate_mt = callrate_mt.filter_cols(hl.len(qc_ht[callrate_mt.col_key].hard_filters) == 0)
        # Binarize the call rate at 0.25 before running PCA.
        callrate_mt = callrate_mt.annotate_entries(callrate=hl.int(callrate_mt.callrate > 0.25))
        # Center until Hail's PCA does it for you
        callrate_mt = callrate_mt.annotate_rows(mean_callrate=hl.agg.mean(callrate_mt.callrate))
        callrate_mt = callrate_mt.annotate_entries(callrate=callrate_mt.callrate - callrate_mt.mean_callrate)
        eigenvalues, scores, _ = hl.pca(callrate_mt.callrate, compute_loadings=False)
        logger.info('Eigenvalues: {}'.format(eigenvalues))
        # [731282566.2824697, 78687228.90071851, 43837650.51729764, 33969298.61827205, 26308703.539534636, 21102437.512725923, 16949828.555817757, 12994894.187041137, 8372332.274295175, 8128326.814388647]
        # Honour --overwrite here too, consistent with the other writes in this
        # pipeline; otherwise a rerun fails at this step when the scores already exist.
        scores.write(exome_callrate_scores_ht_path, overwrite=args.overwrite)

    logger.info('Annotating with platform PCs and known platform annotations...')
    scores = hl.read_table(exome_callrate_scores_ht_path).annotate(data_type='exomes')
    if args.pc_scores_in_separate_fields:
        # Collapse fields named PC<n> into a single `scores` array, ordered by <n>.
        pc_fields = sorted(
            (ann for ann in scores.row if ann.startswith("PC")),
            key=lambda x: int(x[2:])
        )
        scores = scores.transmute(scores=[scores[ann] for ann in pc_fields])
    platform_pcs = assign_platform_pcs(scores)
    platform_pcs.write(qc_ht_path('exomes', 'platforms'), overwrite=args.overwrite)
コード例 #16
0
ファイル: sample_qc.py プロジェクト: tpoterba/gnomad_qc
def compute_sample_qc() -> hl.Table:
    """
    Compute sample QC metrics on the gnomAD v3 MT, stratified by bi-/multi-allelic sites.

    The MT is restricted to autosomes, to rows with more than one allele and to the `GT`
    entry field. The `n_called`, `n_not_called`, `n_filtered` and `call_rate` metrics are
    dropped from every resulting struct.

    :return: Sample QC Table, repartitioned to 100 partitions
    """
    logger.info("Computing sample QC")

    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = filter_to_autosomes(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)
    mt = mt.select_entries('GT')

    strata = {
        'bi_allelic': bi_allelic_expr(mt),
        'multi_allelic': ~bi_allelic_expr(mt),
    }
    # Strip the last three characters of the output path to form the temp prefix.
    tmp_prefix = get_sample_qc().path[:-3]
    sample_qc_ht = compute_stratified_sample_qc(
        mt,
        strata=strata,
        tmp_ht_prefix=tmp_prefix,
        gt_expr=mt.GT)

    # Remove annotations that cannot be computed from the sparse format
    unsupported_metrics = ('n_called', 'n_not_called', 'n_filtered',
                           'call_rate')
    trimmed_fields = {}
    for field_name in sample_qc_ht.row_value:
        trimmed_fields[field_name] = sample_qc_ht[field_name].drop(
            *unsupported_metrics)
    sample_qc_ht = sample_qc_ht.annotate(**trimmed_fields)

    return sample_qc_ht.repartition(100)
コード例 #17
0
    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    ################################
    ### DO NOT RUN THIS, this is now part of 3a.annotate_ht_after_RF_lustre.py
    #################################

    # Load the family-stats table for the given RF run and the filtered MT
    # (paths are built from `lustre_dir`, defined outside this excerpt).
    run_hash = "ae281191"
    ht = hl.read_table(
        f'{lustre_dir}/variant_qc/models/{run_hash}_megaWES_RF_SYNONYMOUS_denovo_family_stats.ht'
    )

    mt_filtered = hl.read_matrix_table(
        f'{lustre_dir}/variant_qc/MegaWESSanger_cohorts_AC_synonymous_filtered.mt'
    )
    mt_filtered = filter_to_autosomes(mt_filtered)

    # Split entries by the first AC value: AC == 2 is used for the "transmitted"
    # counts and AC == 1 for the "untransmitted" counts.
    mt_trans = mt_filtered.filter_entries(mt_filtered.info.AC[0] == 2)
    mt_untrans = mt_filtered.filter_entries(mt_filtered.info.AC[0] == 1)

    # NOTE(review): `&` binds tighter than `|`, so the condition below evaluates as
    # (AC == 2 & proband het & father het) | (mother het). If the intent is
    # "proband het and (father het or mother het)", the last two terms need their
    # own parentheses - confirm before relying on this count.
    mt_trans_count = mt_trans.group_cols_by(
        mt_trans.id).aggregate(transmitted_singletons_count=hl.agg.count_where(
            (mt_trans.info.AC[0] == 2)
            & (mt_trans.proband_entry.GT.is_het_ref())
            & (mt_trans.father_entry.GT.is_het_ref())
            | (mt_trans.mother_entry.GT.is_het_ref())))

    # Number of entries whose per-group count is exactly 1.
    Total_transmitted_singletons = mt_trans_count.aggregate_entries(
        hl.agg.count_where(mt_trans_count.transmitted_singletons_count == 1))
    print(Total_transmitted_singletons)
    mt_untrans_count = (mt_untrans.group_cols_by(mt_untrans.id).aggregate(
コード例 #18
0
    size=8,
)

# Render the figure `p` to HTML with `file_html` and save it to disk.
html = file_html(p, CDN, "Chart")

with open("1 Mean Call Rate by Mean DP.html", "w") as f:
    f.write(html)

# Keep only samples whose call rate is at least CALL_RATE
mt = mt.filter_cols(mt.sample_qc.call_rate >= CALL_RATE)

# Keep only samples whose mean read depth (DP) is at least READ_DEPTH
mt = mt.filter_cols(mt.sample_qc.dp_stats.mean >= READ_DEPTH)

# Preparing for PCA: autosomes only, rows with exactly two alleles
for_pca = filter_to_autosomes(mt)
for_pca = for_pca.filter_rows(for_pca.n_alleles == 2)

# Performing the PCA
sample_num = for_pca.cols().count()

# The number of PCs is a third of the sample count, clamped to the range [1, 10].
_, scores, _ = hl.hwe_normalized_pca(
    for_pca.GT, k=max(1, min(sample_num // 3, 10)), compute_loadings=False
)

relatedness_ht = hl.pc_relate(
    for_pca.GT,
    min_individual_maf=0.01,
    scores_expr=scores[for_pca.col_key].scores,
    block_size=4096,
    min_kinship=0.05,