Code Example #1
File: test_impex.py  Project: lfrancioli/hail
    def test_import_fam(self):
        fam_file = resource('sample.fam')
        nfam = hl.import_fam(fam_file).count()
        i = 0
        with open(fam_file) as f:
            for line in f:
                if len(line.strip()) != 0:
                    i += 1
        self.assertEqual(nfam, i)
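
For reference, hl.import_fam parses a PLINK-format .fam file into a Table keyed by the sample ID, which is what the count above exercises. A minimal sketch (the path is hypothetical; the field names follow Hail's documented schema):

import hail as hl

ht = hl.import_fam('data/sample.fam')  # hypothetical path
ht.describe()
# Row fields, per the Hail docs:
#   'id': str (key), 'fam_id': str, 'pat_id': str, 'mat_id': str,
#   'is_female': bool, 'is_case': bool
print(ht.count())  # one row per non-blank line of the .fam file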
Code Example #2
File: test_family_methods.py  Project: jigold/hail
    def test_trio_matrix_null_keys(self):
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        ht = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

        # Make keys all null
        mt = mt.key_cols_by(s=hl.null(hl.tstr))

        tt = hl.trio_matrix(mt, ped, complete_trios=True)
        self.assertEqual(tt.count_cols(), 0)
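
The test above works because hl.trio_matrix matches pedigree sample IDs against the MatrixTable's column key, and a null key can never equal an ID, so no complete trio survives. A toy sketch of the same mechanism (synthetic data; the sample IDs are made up):

import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=4, n_variants=5)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))  # pedigree IDs are strings
ped = hl.Pedigree([hl.Trio(s='0', pat_id='1', mat_id='2', fam_id='f1')])
print(hl.trio_matrix(mt, ped, complete_trios=True).count_cols())  # 1
print(hl.trio_matrix(mt.key_cols_by(s=hl.null(hl.tstr)), ped,
                     complete_trios=True).count_cols())           # 0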
Code Example #3
    def test_trio_matrix(self):
        """
        This test depends on certain properties of the trio matrix VCF and
        pedigree structure. This test is NOT a valid test if the pedigree
        includes quads: the trio_matrix method will duplicate the parents
        appropriately, but the genotypes_table and samples_table orthogonal
        paths would require another duplication/explode that we haven't written.
        """
        ped = hl.Pedigree.read(resource('triomatrix.fam'))
        ht = hl.import_fam(resource('triomatrix.fam'))

        mt = hl.import_vcf(resource('triomatrix.vcf'))
        mt = mt.annotate_cols(fam=ht[mt.s].fam_id)

        dads = ht.filter(hl.is_defined(ht.pat_id))
        dads = dads.select(dads.pat_id, is_dad=True).key_by('pat_id')

        moms = ht.filter(hl.is_defined(ht.mat_id))
        moms = moms.select(moms.mat_id, is_mom=True).key_by('mat_id')

        et = (mt.entries()
              .key_by('s')
              .join(dads, how='left')
              .join(moms, how='left'))
        et = et.annotate(is_dad=hl.is_defined(et.is_dad),
                         is_mom=hl.is_defined(et.is_mom))

        et = (et
              .group_by(et.locus, et.alleles, fam=et.fam)
              .aggregate(data=hl.agg.collect(hl.struct(
                  role=hl.case().when(et.is_dad, 1).when(et.is_mom, 2).default(0),
                  g=hl.struct(GT=et.GT, AD=et.AD, DP=et.DP, GQ=et.GQ, PL=et.PL)))))

        et = et.filter(hl.len(et.data) == 3)
        et = et.select('data').explode('data')

        tt = hl.trio_matrix(mt, ped, complete_trios=True).entries().key_by('locus', 'alleles')
        tt = tt.annotate(fam=tt.proband.fam,
                         data=[hl.struct(role=0, g=tt.proband_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=1, g=tt.father_entry.select('GT', 'AD', 'DP', 'GQ', 'PL')),
                               hl.struct(role=2, g=tt.mother_entry.select('GT', 'AD', 'DP', 'GQ', 'PL'))])
        tt = tt.select('fam', 'data').explode('data')
        tt = tt.filter(hl.is_defined(tt.data.g)).key_by('locus', 'alleles', 'fam')

        self.assertEqual(et.key.dtype, tt.key.dtype)
        self.assertEqual(et.row.dtype, tt.row.dtype)
        self.assertTrue(et._same(tt))

        # test annotations
        e_cols = (mt.cols()
                  .join(dads, how='left')
                  .join(moms, how='left'))
        e_cols = e_cols.annotate(is_dad=hl.is_defined(e_cols.is_dad),
                                 is_mom=hl.is_defined(e_cols.is_mom))
        e_cols = (e_cols.group_by(fam=e_cols.fam)
                  .aggregate(data=hl.agg.collect(hl.struct(role=hl.case()
                                                           .when(e_cols.is_dad, 1).when(e_cols.is_mom, 2).default(0),
                                                           sa=hl.struct(**e_cols.row.select(*mt.col))))))
        e_cols = e_cols.filter(hl.len(e_cols.data) == 3).select('data').explode('data')

        t_cols = hl.trio_matrix(mt, ped, complete_trios=True).cols()
        t_cols = t_cols.annotate(fam=t_cols.proband.fam,
                                 data=[
                                     hl.struct(role=0, sa=t_cols.proband),
                                     hl.struct(role=1, sa=t_cols.father),
                                     hl.struct(role=2, sa=t_cols.mother)]).key_by('fam').select('data').explode('data')
        t_cols = t_cols.filter(hl.is_defined(t_cols.data.sa))

        self.assertEqual(e_cols.key.dtype, t_cols.key.dtype)
        self.assertEqual(e_cols.row.dtype, t_cols.row.dtype)
        self.assertTrue(e_cols._same(t_cols))
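
The final checks above rely on Table._same, the (private) helper Hail's own test suite uses for whole-table equality once the rows have been rebuilt through an independent path. A reduced sketch of that comparison pattern (toy tables):

import hail as hl

t1 = hl.utils.range_table(3).annotate(x=2)
t2 = hl.utils.range_table(3).annotate(x=2)
assert t1._same(t2)  # same key, row type, and rows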
Code Example #4
def get_gnomad_data(data_type: str,
                    adj: bool = False,
                    split: bool = True,
                    raw: bool = False,
                    non_refs_only: bool = False,
                    hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: Optional[str] = None,
                    meta_root: Optional[str] = 'meta',
                    full_meta: bool = False,
                    fam_version: str = CURRENT_FAM,
                    fam_root: Optional[str] = None,
                    duplicate_mapping_root: Optional[str] = None,
                    release_samples: bool = False,
                    release_annotations: Optional[str] = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as VDS. By default, returns split hardcalls (with adj annotated but not filtered)
    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies to raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotype only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param str full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of metadata (default to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    from gnomad_hail.utils import filter_to_adj  # used below when adj=True

    if raw and split:
        raise DataException(
            'No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type,
                                 split=split,
                                 non_refs_only=non_refs_only,
                                 hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(
            get_gnomad_data_path(data_type,
                                 hardcalls=not raw,
                                 split=split,
                                 hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(
            genomes_exomes_duplicate_ids_tsv_path,
            impute=True,
            key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_ht = get_gnomad_public_data(data_type, split)
        mt = mt.select_rows(**sites_ht[mt.row_key])

    # Required since a backward-incompatible change in Hail
    mt = mt.select_globals()

    return mt
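
A hedged usage sketch for the wrapper above (it assumes the gnomad_hail package is importable and the caller has access to the gnomAD buckets; the import path and parameter values are illustrative):

import hail as hl
from gnomad_hail import get_gnomad_data  # hypothetical import path

mt = get_gnomad_data(
    'exomes',        # data_type
    adj=True,        # filter to adj genotypes
    fam_root='fam')  # columns gain a `fam` struct via hl.import_fam
# e.g. restrict to samples that appear in the pedigree
mt = mt.filter_cols(hl.is_defined(mt.fam.fam_id))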
Code Example #5
def main(args):
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(
        basename(args.input_mt))[0]

    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(
                mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'),
                                     overwrite=args.overwrite)
        strat_hts = [
            hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats
        ]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr([sample_qc_ht.sample_qc] + [
                strat_hts[i][sample_qc_ht.key].sample_qc
                for i in range(0, len(strat_hts))
            ]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), args.overwrite)

    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'),
                        overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'),
                          overwrite=args.overwrite)

    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        platform_ht.write(f'{output_prefix}.platform_pca_results.ht',
                          overwrite=args.overwrite)

    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold, args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), args.overwrite)

        logger.info('Running PC-Relate')
        logger.warning(
            "PC-Relate requires SSDs and doesn't work with preemptible workers!")
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), args.overwrite)

    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        samples_rankings_ht = sample_qc_ht.select(
            rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')), samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s),
                           duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')), dups_to_remove)
        ped.write(path('pedigree.ped'))

    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')), hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))

        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i,
            related_pairs_ht.j,
            keep=False,
            tie_breaker=related_pairs_tie_breaker)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(
            **related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'),
                                         overwrite=args.overwrite)

    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')), args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'), args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'), args.overwrite)

    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[
            pop_pca_scores_ht.key]
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(known_pop=hl.or_missing(
            gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)

        pop_ht.write(path('pop.ht'), args.overwrite)
        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)

    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        qc_mt = hl.filter_intervals(qc_mt,
                                    platform_specific_intervals,
                                    keep=False)

        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(
            path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr),
                                  known_subpop=hl.null(hl.tstr))
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(
            kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(
            known_pop=kgp_mt.super_pops.get(kgp_mt.population, "oth").lower(),
            known_subpop=kgp_mt.population.lower())
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(
            qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'),
                              overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'),
                                        args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'),
                                      args.overwrite)

    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(
            path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)

        union_kgp_pop_ht.write(path('union_kgp_pop.ht'), args.overwrite)

        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt,
                                              platform_specific_intervals,
                                              keep=False)

        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)

        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            pop_ht = hl.read_table(path('pops.ht'))
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht, args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)

    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename(
                {'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(
                related_filtered=True),
            hl.read_table(path('pca_scores.ht')).rename(
                {'scores': 'pop_pc_scores'}),
            hl.read_table(path('pops.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename(
                {'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename(
                    {
                        f: f'{variant_class_prefix}{f}'
                        for f in list(stratified_metrics_filters_ht.globals) +
                        list(stratified_metrics_filters_ht.row_value)
                    })
            meta_annotation_hts.extend(
                [sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        meta_ht = meta_ht.annotate_globals(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht.index_globals().items()
            })

        meta_ht = meta_ht.annotate(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht[meta_ht.key].items()
            })

        filtering_col_prefix = '' if args.filtering_variant_class == 'all' else args.filtering_variant_class + "_"
        meta_ht = meta_ht.annotate_globals(
            filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate': meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate': hl.is_defined(meta_ht.dup_filtered)
                & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[
                f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
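
The pipeline above leans on a path helper that is not shown in this excerpt. Since args.assign_platforms writes f'{output_prefix}.platform_pca_results.ht' while later steps read path('platform_pca_results.ht'), a plausible reconstruction (an assumption, not the project's actual code) is:

def path(suffix: str) -> str:
    # Hypothetical reconstruction: join the run's output_prefix
    # (set at the top of main) to an output name.
    return f'{output_prefix}.{suffix}'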
Code Example #6

import time as tm

import hail as hl


def pbt_phased_trios_mt_path(data_type: str,
                             split: bool = True,
                             hail_version: str = CURRENT_HAIL_VERSION):
    return "gs://gnomad/hardcalls/hail-{0}/mt/{1}/gnomad.{1}.trios.pbt_phased{2}.mt".format(
        hail_version, data_type, "" if split else ".unsplit")


exomes = hl.read_matrix_table(pbt_phased_trios_mt_path("exomes"))
exomes = exomes.filter_cols(exomes.s == exomes.source_trio.proband.s)
df = phase_sensitivity_fast(
    exomes, windowsize=100)  # should be manageable for a single individual
print("per indv exome done " + tm.ctime())
df["categ"] = df.index
hl.Table.from_pandas(df).export(
    "gs://gnomad-qingbowang/MNV/phase_sensitivity_exome_proband_w100.tsv")

genomes = hl.read_matrix_table(pbt_phased_trios_mt_path("genomes"))
fam_ht = hl.import_fam(fam_path("genomes"),
                       delimiter="\t")  # for genomes, we need to annotate the trio info ourselves
genomes = genomes.annotate_cols(source_trio=fam_ht[genomes.s])
genomes = genomes.filter_cols(
    hl.len(genomes.source_trio.fam_id) > 0)  # keep samples present in the fam file (the children)
df = phase_sensitivity_fast(
    genomes, windowsize=100)  # should be manageable for a single individual
print("per indv genome done " + tm.ctime())
df["categ"] = df.index
hl.Table.from_pandas(df).export(
    "gs://gnomad-qingbowang/MNV/phase_sensitivity_genome_proband_w100.tsv")