Code Example #1
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        # checkpoint keeps a usable handle; MatrixTable.write() returns None
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external MAF to disk...')
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequence categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to a set of the groups that are defined
    # (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) |
        (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())
    mts = []

    if args.homs:
        # select homs genotypes.

        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run Fisher's exact test (FET) stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
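
A note on `compute_fisher_exact`, which is project code not shown in this listing: below is a minimal sketch of such a helper around Hail's built-in `hl.fisher_exact_test`. The 2x2 contingency layout (carriers vs. non-carriers in cases and controls) and the meaning of `correct_total_counts` are assumptions, not the project's confirmed implementation.

import hail as hl

def compute_fisher_exact(tb, n_cases_col, n_control_col,
                         total_cases_col, total_controls_col,
                         correct_total_counts, root_col_name,
                         extra_fields):
    """Per-gene Fisher's exact test on carrier counts (sketch, assumptions above)."""
    if correct_total_counts:
        # Assumed correction: subtract carriers so the second cell of each
        # row holds non-carriers rather than total sample counts.
        non_carrier_cases = tb[total_cases_col] - tb[n_cases_col]
        non_carrier_controls = tb[total_controls_col] - tb[n_control_col]
    else:
        non_carrier_cases = tb[total_cases_col]
        non_carrier_controls = tb[total_controls_col]
    fet = hl.fisher_exact_test(hl.int32(tb[n_cases_col]),
                               hl.int32(non_carrier_cases),
                               hl.int32(tb[n_control_col]),
                               hl.int32(non_carrier_controls))
    # Nest the test results under root_col_name and tag extra metadata.
    return tb.annotate(**{root_col_name: fet}, **extra_fields)
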
Code Example #2
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1kgenome and keep only locus defined in interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)).
                    naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
                                       overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
                        overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table.write(output=output_ht_path)

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("Done!")
Code Example #3
File: pca.py  Project: enriquea/wes_chd_ukbb
def main(args):

    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters))
        mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key])))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        mt = (mt.filter_rows(
            bi_allelic_expr(mt) & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(
                500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
                           overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key])))

        logger.info("Writing filtered MT with ld-pruned variants...")
        (mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                 part='high_callrate_common_snp_biallelic',
                                 split=True,
                                 ld_pruned=True),
                  overwrite=args.overwrite))

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues}))

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **{'PC' + str(k + 1): pc_scores.scores[k]
           for k in range(args.n_pcs)}).drop('scores'))

    logger.info("Writing HT with PCA results...")
    # write as HT
    output_ht_path = args.output_ht
    pca_table = (pca_table.checkpoint(output=output_ht_path,
                                      overwrite=args.overwrite))

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("PCA pipeline finalised...")
Code Example #4
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import adj genotype MT and remove samples failing QC
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='sample_qc_adj_genotypes',
                       split=True))

    # keep samples passing QC filtering
    mt = (mt.filter_cols(mt.pass_filters).select_cols().select_rows())

    # import variant info fields (vcf info)
    variant_info_ht = (get_vep_annotation_ht().drop('vep'))

    # Add useful annotation for variant hard filter
    ht = (
        mt.annotate_rows(
            inbreeding_coeff=variant_info_ht[mt.row_key].info.InbreedingCoeff,
            vqsr_filter=variant_info_ht[mt.row_key].filters,
            VQSLOD=variant_info_ht[mt.row_key].info.VQSLOD,
            gt_counts=hl.agg.count_where(hl.is_defined(
                mt.GT))  # expected MT filtered to high-quality GT
        ).rows())

    # 1. Apply variant hard filters
    # hard filter expression
    variant_hard_filter_expr = {
        'fail_inbreeding_coeff':
        ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF,
        'AC0': ht.gt_counts == 0
    }

    ht = (ht.annotate(**variant_hard_filter_expr))

    # 2. Apply VQSR filter
    ht = (ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0))

    # 3. Apply RF filter

    # import/parse rf final HT
    ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result'))

    ht_rf = (ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'],
                          variant_type=ht_rf.variant_type))

    ht = (ht.annotate(**ht_rf[ht.key]))

    ht = (ht.annotate(fail_rf=hl.case().when(
        (ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF)
        & (ht.variant_type == 'snv'), True).when(
            (ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF)
            & (ht.variant_type == 'indel'), True).default(False)))

    # 4. Apply coverage/capture interval filters

    ## gnomad genome coverage
    gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by()
    gnomad_coverage_ht = (gnomad_coverage_ht.annotate(locus=hl.parse_locus(
        gnomad_coverage_ht.locus, reference_genome='GRCh38')).key_by('locus'))
    ht = (ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10))
    ht = (ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9))

    ## defined in capture intervals

    # filter to capture intervals (intersect)
    ht_defined_intervals = filter_capture_intervals(ht)
    ht = (ht.annotate(is_defined_capture_intervals=hl.is_defined(
        ht_defined_intervals[ht.key])))

    # 5. Summary final variant QC

    # final variant qc filter joint expression
    final_variant_qc_ann_expr = {
        'pass_variant_qc_filters':
        hl.cond(
            ~ht.fail_inbreeding_coeff & ~ht.AC0 & ~ht.fail_vqsr & ~ht.fail_rf
            & ht.is_coveraged_gnomad_genomes & ht.is_defined_capture_intervals,
            True, False)
    }
    ht = (ht.annotate(**final_variant_qc_ann_expr))

    # Count the number of variants (SNVs and indels) affected by each filter and add the counts as a global field
    filter_flags = [
        'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf',
        'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals',
        'pass_variant_qc_filters'
    ]

    summary_filter_expr = {
        v: hl.struct(
            **{
                f: hl.agg.filter(ht.variant_type == v, hl.agg.counter(ht[f]))
                for f in filter_flags
            })
        for v in ['snv', 'indel']
    }

    ht = ht.annotate_globals(
        summary_filter=ht.aggregate(summary_filter_expr, _localize=False))

    # write HT variant QC final table
    output_path = get_variant_qc_ht_path(dataset=args.exome_cohort,
                                         part='final_qc')
    ht = ht.checkpoint(output_path, overwrite=args.overwrite)

    # print filter summary
    logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}')

    # export HT to file
    if args.write_to_file:
        ht.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #5
File: mt_query.py  Project: enriquea/wes_chd_ukbb
def main(args):
    ## Init Hail
    hl.init(default_reference=args.default_ref_genome)

    ## Import unfiltered MT with adjusted genotypes
    ds = args.exome_cohort
    mt = hl.read_matrix_table(get_qc_mt_path(dataset=ds,
                                             part='unphase_adj_genotypes',
                                             split=True))

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt
          .annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                         Consequence=vep_ht[mt.row_key].vep.Consequence,
                         DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                         SYMBOL=vep_ht[mt.row_key].vep.SYMBOL)
          )

    ## Parse geneset
    geneset = parse_geneset(args.geneset_file)

    ## Filter to geneset
    mt = (mt
          .filter_rows(hl.set(geneset).contains(mt.SYMBOL))
          .checkpoint(f'{nfs_tmp}/tmp.mt',
                      overwrite=True)
          )

    ## Sample-QC filtering
    if args.apply_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info('Writing sample qc-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                          overwrite=True)
              )

    ## Variant-QC filtering
    if args.apply_variant_qc_filtering:
        logger.info('Applying per variant QC filtering...')

        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info('Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                          overwrite=True)
              )

    ## Filtering by AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if args.apply_af_filtering:
        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt
              .annotate_rows(**af_ht[mt.row_key])
              )

        filter_expressions = [af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
                              af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
                              ]

        mt = (mt
              .filter_rows(functools.reduce(operator.iand, filter_expressions), keep=True)
              )

        logger.info('Writing AF-filtered MT to disk...')
        mt = (mt
              .checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                          overwrite=True)
              )

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Generate blind sample IDs
    mt = mt.add_col_index()

    mt = (mt
          .annotate_cols(BIID=hl.str('BLIND_ID_') + hl.str(mt.col_idx))
          )

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt
          .annotate_cols(**tb_sample[mt.s])
          )

    mt = (mt
          .filter_cols(mt['phe.is_case'] | mt['phe.is_control'])
          )

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Annotate variants ID
    mt = annotate_variant_id(mt)

    # annotate samples
    ann_expr = {'n_het_cases': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'], hl.agg.count()),
                'n_hom_cases': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'], hl.agg.count()),
                'n_het_syndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_syndromic'], hl.agg.count()),
                'n_hom_syndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_syndromic'], hl.agg.count()),
                'n_het_nonsyndromic': hl.agg.filter(mt.GT.is_het() & mt['phe.is_nonsyndromic'], hl.agg.count()),
                'n_hom_nonsyndromic': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_nonsyndromic'], hl.agg.count()),
                'n_het_controls': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'], hl.agg.count()),
                'n_hom_controls': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'], hl.agg.count()),
                'het_case_ids': hl.agg.filter(mt.GT.is_het() & mt['phe.is_case'],
                                              hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'hom_case_ids': hl.agg.filter(mt.GT.is_hom_var() & mt['phe.is_case'],
                                              hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'het_control_ids': hl.agg.filter(mt.GT.is_het() & ~mt['phe.is_case'],
                                                 hl.delimit(hl.agg.collect_as_set(mt.BIID), '|')),
                'hom_control_ids': hl.agg.filter(mt.GT.is_hom_var() & ~mt['phe.is_case'],
                                                 hl.delimit(hl.agg.collect_as_set(mt.BIID), '|'))
                }
    ht = (mt
          .annotate_rows(**ann_expr)
          .rows()
          .key_by()
          .select(*list(['vid', 'Consequence', 'SYMBOL', 'internal_af', 'gnomAD_AF', 'vep.MVP_score', 'vep.REVEL_score',
                         'vep.MPC_score', 'vep.CADD_PHRED']) + list(ann_expr.keys()))
          )

    # export results
    (ht
     .export(args.output_file)
     )
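
`annotate_variant_id` is a project helper not shown here. Judging from the `vid` field in the final `select`, it likely builds a string identifier from the row key; a plausible minimal sketch, under that assumption:

import hail as hl

def annotate_variant_id(mt: hl.MatrixTable,
                        field_name: str = 'vid') -> hl.MatrixTable:
    """Annotate rows with a chrom:pos:ref:alt style ID (sketch)."""
    return mt.annotate_rows(**{
        field_name: hl.delimit([mt.locus.contig,
                                hl.str(mt.locus.position),
                                mt.alleles[0],
                                mt.alleles[1]], ':')
    })
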
Code Example #6
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import raw split MT
    mt = (get_mt_data(dataset=args.exome_cohort, part='raw',
                      split=True).select_cols())

    ht = (mt.cols().key_by('s'))

    # Annotate samples filters
    sample_qc_filters = {}

    # 1. Add sample hard filters annotation expr
    sample_qc_hard_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort, part='hard_filters'))

    sample_qc_filters.update(
        {'hard_filters': sample_qc_hard_filters_ht[ht.s]['hard_filters']})

    # 2. Add population qc filters annotation expr
    sample_qc_pop_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort, part='population_qc'))

    sample_qc_filters.update(
        {'predicted_pop': sample_qc_pop_ht[ht.s]['predicted_pop']})

    # 3. Add relatedness filters annotation expr
    related_samples_to_drop = get_related_samples_to_drop()
    related_samples = hl.set(
        related_samples_to_drop.aggregate(
            hl.agg.collect_as_set(related_samples_to_drop.node.id)))

    sample_qc_filters.update({'is_related': related_samples.contains(ht.s)})

    # 4. Add stratified sample qc (population/platform) annotation expr
    sample_qc_pop_platform_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='stratified_metrics_filter'))

    sample_qc_filters.update({
        'pop_platform_filters':
        sample_qc_pop_platform_filters_ht[ht.s]['pop_platform_filters']
    })

    ht = (ht.annotate(**sample_qc_filters))

    # Final sample qc filter joint expression
    final_sample_qc_ann_expr = {
        'pass_filters':
        hl.cond((hl.len(ht.hard_filters) == 0) &
                (hl.len(ht.pop_platform_filters) == 0) &
                (ht.predicted_pop == 'EUR') & ~ht.is_related, True, False)
    }
    ht = (ht.annotate(**final_sample_qc_ann_expr))

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='final_qc')

    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Export final sample QC annotations to file
    if args.write_to_file:
        (ht.export(f'{output_path_ht}.tsv.bgz'))

    ## Release final unphased MT filtered to adjusted genotypes
    mt = unphase_mt(mt)
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj).select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphased MT with adjusted genotypes to disk...')
    # write MT
    mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                            part='unphase_adj_genotypes',
                            split=True),
             overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #7
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        # checkpoint keeps a usable handle; MatrixTable.write() returns None
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                           overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                           overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt = mt.checkpoint(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                           overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######

    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequence categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to a set of the groups that are defined
    # (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample-by-gene/variant-type (binary) matrix, aggregating genotypes as follows:
    #
    #   a) entry: hets
    #   b) entry: homs
    #   c) entry: chets (compound hets)

    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested cluster_id before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # Second-step aggregation:
    # Generate a sample-by-gene-set/variant-type matrix, aggregating genotypes as follows:
    # if dominant -> sum hets (default)
    # if recessive -> sum (homs)
    # if recessive (a) -> sum (chets)
    # if recessive (b) -> sum (chets and/or homs)

    mts = []

    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(mac=hl.int(
                hl.agg.count_where(mt_grouped.chets
                                   | mt_grouped.homs))).repartition(100).
                         persist().annotate_rows(agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add samples annotations
    # annotate sample covs
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        mt_tmp = None  # assigned below according to proband group

        if proband == 'all_cases':
            mt_tmp = mt_joint
        if proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        if proband == 'nonsyndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })

        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
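
`logistic_regression` here is a project wrapper, not a Hail built-in. A minimal sketch of what such a wrapper might look like around `hl.logistic_regression_rows`, with `extra_fields` attached as plain annotations (the exact field handling is an assumption):

import hail as hl

def logistic_regression(mt, x_expr, response, covs, pass_through,
                        extra_fields):
    """Per-row Wald logistic regression of case status on 'mac' (sketch)."""
    ht = hl.logistic_regression_rows(
        test='wald',
        y=mt[response],
        x=mt[x_expr],
        # Hail requires the intercept (1.0) to be passed explicitly.
        covariates=[1.0] + [mt[c] for c in covs],
        pass_through=pass_through)
    return ht.annotate(**extra_fields)
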