Example #1
def generate_interval_list_ht(genome_ref: str = 'GRCh38') -> hl.Table:
    """
    Generate a list of intervals (union)

    :return: A joint table (union) of intervals
    """

    intervals = [
        get_ssv2_intervals_ht(),
        get_ssv3_intervals_ht(),
        get_ssv4_intervals_ht(),
        get_ssv5_intervals_ht(),
        get_idt_xgen_intervals_ht()
    ]

    # get global annotation(s) from input tables
    sources = [t.source.collect()[0] for t in intervals]
    platform_labels = [t.platform_label.collect()[0] for t in intervals]

    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), sources, genome_ref, platform_labels)))

    # keep only the interval <key> field for all tables
    intervals = [ht.key_by('interval').select() for ht in intervals]

    ht_interval = (hl.Table.union(*intervals).select_globals())

    ht_interval = ht_interval.annotate_globals(**global_ann_expr)

    assert ht_interval.key.interval.dtype == hl.dtype(
        f'interval<locus<{genome_ref}>>')

    return ht_interval
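# A minimal usage sketch for the function above. It assumes GLOBAL_ANNOTATION_FIELDS,
# current_date() and the get_*_intervals_ht() helpers are defined elsewhere in the
# project, e.g. GLOBAL_ANNOTATION_FIELDS = ('date', 'source', 'reference_genome',
# 'platform_label') to match the zip() order used above.
ht = generate_interval_list_ht(genome_ref='GRCh38')
ht.describe()        # row key: interval<locus<GRCh38>>, no extra row fields
ht.globals.show()    # date, sources, reference genome, platform labels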
Example #2
def import_intervals_from_bed(bed_path: str, platform_label: str,
                              genome_ref: str) -> hl.Table:
    """
    Handle importing BED files as intervals. Recode contig if necessary and
    annotate global meta-info.
    Note: `platform_label` and `genome_ref` are required, since this information
          is used for the global annotations.

    :param bed_path: Path to capture interval BED file
    :param platform_label: Unique capture interval identifier (e.g. 'ssv3')
    :param genome_ref: Either 'GRCh37' or 'GRCh38'

    :return: HailTable keyed by interval
    """

    # genome references
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')

    # contig recoding dict, GRCh38 -> GRCh37
    # (autosomes and sex chromosomes only)
    CONTIG_RECODING_HG38_TO_HG37 = {
        contig: contig.replace('chr', '')
        for contig in rg38.contigs[:24]
    }

    # contig recoding dict, GRCh37 -> GRCh38
    # (autosomes and sex chromosomes only)
    CONTIG_RECODING_HG37_TO_HG38 = {
        v: k
        for k, v in CONTIG_RECODING_HG38_TO_HG37.items()
    }

    # Recode contigs if the chromosome naming in the BED file does not match the genome reference.
    if genome_ref == 'GRCh37':
        contig_recoding = CONTIG_RECODING_HG38_TO_HG37
    elif genome_ref == 'GRCh38':
        contig_recoding = CONTIG_RECODING_HG37_TO_HG38
    else:
        contig_recoding = None

    ht_intervals = hl.import_bed(bed_path,
                                 reference_genome=genome_ref,
                                 contig_recoding=contig_recoding)

    global_ann_expr = dict(
        zip(GLOBAL_ANNOTATION_FIELDS,
            (current_date(), bed_path, genome_ref, platform_label)))

    ht_intervals = (ht_intervals.annotate_globals(
        **global_ann_expr).key_by('interval').repartition(100))
    return ht_intervals
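# A standalone sketch of the contig-recoding idea used above; the BED path is a
# placeholder. A BED file with GRCh37-style contig names ('1', ..., 'X', 'Y') can be
# imported against GRCh38 by recoding contigs on the fly:
import hail as hl

recoding = {str(c): f'chr{c}' for c in list(range(1, 23)) + ['X', 'Y']}
ht = hl.import_bed('capture_regions.grch37_names.bed',
                   reference_genome='GRCh38',
                   contig_recoding=recoding)
ht = ht.key_by('interval').select()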
Example #3
gnomad_exomes_af_fields = [
    # ...
    'gnomAD_FIN_AF'
]

gnomad_exomes_af_expr = {
    f: hl.parse_float(variant_ht.vep[f])
    for f in gnomad_exomes_af_fields
}

# add gnomad exomes AF expression to annotation dict
af_ann_expr.update(gnomad_exomes_af_expr)

## annotate afs
variant_ht = (variant_ht.annotate(**af_ann_expr))

af_fields = list(af_ann_expr.keys())
variant_ht = (variant_ht.select(*af_fields))

## add global annotation
date = current_date()
global_ann_expr = {'date': date, 'af_fields': af_fields}
variant_ht = (variant_ht.annotate_globals(**global_ann_expr))

## export af table

# write to Hail table
output_path_ht = f'{nfs_dir}/hail_data/hts/chd_ukbb.variants.af.annotations.external.{date}.ht'
variant_ht = (variant_ht.checkpoint(output_path_ht, overwrite=True))

# write to TSV file
(variant_ht.export(f'{output_path_ht}.tsv.bgz'))
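# A toy illustration of the hl.parse_float() pattern used above: AF fields arrive as
# strings from the VEP annotation table and unparseable values become missing. The
# field name here is a placeholder, not necessarily one of the project's real fields.
import hail as hl

toy_ht = hl.Table.parallelize(
    [{'gnomAD_AF': '0.0031'}, {'gnomAD_AF': ''}],
    hl.tstruct(gnomAD_AF=hl.tstr))
toy_ht = toy_ht.annotate(gnomAD_AF_num=hl.parse_float(toy_ht.gnomAD_AF))
toy_ht.show()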
Example #4
def liftover_intervals(t: hl.Table,
                       keep_missing_interval: bool = False) -> hl.Table:
    """
    Lift over the loci of intervals from one reference (GRCh37) to another (GRCh38)

    # Example input table description
    #
    # ----------------------------------------
    # Global fields:
    #     None
    # ----------------------------------------
    # Row fields:
    #     'interval': interval<locus<GRCh37>>
    # ----------------------------------------
    # Key: ['interval']
    # ----------------------------------------


    :param t: Table of intervals on GRCh37
    :param keep_missing_interval: If True, keep missing (non-lifted) intervals in the output Table.
    :return: Table with intervals lifted over to GRCh38.
    """

    rg37 = hl.get_reference("GRCh37")
    rg38 = hl.get_reference("GRCh38")

    if not rg37.has_liftover("GRCh38"):
        rg37.add_liftover(
            f'{nfs_dir}/resources/liftover/grch37_to_grch38.over.chain.gz',
            rg38)

    t = t.annotate(
        start=hl.liftover(t.interval.start, "GRCh38"),
        end=hl.liftover(t.interval.end, "GRCh38"),
    )

    t = t.filter((t.start.contig == "chr" + t.interval.start.contig)
                 & (t.end.contig == "chr" + t.interval.end.contig))

    t = t.key_by()

    t = (t.select(interval=hl.locus_interval(t.start.contig,
                                             t.start.position,
                                             t.end.position,
                                             reference_genome=rg38,
                                             invalid_missing=True),
                  interval_hg37=t.interval))

    # count intervals that failed to lift over
    missing = t.aggregate(hl.agg.count_where(~hl.is_defined(t.interval)))
    logger.info(
        f"Number of missing intervals: {missing} out of {t.count()}...")

    # update globals annotations
    global_ann_expr = {
        'date': current_date(),
        'reference_genome': 'GRCh38',
        'was_lifted': True
    }
    t = t.annotate_globals(**global_ann_expr)

    if not keep_missing_interval:
        logger.info(f"Filtering out {missing} missing intervals...")
        t = t.filter(hl.is_defined(t.interval), keep=True)

    return t.key_by("interval")
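# A standalone sketch of the liftover setup used above (the chain-file path is a
# placeholder); hl.liftover() returns a missing locus when a position cannot be mapped:
import hail as hl

rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
if not rg37.has_liftover('GRCh38'):
    rg37.add_liftover('resources/grch37_to_grch38.over.chain.gz', rg38)

locus37 = hl.locus('1', 55505463, reference_genome='GRCh37')
locus38 = hl.eval(hl.liftover(locus37, 'GRCh38'))  # locus on GRCh38, or missing if unmapped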
Example #5
def main(args):
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # import MT
    mt = hl.read_matrix_table(args.mt_input_path)

    n_variants, n_samples = mt.count()

    # Get the variant table: a table keyed by <locus> or <locus, alleles> with all
    # variants in the dataset and no extra fields (a.k.a. a reference table).
    tb_variants = (mt.select_rows().rows())

    # compute overall coverage
    if args.compute_overall_coverage:
        logger.info(
            f"Computing coverage stats for {n_variants} variants over {n_samples} samples..."
        )
        ht_cov_overall = compute_coverage_stats(mt=mt,
                                                reference_ht=tb_variants)

        tb_variants = (tb_variants.annotate(
            overall=ht_cov_overall[tb_variants.key]))

    # compute coverage stratified by phenotype status (expected binary);
    # requires the input MT to have a boolean case/control field (e.g. is_case)
    if args.compute_phe_coverage:
        logger.info(
            f"Computing coverage stats stratified by phenotype status...")

        # Annotate sample meta info
        # Note: temporary solution; better to import an annotated MT
        mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key]))

        mt = (mt.annotate_cols(
            case_control=hl.if_else(mt[args.phe_field], 'case', 'control')))

        strata = (mt.aggregate_cols(hl.agg.collect_as_set(mt['case_control'])))

        dict_strata_ht = {
            s:
            compute_coverage_stats(mt=mt.filter_cols(mt['case_control'] == s),
                                   reference_ht=tb_variants)
            for s in strata
        }

        for k in dict_strata_ht.keys():
            _tb = dict_strata_ht.get(k)
            tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]})

        if args.run_binomial_test:
            logger.info(f"Running binomial test...")
            # perform a binomial test on coverage and case/control status
            # DOI: https://doi.org/10.1002/acn3.582
            tb_binomial = (tb_variants.annotate(
                n_cases_over_10=hl.int(tb_variants.case.over_10 * 100),
                n_controls_over_10=hl.int(tb_variants.control.over_10 * 100),
                total_cases=tb_variants.case.n_samples,
                total_controls=tb_variants.control.n_samples,
            ).select('n_cases_over_10', 'n_controls_over_10', 'total_cases',
                     'total_controls'))

            binomial_expr = {
                'p_value':
                hl.binom_test(
                    x=tb_binomial.n_cases_over_10,
                    n=tb_binomial.n_cases_over_10 +
                    tb_binomial.n_controls_over_10,
                    p=tb_binomial.total_cases /
                    (tb_binomial.total_cases + tb_binomial.total_controls),
                    alternative='two.sided')
            }

            tb_binomial = (tb_binomial.annotate(**binomial_expr))

            tb_variants = (tb_variants.annotate(
                binomial_stats=tb_binomial[tb_variants.key]))

    # make coverage filter expressions
    # Note: the default number of reads is set to 10X
    logger.info(f"Assigning per site coverage filters...")

    significant_level = args.pvalue_threshold
    min_sample_prop = args.min_sample_proportion

    coverage_filter_dict_expr = {}

    if args.compute_overall_coverage:
        coverage_filter_dict_expr.update({
            'overall_hard_cutoff':
            hl.if_else((tb_variants.overall.over_10 >= min_sample_prop),
                       "pass", "fail")
        })
    if args.compute_phe_coverage:
        # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016
        coverage_filter_dict_expr.update({
            'phe_hard_cutoff':
            hl.if_else((tb_variants.case.over_10 >= min_sample_prop) &
                       (tb_variants.control.over_10 >= min_sample_prop),
                       "concordant", "discordant")
        })
    if args.run_binomial_test:
        coverage_filter_dict_expr.update({
            'phe_binomial':
            hl.if_else(tb_variants.binomial_stats.p_value < significant_level,
                       'dependent', 'independent')
        })

    # annotate coverage filters
    tb_variants = (tb_variants.annotate(coverage_filter=hl.struct(
        **coverage_filter_dict_expr)))

    # add useful global annotations to final coverage stats ht
    # as well as affected/non-affected summary counts per filters
    global_ann_dict_expr = {
        'date': current_date(),
        'mt_path': args.mt_input_path,
        'min_sample_prop': min_sample_prop
    }
    if args.compute_overall_coverage:
        global_ann_dict_expr.update({
            'overall_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(
                    tb_variants.coverage_filter.overall_hard_cutoff))
        })
    if args.compute_phe_coverage:
        global_ann_dict_expr.update({
            'phe_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff))
        })
    if args.run_binomial_test:
        global_ann_dict_expr.update({
            'phe_binomial':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_binomial)),
            'binomial_pvalue_cutoff':
            significant_level
        })

    tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr))

    # check
    tb_variants.globals.show()
    tb_variants.describe()

    # write HT
    tb_variants = tb_variants.checkpoint(output=args.ht_output_path,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (tb_variants.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
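# A toy, self-contained illustration of the per-site binomial test used above
# (all numbers are made up): given 45 cases and 80 controls covered at >=10x at a site,
# and 100 cases out of 220 samples overall, test whether coverage depends on status.
import hail as hl

p_value = hl.eval(
    hl.binom_test(x=45, n=45 + 80, p=100 / 220, alternative='two.sided'))
print(p_value)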
Example #6
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT, restricted to rare variants (external MAF), to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequence categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to the set of keys where the group
    # is defined (easier to explode and group by later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) |
        (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())
    mts = []

    if args.homs:
        # select homs genotypes.

        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Join MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
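# compute_fisher_exact() above is a project helper; a minimal sketch of an equivalent
# built on Hail's 2x2 Fisher's exact test might look like this (column names are the
# ones used above; the helper name add_fet is hypothetical):
def add_fet(tb, n_cases_col, n_controls_col, total_cases_col, total_controls_col):
    return tb.annotate(fet=hl.fisher_exact_test(
        hl.int32(tb[n_cases_col]),
        hl.int32(tb[total_cases_col] - tb[n_cases_col]),
        hl.int32(tb[n_controls_col]),
        hl.int32(tb[total_controls_col] - tb[n_controls_col])))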
Example #7
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt',
                 overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt',
                 overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            f'Writing sample/variant QCed MT with rare variants at maf: {args.af_max_threshold}.'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt',
                 overwrite=True)

    # 4. ##### Run gene-set burden logistic regression ######

    logger.info('Running gene-set burden logistic regression test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variants into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequence categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group, converting the dict to the set of keys where the group
    # is defined (easier to explode and group by later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping (gene clusters are exploded later)
    mt = (mt.explode_rows(mt.csq_group))

    # First-step aggregation:
    # Generate a sample x gene/variant-type (binary) matrix, aggregating genotypes as follows:
    #
    #   a) entry: hets
    #   b) entry: homs
    #   c) entry: chets (compound hets)

    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(
            mt.GT.is_het()) >= 2).repartition(100).persist())

    # Import/generate gene clusters
    clusters = hl.import_table(args.set_file,
                               no_header=True,
                               delimiter="\t",
                               min_partitions=50,
                               impute=False)
    clusters = generate_clusters_map(clusters)

    # Annotate gene-set info
    mt_grouped = (mt_grouped.annotate_rows(**clusters[mt_grouped.SYMBOL]))

    # Explode nested cluster_id (gene-set membership) before grouping
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_id))

    # filter rows with defined consequence and gene-set name
    mt_grouped = (mt_grouped.filter_rows(
        hl.is_defined(mt_grouped.csq_group)
        & hl.is_defined(mt_grouped.cluster_id)))

    # 2. Second-step aggregation:
    # Generate a sample x gene-set/variant-type matrix, aggregating genotypes as follows:
    #   if dominant -> sum hets (default)
    #   if recessive -> sum homs
    #   if recessive (a) -> sum chets
    #   if recessive (b) -> sum chets and/or homs

    mts = []

    if args.homs:
        # Group mt by gene-sets/csq_group aggregating homs genotypes.
        mt_homs = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.homs))).repartition(
                    100).persist().annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # Group mt by gene-sets/csq_group aggregating compound hets (chets) genotypes.
        mt_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.chets))).repartition(
                    100).persist().annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # Group mt by gene-sets/csq_group aggregating chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(mac=hl.int(
                hl.agg.count_where(mt_grouped.chets
                                   | mt_grouped.homs))).repartition(100).
                         persist().annotate_rows(agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # Group mt by gene-sets/csq_group aggregating hets genotypes (default)
        mt_hets = (mt_grouped.group_rows_by(
            mt_grouped.csq_group, mt_grouped.cluster_id).aggregate(
                mac=hl.int(hl.agg.sum(mt_grouped.hets))).repartition(
                    100).persist().annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Join MatrixTables
    mt_joint = hl.MatrixTable.union_rows(*mts)

    ## Add sample annotations
    # annotate sample covariates
    covariates = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_covariates.ht')
    mt_joint = (mt_joint.annotate_cols(**covariates[mt_joint.s]))

    # annotate case/control phenotype info
    tb_sample = get_sample_meta_data()
    mt_joint = (mt_joint.annotate_cols(**tb_sample[mt_joint.s]))

    mt_joint = (mt_joint.filter_cols(mt_joint['phe.is_case']
                                     | mt_joint['phe.is_control']))

    ## Run logistic regression stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []

    covs = ['sex', 'PC1', 'PC2', 'PC3', 'PC4', 'PC5']

    for proband in analysis:
        logger.info(f'Running burden test for {proband}...')

        mt_tmp = None  # placeholder; assigned below depending on the proband group

        if proband == 'all_cases':
            mt_tmp = mt_joint
        if proband == 'syndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_nonsyndromic'])
        if proband == 'nonsyndromic':
            mt_tmp = mt_joint.filter_cols(~mt_joint['phe.is_syndromic'])

        tb_logreg = logistic_regression(mt=mt_tmp,
                                        x_expr='mac',
                                        response='phe.is_case',
                                        covs=covs,
                                        pass_through=['agg_genotype'],
                                        extra_fields={
                                            'analysis': proband,
                                            'maf': maf_cutoff,
                                            'covs': '|'.join(covs)
                                        })

        tbs.append(tb_logreg)

    tb_final = hl.Table.union(*tbs)

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.logreg_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
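# logistic_regression() above is a project wrapper; the underlying Hail primitive is
# hl.logistic_regression_rows. A minimal sketch of a Wald burden regression on the
# mt_joint matrix built above (field names assumed; the intercept must be included
# explicitly in the covariates):
ht_logreg = hl.logistic_regression_rows(
    test='wald',
    y=mt_joint['phe.is_case'],
    x=mt_joint.mac,
    covariates=[1.0, mt_joint.sex, mt_joint.PC1, mt_joint.PC2, mt_joint.PC3],
    pass_through=['agg_genotype'])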