Exemple #1
0
def main(args):
    """
    Compute sample QC metrics and stratified metric filters for the exome cohort.

    The sample QC table is checkpointed, annotated with population and
    platform assignments, and then handed to
    ``compute_stratified_metrics_filter`` with those two annotations as
    strata. Both resulting tables can optionally be exported as TSV files.

    :param args: parsed arguments; ``default_ref_genome``, ``exome_cohort``,
        ``overwrite`` and ``write_to_file`` are read
    :return: None
    """
    hl.init(default_reference=args.default_ref_genome)

    cohort = args.exome_cohort
    # `_read_if_exists` is switched on whenever overwriting is switched off
    reuse_existing = not args.overwrite

    # Sample QC computed from the unfiltered MT
    unfiltered_mt = get_mt_data(dataset=cohort, part='unfiltered')
    qc_ht = compute_sample_qc(unfiltered_mt)

    # Persist the QC metrics table
    qc_ht_path = get_sample_qc_ht_path(dataset=cohort,
                                       part='high_conf_autosomes')
    qc_ht = qc_ht.checkpoint(qc_ht_path,
                             overwrite=args.overwrite,
                             _read_if_exists=reuse_existing)

    # Attach population / platform assignments, looked up by sample id
    pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))
    qc_ht = qc_ht.annotate(
        qc_pop=pop_qc[qc_ht.s].predicted_pop,
        qc_platform=platform_qc[qc_ht.s].qc_platform,
    )

    # Optional flat-file export of the annotated QC table
    if args.write_to_file:
        qc_ht.flatten().export(f"{qc_ht_path}.tsv.bgz")

    # Metrics on which the stratified filters are computed
    exome_qc_metrics = [
        'n_snp',
        'r_ti_tv',
        'r_insertion_deletion',
        'n_insertion',
        'n_deletion',
        'r_het_hom_var',
    ]
    strata = ['qc_pop', 'qc_platform']

    print('Computing stratified metrics filters...')
    filter_ht = compute_stratified_metrics_filter(qc_ht, exome_qc_metrics,
                                                  strata)

    filter_ht_path = get_sample_qc_ht_path(dataset=cohort,
                                           part='stratified_metrics_filter')
    filter_ht = filter_ht.checkpoint(filter_ht_path,
                                     overwrite=args.overwrite,
                                     _read_if_exists=reuse_existing)

    # Optional flat-file export of the filter table
    if args.write_to_file:
        filter_ht.export(f"{filter_ht_path}.tsv.bgz")

    hl.stop()

    print("Finished!")
Exemple #2
0
def apply_sample_qc_filtering(mt: hl.MatrixTable,
                              keep_rare_variants: bool = True,
                              maf_threshold: float = 0.01) -> hl.MatrixTable:
    """
    Keep only samples passing QC and annotate cohort-internal allele statistics.

    Columns are retained when their key is found in the final sample QC table
    among rows with ``pass_filters`` set. Call statistics are then aggregated
    over the remaining samples, and index 1 of the resulting ``AF`` / ``AC``
    arrays is stored as ``internal_af`` / ``internal_ac``.

    :param mt: hl.MatrixTable providing a ``GT`` entry field and ``alleles`` row field
    :param keep_rare_variants: if True, also filter rows with ``af_filter_expr`` on ``internal_af``
    :param maf_threshold: allelic frequency cutoff handed to ``af_filter_expr``
    :return: hl.MatrixTable
    """
    # final sample QC table, restricted to samples flagged as passing
    qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
    passing_ht = qc_ht.filter(qc_ht.pass_filters)
    mt = mt.filter_cols(hl.is_defined(passing_ht[mt.col_key]))

    # internal call statistics, aggregated after the sample filter is applied
    call_stats_expr = hl.agg.call_stats(mt.GT, mt.alleles)
    mt = mt.annotate_rows(gt_stats=call_stats_expr)
    mt = mt.annotate_rows(internal_af=mt.gt_stats.AF[1],
                          internal_ac=mt.gt_stats.AC[1])

    if not keep_rare_variants:
        return mt

    # row filter based on the internal allele frequency
    return mt.filter_rows(af_filter_expr(mt, 'internal_af', maf_threshold))
def main(args):
    """
    Compute sample QC metrics and store them under a fixed path in ``nfs_dir``.

    The QC table is checkpointed, annotated with population and platform
    assignments and, when requested, exported as a flattened TSV file next
    to the checkpoint.

    :param args: parsed arguments; ``default_ref_genome``, ``exome_cohort``,
        ``overwrite`` and ``write_to_file`` are read
    :return: None
    """
    hl.init(default_reference=args.default_ref_genome)

    # Sample QC computed from the unfiltered MT
    unfiltered_mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')
    qc_ht = compute_sample_qc(unfiltered_mt)

    # Destination of the QC table (fixed location below nfs_dir)
    output_path = f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_qc.high_conf.autosomes.cds.capture_intervals.rare_common.ht'

    # `_read_if_exists` is switched on whenever overwriting is switched off
    reuse_existing = not args.overwrite
    qc_ht = qc_ht.checkpoint(output_path,
                             overwrite=args.overwrite,
                             _read_if_exists=reuse_existing)

    # Attach population / platform assignments, looked up by sample id
    pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))
    qc_ht = qc_ht.annotate(
        qc_pop=pop_qc[qc_ht.s].predicted_pop,
        qc_platform=platform_qc[qc_ht.s].qc_platform,
    )

    # Optional flat-file export
    if args.write_to_file:
        flat_ht = qc_ht.flatten()
        flat_ht.export(f"{output_path}.tsv.bgz")

    hl.stop()

    print("Finished!")
def main(args):
    """
    Compute mean coverage on the unfiltered, bi-allelic MT and store the result.

    The coverage table returned by ``compute_mean_coverage`` is checkpointed to
    the ``sex_chrom_coverage`` sample QC path and optionally exported as TSV.

    :param args: parsed arguments; ``default_reference``, ``exome_cohort``,
        ``normalization_contig``, ``interval_to_include``,
        ``interval_to_exclude``, ``chr_x``, ``chr_y``, ``overwrite`` and
        ``write_to_file`` are read
    :return: None
    """
    hl.init(default_reference=args.default_reference)

    logger.info("Importing data...")

    # import unfiltered MT
    mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # keep bi-allelic variants
    mt = mt.filter_rows(bi_allelic_expr(mt), keep=True)

    # read intervals for filtering variants (used mainly for exomes)
    def _get_interval_table(interval: Union[None, str]) -> Union[None, hl.Table]:
        """Return the capture interval table for a name, or None when no name is given."""
        if interval is None:
            return None
        return get_capture_interval_ht(name=interval,
                                       reference=args.default_reference)

    ht = compute_mean_coverage(
        mt=mt,
        normalization_contig=args.normalization_contig,
        included_calling_intervals=_get_interval_table(args.interval_to_include),
        excluded_calling_intervals=_get_interval_table(args.interval_to_exclude),
        chr_x=args.chr_x,
        chr_y=args.chr_y)

    logger.info("Exporting data...")

    # Checkpoint rather than write: the table is read back from disk, so the
    # optional export below does not have to re-run the coverage computation.
    output_ht_path = get_sample_qc_ht_path(part='sex_chrom_coverage')
    ht = ht.checkpoint(output_ht_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        ht.export(f'{output_ht_path}.tsv.bgz')

    hl.stop()

    print("Done!")
Exemple #5
0
def main(args):
    """
    Run PCA on the exome cohort joined with the 1000 Genomes MT.

    Unless ``args.skip_filter_step`` is set, the cohort MT and the MT returned
    by ``get_1kg_mt`` are joined (inner join on rows, ``GT`` entries only),
    filtered to bi-allelic, high call-rate, common SNPs, LD-pruned and written
    to disk. The pruned MT is then read back, PCA is run on it, and the
    per-sample scores are written as a Hail Table with one ``PC<k>`` field per
    component.

    :param args: parsed arguments; ``default_reference``, ``skip_filter_step``,
        ``exome_cohort``, ``ld_prune_r2``, ``n_pcs``, ``overwrite`` and
        ``write_to_file`` are read
    :return: None
    """
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # 1000 Genomes MT as returned by the project helper
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner')

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        # alt allele frequency > 0.001 and call rate > 0.99
        keep_variant_expr = (
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99))
        mt_joint = mt_joint.filter_rows(keep_variant_expr).naive_coalesce(1000)

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        # NOTE: this intermediate checkpoint is always overwritten,
        # independently of args.overwrite.
        mt_joint = mt_joint.checkpoint(
            get_mt_checkpoint_path(
                dataset=args.exome_cohort,
                part='joint_1kg_high_callrate_common_snp_biallelic'),
            overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key]))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        mt_joint.write(
            get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                           part='joint_high_callrate_common_snp_biallelic',
                           split=True,
                           ld_pruned=True),
            overwrite=args.overwrite)

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields (PC1 ... PC<n_pcs>).
    pc_fields = {
        'PC' + str(k + 1): pc_scores.scores[k]
        for k in range(args.n_pcs)
    }
    pca_table = pc_scores.annotate(**pc_fields).drop('scores')

    logger.info("Writing HT with PCA results...")
    # Honour args.overwrite here as well: previously this final write had no
    # overwrite option, so a rerun over an existing output failed only after
    # the whole pipeline above had already been executed.
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table = pca_table.checkpoint(output_ht_path,
                                     overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Done!")
Exemple #6
0
def main(args):
    """
    Run PCA on the QC-passing samples of the exome cohort.

    Unless ``args.skip_filter_step`` is set, the unphased/adj-genotype MT is
    restricted to samples flagged ``pass_filters`` in the final sample QC
    table, filtered to bi-allelic, high call-rate SNPs above the MAF
    threshold, LD-pruned and written to disk. The pruned MT is then read back,
    PCA is run on it, and the scores are checkpointed to ``args.output_ht``
    with the eigenvalues stored as a global field.

    :param args: parsed arguments; ``default_reference``, ``skip_filter_step``,
        ``exome_cohort``, ``maf_threshold``, ``ld_prune_r2``, ``n_pcs``,
        ``output_ht``, ``overwrite`` and ``write_to_file`` are read
    :return: None
    """
    hl.init(default_reference=args.default_reference)

    cohort = args.exome_cohort

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # MT with unphased, adj-filtered genotypes
        input_mt_path = get_qc_mt_path(dataset=cohort,
                                       part='unphase_adj_genotypes',
                                       split=True)
        mt = hl.read_matrix_table(input_mt_path)

        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        passing_ht = qc_ht.filter(qc_ht.pass_filters)
        mt = mt.filter_cols(hl.is_defined(passing_ht[mt.col_key]))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        # alt allele frequency above the threshold and call rate > 0.99
        is_biallelic_snp = (bi_allelic_expr(mt)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1]))
        is_common = hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf
        is_well_called = hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99
        mt = mt.filter_rows(is_biallelic_snp & is_common & is_well_called)
        mt = mt.naive_coalesce(500)

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        checkpoint_path = get_mt_checkpoint_path(
            dataset=cohort, part='high_callrate_common_snp_biallelic')
        mt = mt.checkpoint(checkpoint_path, overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # keep only the variants retained by LD pruning
        pruned_ht = hl.ld_prune(mt.GT,
                                r2=args.ld_prune_r2,
                                bp_window_size=500000,
                                memory_per_core=512)
        mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))

        logger.info("Writing filtered MT with ld-pruned variants...")
        pruned_mt_path = get_qc_mt_path(
            dataset=cohort,
            part='high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True)
        mt.write(pruned_mt_path, overwrite=args.overwrite)

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # PCA on the LD-pruned MT; the third return value is not used
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Keep the eigenvalues with the table as a global field
    pc_scores = pc_scores.annotate_globals(eigenvalues=eigenvalues)

    # One field per component (PC1 ... PC<n_pcs>) instead of the scores array
    pc_fields = {
        f'PC{rank}': pc_scores.scores[rank - 1]
        for rank in range(1, args.n_pcs + 1)
    }
    pca_table = pc_scores.annotate(**pc_fields)
    pca_table = pca_table.drop('scores')

    logger.info("Writing HT with PCA results...")
    output_ht_path = args.output_ht
    pca_table = pca_table.checkpoint(output=output_ht_path,
                                     overwrite=args.overwrite)

    if args.write_to_file:
        pca_table.export(f'{output_ht_path}.tsv.bgz')

    hl.stop()

    print("PCA pipeline finalised...")
def main(args):
    """
    Build the final sample QC table and release the unphased, adj-filtered MT.

    Each sample of the raw split MT is annotated with its hard filters,
    predicted population, relatedness flag and population/platform stratified
    filters. ``pass_filters`` is derived from the condition: no hard filters,
    no stratified filters, predicted population ``'EUR'`` and not related.
    Afterwards the MT is unphased, annotated with ``adj``, restricted to adj
    entries and written to disk.

    :param args: parsed arguments; ``default_ref_genome``, ``exome_cohort``,
        ``overwrite`` and ``write_to_file`` are read
    :return: None
    """
    hl.init(default_reference=args.default_ref_genome)

    cohort = args.exome_cohort

    # Raw split MT with all column fields dropped except the key
    mt = get_mt_data(dataset=cohort, part='raw', split=True)
    mt = mt.select_cols()

    # One row per sample, keyed by sample id
    ht = mt.cols().key_by('s')

    # 1. Sample hard filters
    hard_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort, part='hard_filters'))

    # 2. Population assignment
    pop_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort, part='population_qc'))

    # 3. Related samples to drop, collected locally as a set of node ids
    related_ht = get_related_samples_to_drop()
    related_ids = related_ht.aggregate(
        hl.agg.collect_as_set(related_ht.node.id))
    related_samples = hl.set(related_ids)

    # 4. Stratified (population/platform) sample QC filters
    strat_filters_ht = hl.read_table(
        get_sample_qc_ht_path(dataset=cohort,
                              part='stratified_metrics_filter'))

    ht = ht.annotate(
        hard_filters=hard_filters_ht[ht.s]['hard_filters'],
        predicted_pop=pop_ht[ht.s]['predicted_pop'],
        is_related=related_samples.contains(ht.s),
        pop_platform_filters=strat_filters_ht[ht.s]['pop_platform_filters'],
    )

    # Joint pass/fail flag over all filter annotations
    no_hard_filters = hl.len(ht.hard_filters) == 0
    no_strat_filters = hl.len(ht.pop_platform_filters) == 0
    is_eur = ht.predicted_pop == 'EUR'
    passes_all = no_hard_filters & no_strat_filters & is_eur & ~ht.is_related
    ht = ht.annotate(pass_filters=hl.cond(passes_all, True, False))

    logger.info('Writing final sample qc HT to disk...')
    output_path_ht = get_sample_qc_ht_path(dataset=cohort, part='final_qc')
    ht = ht.checkpoint(output_path_ht, overwrite=args.overwrite)

    # Optional flat-file export of the final sample QC annotations
    if args.write_to_file:
        ht.export(f'{output_path_ht}.tsv.bgz')

    # Release MT: unphased genotypes, restricted to entries flagged adj
    mt = unphase_mt(mt)
    mt = annotate_adj(mt)
    mt = mt.filter_entries(mt.adj)
    mt = mt.select_entries('GT', 'DP', 'GQ', 'adj')

    logger.info('Writing unphase MT with adjusted genotypes to disk...')
    release_mt_path = get_qc_mt_path(dataset=cohort,
                                     part='unphase_adj_genotypes',
                                     split=True)
    mt.write(release_mt_path, overwrite=args.overwrite)

    hl.stop()

    print("Finished!")