Code Example #1
File: variant_qc.py Project: enriquea/wes_chd_ukbb
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Read Hail MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # compute variant qc
    mt = hl.variant_qc(mt)

    # write variant qc hailtable
    tb_variant_qc = (mt
                     .select_rows('variant_qc')
                     .rows()
                     .flatten()
                     .key_by('locus', 'alleles')
                     )
    output_path_ht = f'{args.ht_output_path}_variant_qc.ht'
    tb_variant_qc.write(output=output_path_ht)

    if args.write_to_file:
        (hl.read_table(output_path_ht)
         .export(f'{output_path_ht}_variant_qc.tsv.bgz')
         )

    # Stop Hail
    hl.stop()

    print("Finished!")
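For context, a minimal command-line wrapper that could drive a main(args) entry point like the one above. The flag names simply mirror the attributes the function reads (mt_input_path, ht_output_path, default_ref_genome, write_to_file); they are illustrative assumptions, not taken from the original project.

import argparse


def parse_args():
    # Hypothetical CLI: flag names mirror the attributes used by main(args) above.
    parser = argparse.ArgumentParser(description='Run Hail variant QC and export results.')
    parser.add_argument('--mt_input_path', required=True, help='input MatrixTable path')
    parser.add_argument('--ht_output_path', required=True, help='output HailTable path prefix')
    parser.add_argument('--default_ref_genome', default='GRCh38', help='default reference genome')
    parser.add_argument('--write_to_file', action='store_true', help='also export a bgzipped TSV')
    return parser.parse_args()


if __name__ == '__main__':
    main(parse_args())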
Code Example #2
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import unfiltered split MT
    mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # Compute stratified sample_qc (biallelic and multi-allelic sites)
    sample_qc_ht = compute_sample_qc(mt)

    # Write HT with sample QC metrics
    sample_qc_ht = sample_qc_ht.checkpoint(get_sample_qc_ht_path(
        dataset=args.exome_cohort, part='high_conf_autosomes'),
                                           overwrite=args.overwrite,
                                           _read_if_exists=not args.overwrite)

    # annotate sample population and platform qc info
    pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))

    ann_expr = {
        'qc_pop': pop_qc[sample_qc_ht.s].predicted_pop,
        'qc_platform': platform_qc[sample_qc_ht.s].qc_platform
    }

    sample_qc_ht = sample_qc_ht.annotate(**ann_expr)

    # Export HT to file
    if args.write_to_file:
        (sample_qc_ht.flatten().export(
            f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='high_conf_autosomes')}.tsv.bgz"
        ))

    # Apply stratified sample filters based on defined QC metrics
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]

    print('Computing stratified metrics filters...')
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        sample_qc_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])

    exome_pop_platform_filter_ht = exome_pop_platform_filter_ht.checkpoint(
        get_sample_qc_ht_path(dataset=args.exome_cohort,
                              part='stratified_metrics_filter'),
        overwrite=args.overwrite,
        _read_if_exists=not args.overwrite)

    # Export HT to file
    if args.write_to_file:
        (exome_pop_platform_filter_ht.export(
            f"{get_sample_qc_ht_path(dataset=args.exome_cohort, part='stratified_metrics_filter')}.tsv.bgz"
        ))

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #3
def download_data():
    global _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR',
                               '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), [
        'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht',
        'table_10M_par_100.ht', 'table_10M_par_10.ht',
        'gnomad_dp_simulation.mt', 'many_strings_table.ht'
    ])
    if not all(os.path.exists(file) for file in files):
        hl.init()  # use all cores

        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
            vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf, min_partitions=16).write(os.path.join(
            _data_dir, 'profile.mt'),
                                                    overwrite=True)

        ht = hl.utils.range_table(
            10_000_000,
            1000).annotate(**{f'f_{i}': hl.rand_unif(0, 1)
                              for i in range(5)})
        ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'),
                           overwrite=True)
        ht = ht.naive_coalesce(100).checkpoint(os.path.join(
            _data_dir, 'table_10M_par_100.ht'),
                                               overwrite=True)
        ht.naive_coalesce(10).write(os.path.join(_data_dir,
                                                 'table_10M_par_10.ht'),
                                    overwrite=True)

        mt = hl.utils.range_matrix_table(n_rows=250_000,
                                         n_cols=1_000,
                                         n_partitions=32)
        mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3))
        mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'),
                 overwrite=True)

        print('downloading many strings table...')
        mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
        mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz',
            mst_tsv)
        print('importing...')
        hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
        hl.stop()
    else:
        print('all files found.', flush=True)
Code Example #4
def main(args):

    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # input MT
    mt = hl.read_matrix_table(args.mt_input_path)

    # filter high-quality genotype
    # mt = filter_genotypes_ab(mt)

    # import capture interval table (intersect)
    intervals = hl.read_table(args.ht_intervals)

    # generate an interval x sample MT by computing per-interval call rate
    mt_callrate = compute_callrate_mt(mt=mt, intervals_ht=intervals)

    # run pca
    eigenvalues, ht_pca, _ = run_platform_pca(
        callrate_mt=mt_callrate,
        binarization_threshold=args.binarization_threshold)

    # normalize eigenvalues (0-100)
    eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues]

    # compute eigenvalues cumulative sum
    ev_cumsum = hl.array_scan(lambda i, j: i + j, 0,
                              hl.array(eigenvalues_norm))

    # getting optimal number of PCs (those which explain 99% of the variance)
    n_optimal_pcs = hl.eval(hl.len(ev_cumsum.filter(lambda x: x < 99.0)))

    logger.info(
        f"Keep only principal components which explain up to 99% of the variance. Number of optimal PCs found: {n_optimal_pcs}"
    )

    # filter out uninformative PCs
    ht_pca = ht_pca.annotate(scores=ht_pca.scores[:n_optimal_pcs])

    # apply unsupervised clustering on PCs to infer samples platform
    ht_platform = assign_platform_from_pcs(
        platform_pca_scores_ht=ht_pca,
        pc_scores_ann='scores',
        hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
        hdbscan_min_samples=args.hdbscan_min_cluster_size)

    ht_platform.show()

    # write HT
    ht_platform.write(output=args.ht_output_path, overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht_platform.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
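As a sanity check on the optimal-PC selection above, the same cumulative-variance logic can be written in plain Python. The eigenvalues below are made-up illustration values, not results from any dataset.

import itertools

eigenvalues = [12.0, 8.0, 5.0, 1.0, 0.5, 0.25]  # illustrative values only
eigenvalues_norm = [x / sum(eigenvalues) * 100 for x in eigenvalues]

# Mirror hl.array_scan: a leading 0 followed by the running totals.
ev_cumsum = [0] + list(itertools.accumulate(eigenvalues_norm))

# Keep PCs while the cumulative variance explained is still below 99%.
n_optimal_pcs = sum(1 for x in ev_cumsum if x < 99.0)
print(n_optimal_pcs)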
Code Example #5
File: test_context.py Project: chrisvittal/hail
    def test_init_hail_context_twice(self):
        hl.init(idempotent=True)  # Should be no error
        hl.stop()

        hl.init(idempotent=True)
        hl.experimental.define_function(lambda x: x + 2, hl.tint32)
        # ensure functions are cleaned up without error
        hl.stop()

        hl.init(idempotent=True)  # Should be no error
        hl.init(hl.spark_context(), idempotent=True)  # Should be no error
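The test above leans on hl.init(idempotent=True) and hl.stop() being safe to call repeatedly. A minimal sketch of a lifecycle helper built on that pattern (the helper itself is an assumption, not part of the Hail API):

import hail as hl


def run_with_hail(job, **init_kwargs):
    # Start a (possibly already running) Hail session, run the job, and always
    # stop the session afterwards, even if the job raises.
    hl.init(idempotent=True, **init_kwargs)
    try:
        return job()
    finally:
        hl.stop()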
Code Example #6
File: utils.py Project: saponas/hail
def ensure_resources(data_dir, resources):
    logging.info(f'using benchmark data directory {data_dir}')
    os.makedirs(data_dir, exist_ok=True)
    to_create = []
    for rg in resources:
        if not rg.exists(data_dir):
            to_create.append(rg)
    if to_create:
        hl.init()
        for rg in to_create:
            rg.create(data_dir)
        hl.stop()
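ensure_resources only relies on each resource group exposing exists(data_dir) and create(data_dir) (plus name() in the variant shown in Code Example #9). A hypothetical resource class satisfying that protocol, sketched here for illustration only:

import os

import hail as hl


class RangeTableResource:
    # Hypothetical resource group implementing the exists()/create()/name()
    # protocol that ensure_resources(data_dir, resources) expects.
    def __init__(self, name, n_rows=1_000_000):
        self._name = name
        self._n_rows = n_rows

    def name(self):
        return self._name

    def path(self, data_dir):
        return os.path.join(data_dir, f'{self._name}.ht')

    def exists(self, data_dir):
        return os.path.exists(self.path(data_dir))

    def create(self, data_dir):
        hl.utils.range_table(self._n_rows).write(self.path(data_dir), overwrite=True)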
Code Example #7
File: vcfs2mt.py Project: enriquea/wes_hail
def main(args):

    # Start Hail on local mode
    hl.init()

    # getting list of VCF files from given path
    vcf_files_list = get_files_names(args.vcf_path, ext='vcf.gz')

    # import VCF(s) as Hail MatrixTable
    mt = hl.import_vcf(vcf_files_list, force_bgz=args.force_bgz)

    # write MatrixTable
    mt.write(output=args.output_path, overwrite=args.overwrite)

    # Stop Hail
    hl.stop()
Code Example #8
def main():
    p = argparse.ArgumentParser()
    p.add_argument('input_dataset', help='input VCF file')
    p.add_argument(
        '--matrixtable-file',
        help=
        'file name (includes path) of the MatrixTable for data imported from VCF input'
    )
    p.add_argument(
        '--overwrite-matrixtable',
        action='store_true',
        help='always import vcf data ignoring any existing matrixtable file')
    p.add_argument('--skip-sample-subset', action='store_true')
    p.add_argument('--ignore-missing-samples', action='store_true')
    p.add_argument('--project-guid',
                   required=True,
                   help='the guid of the target seqr project')
    p.add_argument('--gencode-release', type=int, default=29)
    p.add_argument('--gencode-path', help='path for downloaded Gencode data')
    p.add_argument('--es-host', default='localhost')
    p.add_argument('--es-port', default='9200')
    p.add_argument('--num-shards', type=int, default=1)
    p.add_argument('--block-size', type=int, default=2000)

    args = p.parse_args()

    start_time = time.time()

    hl.init()

    mt = load_mt(args.input_dataset, args.matrixtable_file,
                 args.overwrite_matrixtable)

    mt = subset_mt(args.project_guid,
                   mt,
                   skip_sample_subset=args.skip_sample_subset,
                   ignore_missing_samples=args.ignore_missing_samples)

    rows = annotate_fields(mt, args.gencode_release, args.gencode_path)

    export_to_es(rows, args.input_dataset, args.project_guid, args.es_host,
                 args.es_port, args.block_size, args.num_shards)
    logger.info(
        'Total time for subsetting, annotating, and exporting: {}'.format(
            time.time() - start_time))

    hl.stop()
Code Example #9
File: utils.py Project: 3vivekb/hail
def download_data(data_dir, group=None):
    logging.info(f'using benchmark data directory {data_dir}')
    os.makedirs(data_dir, exist_ok=True)
    if group:
        resources = [r for r in all_resources if r.name() == group]
        if not resources:
            raise RuntimeError(f"no group {group!r}")
    else:
        resources = all_resources
    to_create = []
    for rg in resources:
        if not rg.exists(data_dir):
            to_create.append(rg)
    if to_create:
        hl.init()
        for rg in to_create:
            rg.create(data_dir)
        hl.stop()
Code Example #10
def main(args):

    # nfs_dir = 'file:///home/ubuntu/data'

    hl.init(default_reference=args.default_reference)

    logger.info("Importing data...")

    # import unfiltered MT
    mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # keep bi-allelic variants
    mt = (mt
          .filter_rows(bi_allelic_expr(mt), keep=True)
          )

    # read intervals for filtering variants (used mainly for exomes)
    def _get_interval_table(interval: str) -> Union[None, hl.Table]:
        return get_capture_interval_ht(name=interval,
                                       reference=args.default_reference) if interval is not None else interval

    ht = compute_mean_coverage(mt=mt,
                               normalization_contig=args.normalization_contig,
                               included_calling_intervals=_get_interval_table(args.interval_to_include),
                               excluded_calling_intervals=_get_interval_table(args.interval_to_exclude),
                               chr_x=args.chr_x,
                               chr_y=args.chr_y)

    logger.info("Exporting data...")

    # write HT
    output_ht_path = get_sample_qc_ht_path(part='sex_chrom_coverage')
    ht.write(output=output_ht_path,
             overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht
         .export(f'{output_ht_path}.tsv.bgz')
         )

    hl.stop()

    print("Done!")
Code Example #11
File: vcf2mt.py Project: enriquea/wes_chd_ukbb
def main(args):

    # Start Hail on local mode
    hl.init(default_reference='GRCh38')

    # getting list of VCF files from given path
    # vcf_files_list = get_files_names(args.vcf_path, ext='vcf.gz')

    # import VCF(s) as Hail MatrixTable
    mt = hl.import_vcf(path=args.vcf_path, force_bgz=args.force_bgz)

    if args.split_multi:
        mt = hl.split_multi_hts(mt)

    # write MatrixTable
    mt.write(output=args.output_path, overwrite=args.overwrite)

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #12
def main(args):

    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import unfiltered split MT
    mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

    # Compute stratified sample_qc (biallelic and multi-allelic sites)
    sample_qc_ht = compute_sample_qc(mt)

    # Write HT with sample QC metrics
    output_path = (
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.sample_qc.high_conf.autosomes.cds.capture_intervals.rare_common.ht'
    )

    sample_qc_ht = sample_qc_ht.checkpoint(output_path,
                                           overwrite=args.overwrite,
                                           _read_if_exists=not args.overwrite)

    # annotate sample population and platform qc info
    pop_qc = hl.read_table(get_sample_qc_ht_path(part='population_qc'))
    platform_qc = hl.read_table(get_sample_qc_ht_path(part='platform_pca'))

    ann_expr = {
        'qc_pop': pop_qc[sample_qc_ht.s].predicted_pop,
        'qc_platform': platform_qc[sample_qc_ht.s].qc_platform
    }

    sample_qc_ht = sample_qc_ht.annotate(**ann_expr)

    # Export HT to file
    if args.write_to_file:
        (sample_qc_ht.flatten().export(f"{output_path}.tsv.bgz"))

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #13
def main(args):
    # init hail
    hl.init(default_reference=args.default_ref_genome)

    # import MT
    mt = hl.read_matrix_table(args.mt_input_path)

    n_variants, n_samples = mt.count()

    # Getting variant table. Basically, a table keyed by <locus> or <locus, alleles>
    # with all variants in the dataset and no extra fields (a.k.a reference table).
    tb_variants = (mt.select_rows().rows())

    # compute overall coverage
    if args.compute_overall_coverage:
        logger.info(
            f"Computing coverage stats for {n_variants} variants over {n_samples} samples..."
        )
        ht_cov_overall = compute_coverage_stats(mt=mt,
                                                reference_ht=tb_variants)

        tb_variants = (tb_variants.annotate(
            overall=ht_cov_overall[tb_variants.key]))

    # compute coverage stratified by phenotype status (expected binary)
    # force the input MT to have a case_control bool field (is_case)
    if args.compute_phe_coverage:
        logger.info(
            f"Computing coverage stats stratified by phenotype status...")

        # Annotate sample meta info
        # Note: temporary solution; better to import an annotated MT
        mt = (mt.annotate_cols(**get_sample_meta_data()[mt.col_key]))

        mt = (mt.annotate_cols(
            case_control=hl.if_else(mt[args.phe_field], 'case', 'control')))

        strata = (mt.aggregate_cols(hl.agg.collect_as_set(mt['case_control'])))

        dict_strata_ht = {
            s:
            compute_coverage_stats(mt=mt.filter_cols(mt['case_control'] == s),
                                   reference_ht=tb_variants)
            for s in strata
        }

        for k in dict_strata_ht.keys():
            _tb = dict_strata_ht.get(k)
            tb_variants = tb_variants.annotate(**{k: _tb[tb_variants.key]})

        if args.run_binomial_test:
            logger.info(f"Running binomial test...")
            # perform a binomial test on coverage and case/control status
            # DOI: https://doi.org/10.1002/acn3.582
            tb_binomial = (tb_variants.annotate(
                n_cases_over_10=hl.int(tb_variants.case.over_10 * 100),
                n_controls_over_10=hl.int(tb_variants.control.over_10 * 100),
                total_cases=tb_variants.case.n_samples,
                total_controls=tb_variants.control.n_samples,
            ).select('n_cases_over_10', 'n_controls_over_10', 'total_cases',
                     'total_controls'))

            binomial_expr = {
                'p_value':
                hl.binom_test(
                    x=tb_binomial.n_cases_over_10,
                    n=tb_binomial.n_cases_over_10 +
                    tb_binomial.n_controls_over_10,
                    p=tb_binomial.total_cases /
                    (tb_binomial.total_cases + tb_binomial.total_controls),
                    alternative='two.sided')
            }

            tb_binomial = (tb_binomial.annotate(**binomial_expr))

            tb_variants = (tb_variants.annotate(
                binomial_stats=tb_binomial[tb_variants.key]))

    # make coverage filter expressions
    # Note: the default number of reads is set to 10X
    logger.info(f"Assigning per site coverage filters...")

    significant_level = args.pvalue_threshold
    min_sample_prop = args.min_sample_proportion

    coverage_filter_dict_expr = {}

    if args.compute_overall_coverage:
        coverage_filter_dict_expr.update({
            'overall_hard_cutoff':
            hl.if_else((tb_variants.overall.over_10 >= min_sample_prop),
                       "pass", "fail")
        })
    if args.compute_phe_coverage:
        # DOI: https://doi.org/10.1016/j.ajhg.2018.08.016
        coverage_filter_dict_expr.update({
            'phe_hard_cutoff':
            hl.if_else((tb_variants.case.over_10 >= min_sample_prop) &
                       (tb_variants.control.over_10 >= min_sample_prop),
                       "concordant", "discordant")
        })
    if args.run_binomial_test:
        coverage_filter_dict_expr.update({
            'phe_binomial':
            hl.if_else(tb_variants.binomial_stats.p_value < significant_level,
                       'dependent', 'independent')
        })

    # annotate coverage filters
    tb_variants = (tb_variants.annotate(coverage_filter=hl.struct(
        **coverage_filter_dict_expr)))

    # add useful global annotations to final coverage stats ht
    # as well as affected/non-affected summary counts per filters
    global_ann_dict_expr = {
        'date': current_date(),
        'mt_path': args.mt_input_path,
        'min_sample_prop': min_sample_prop
    }
    if args.compute_overall_coverage:
        global_ann_dict_expr.update({
            'overall_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(
                    tb_variants.coverage_filter.overall_hard_cutoff))
        })
    if args.compute_phe_coverage:
        global_ann_dict_expr.update({
            'phe_hard_cutoff':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_hard_cutoff))
        })
    if args.run_binomial_test:
        global_ann_dict_expr.update({
            'phe_binomial':
            tb_variants.aggregate(
                hl.agg.counter(tb_variants.coverage_filter.phe_binomial)),
            'binomial_pvalue_cutoff':
            significant_level if args.run_binomial_test else hl.float('')
        })

    tb_variants = (tb_variants.annotate_globals(**global_ann_dict_expr))

    # check
    tb_variants.globals.show()
    tb_variants.describe()

    # write HT
    tb_variants = tb_variants.checkpoint(output=args.ht_output_path,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (tb_variants.export(f'{args.ht_output_path}.tsv.bgz'))

    hl.stop()
Code Example #14
            (freq.freq[1].AF == 0),
            snp_cutoff=args.snp_cutoff,
            indel_cutoff=args.indel_cutoff,
            determine_cutoff_from_bin=False,
            aggregated_bin_ht=bin_ht,
            bin_id=bin_ht.bin,
            inbreeding_coeff_cutoff=INBREEDING_COEFF_HARD_CUTOFF,
        )
        # This column is added by the RF module based on a 0.5 threshold which doesn't correspond to what we use
        # ht = ht.drop(ht[PREDICTION_COL])
        ht.write(f'{tmp_dir}/rf_final.ht', overwrite=True)


if __name__ == "__main__":

    hl.stop()
    hl.init(default_reference="GRCh38")
    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment;
    # you may use your own here, from the .s3cfg file in your home directory

    n_partitions = 500

    parser = argparse.ArgumentParser()

    parser.add_argument(
        "--run_hash",
        help=
        "Run hash. Created by --train_rf and only needed for --apply_rf without running --train_rf",
        required=False,
    )
Code Example #15
def main(args):
    hl.init(default_reference='GRCh38')

    transcript_field = args.transcript_field

    # import variant HT
    ht_variants_path = args.ht_variant
    ht_variants = hl.read_table(ht_variants_path).select(transcript_field)

    # import dbNSFP HT
    ht_dbnsfp_path = args.ht_dbnsfp
    ht_dbnsfp = hl.read_table(ht_dbnsfp_path)

    # annotate scores from dbNSFP
    # prediction scores fields to annotate
    score_fields = [
        f for f in ht_dbnsfp.row if f.endswith('_score') or f == 'CADD_phred'
    ]

    ht_variants = (ht_variants.annotate(**ht_dbnsfp.select(
        *score_fields)[ht_variants.key]))

    #  Match score with specific transcript
    ht_variants = (ht_variants.annotate(
        **{
            f: ht_variants[f].get(ht_variants[transcript_field])
            for f in score_fields
        }))

    # Annotate extra info from dbNSFP
    # Note: Expected extra fields (as struct) from dbNSFP: ['gnomAD', 'ExAC', '1000Gp3', 'ESP6500', 'clinvar']

    ann_expr_dict = {}

    if args.add_clinvar:
        ann_expr_dict.update(
            {'clinvar': ht_dbnsfp[ht_variants.key]['clinvar']})

    if args.add_gnomad:
        ann_expr_dict.update({'gnomAD': ht_dbnsfp[ht_variants.key]['gnomAD']})

    if args.add_exac:
        ann_expr_dict.update({'ExAC': ht_dbnsfp[ht_variants.key]['ExAC']})

    if args.add_1000Gp3:
        ann_expr_dict.update(
            {'1000Gp3': ht_dbnsfp[ht_variants.key]['1000Gp3']})

    if args.add_ESP6500:
        ann_expr_dict.update(
            {'ESP6500': ht_dbnsfp[ht_variants.key]['ESP6500']})

    if len(ann_expr_dict) > 0:
        ht_variants = (ht_variants.annotate(**ann_expr_dict))

    # write annotated table
    # write HT
    ht_variants = ht_variants.checkpoint(output=args.ht_output,
                                         overwrite=args.overwrite)

    # export to file if true
    if args.write_to_file:
        (ht_variants.export(f'{args.ht_output}.tsv.bgz'))

    hl.stop()
Code Example #16
def main(args):

    # Init Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_compute_pc_relate:

        if not args.skip_filter_data:
            # Read MatrixTable
            mt = hl.read_matrix_table(args.mt_input_path)

            # filter variants (bi-allelic, high-callrate, common SNPs)
            logger.info(
                f"Filtering to bi-allelic, high-callrate, common SNPs ({args.maf_threshold}) for pc_relate..."
            )

            mt = (mt.filter_rows(
                (hl.len(mt.alleles) == 2)
                & hl.is_snp(mt.alleles[0], mt.alleles[1])
                & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > args.maf_threshold)
                & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)
                & ~mt.was_split).repartition(500, shuffle=False))

            # keep only GT entry field and force to evaluate expression
            (mt.select_entries(mt.GT).write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt',
                overwrite=args.overwrite))

        mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.filtered_high_confidence_variants.mt'
        )

        if not args.skip_prune_ld:
            # LD pruning
            # Avoid filtering entries (genotypes) / introducing missingness before running LD pruning
            # Zulip Hail support issue -> "BlockMatrix trouble when running pc_relate"
            # mt = mt.unfilter_entries()

            # Prune variants in linkage disequilibrium.
            # Return a table with nearly uncorrelated variants

            logger.info(
                f'Pruning variants in LD from MT with {mt.count_rows()} variants...'
            )

            pruned_variant_table = hl.ld_prune(mt.GT, r2=args.r2)

            # Keep LD-pruned variants
            pruned_mt = (mt.filter_rows(hl.is_defined(
                pruned_variant_table[mt.row_key]),
                                        keep=True))
            pruned_mt.write(
                f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt',
                overwrite=args.overwrite)

        pruned_mt = hl.read_matrix_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.ld_pruned.mt')
        v, s = pruned_mt.count()
        logger.info(f'{s} samples, {v} variants found in LD-pruned MT')

        pruned_mt = pruned_mt.select_entries(
            GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

        # run pc_relate method...compute all stats
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht',
            overwrite=args.overwrite)

        logger.info(f'Running PC-Relate...')
        scores = hl.read_table(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.pruned.pca_scores_for_pc_relate.ht'
        )
        relatedness_ht = hl.pc_relate(
            call_expr=pruned_mt.GT,
            min_individual_maf=args.min_individual_maf,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=args.min_kinship,
            statistics='all')

        logger.info(f'Writing relatedness table...')
        # Write/export table to file
        relatedness_ht.write(
            output=
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht',
            overwrite=args.overwrite)

        # Write PCs table to file (if specified)
        # if args.write_to_file:
        #    # Export table to file
        #    relatedness_ht.export(output=f'{args.ht_output_path}.tsv.bgz')

    # retrieve maximal independent set of related samples
    logger.info('Getting optimal set of related samples to prune...')

    relatedness_ht = hl.read_table(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.relatedness_kinship.ht')

    relatedness_ht = (relatedness_ht.flatten().rename({
        'i.s': 'i',
        'j.s': 'j'
    }).repartition(100))

    # import trios info
    fam = import_fam_ht()
    mat_ids = hl.set(fam.mat_id.collect())
    fat_ids = hl.set(fam.pat_id.collect())

    # rank samples by retention priority (e.g. cases over controls)
    tb_rank = make_sample_rank_table(get_sample_meta_data())

    # apply min kinship to consider related pairs
    relatedness_ht = (relatedness_ht.filter(relatedness_ht.kin > MIN_KINSHIP))

    # run maximal_independent_set stratified by groups
    # Note: This method fails when considering all pairs together (e.g. it removes most of the index cases in trios;
    # we want to keep them since they are mostly affected individuals rather than parents).

    # defining pairs group
    # TODO: check groups with updated fam file
    relatedness_ht = (relatedness_ht.annotate(pairs_group=hl.case().when(
        relatedness_ht.kin > 0.40, 'twins_or_dups').when(
            mat_ids.contains(relatedness_ht.i)
            | mat_ids.contains(relatedness_ht.j), 'pairs_child_mat').when(
                fat_ids.contains(relatedness_ht.i)
                | fat_ids.contains(relatedness_ht.j),
                'pairs_child_fat').default('pairs_others')))

    groups = (relatedness_ht.aggregate(
        hl.agg.collect_as_set(relatedness_ht['pairs_group'])))
    tbs = []
    for pair_group in groups:
        pair_ht = relatedness_ht.filter(
            relatedness_ht.pairs_group == pair_group)
        tb = get_related_samples_to_drop(rank_table=tb_rank,
                                         relatedness_ht=pair_ht)
        tbs.append(tb)

    related_samples_to_remove = hl.Table.union(*tbs)

    related_samples_to_remove.describe()

    related_samples_to_remove = related_samples_to_remove.checkpoint(
        f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.ht',
        overwrite=args.overwrite)

    if args.write_to_file:
        (related_samples_to_remove.flatten().export(
            f'{nfs_dir}/hail_data/sample_qc/chd_ukbb.related_samples_to_remove.tsv'
        ))

    hl.stop()
Code Example #17
File: utils.py Project: shulik7/hail
def stopTestHailContext():
    hail.stop()
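A plausible counterpart to the helper above, sketched under the assumption that tests run against a small local backend; the exact init arguments are illustrative, not taken from the project.

import hail


def startTestHailContext():
    # Assumed counterpart to stopTestHailContext(): start a quiet local session for tests.
    hail.init(master='local[2]', min_block_size=0, quiet=True)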
Code Example #18
File: test_context.py Project: jigold/hail
 def test_init_hail_context_twice(self):
     hl.init(idempotent=True)  # Should be no error
     hl.stop()
     hl.init(idempotent=True)  # Should be no error
     hl.init(hl.spark_context(), idempotent=True)  # Should be no error
Code Example #19
 def tearDown(self):
     hl.stop()
     os.remove(self.vcf_file)
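For context, a hypothetical setUp that a tearDown like the one above could pair with: it starts Hail and writes a small VCF to the path that tearDown later removes. The class name, dataset, and file path are all made up for the sketch.

import os
import tempfile
import unittest

import hail as hl


class ExportVcfTest(unittest.TestCase):
    def setUp(self):
        # Start Hail and create a tiny VCF for the test to consume.
        hl.init(quiet=True)
        self.vcf_file = os.path.join(tempfile.mkdtemp(), 'tiny.vcf.bgz')
        mt = hl.balding_nichols_model(n_populations=1, n_samples=3, n_variants=5)
        hl.export_vcf(mt, self.vcf_file)

    def tearDown(self):
        hl.stop()
        os.remove(self.vcf_file)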
Code Example #20
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = get_mt_data(dataset=args.exome_cohort, part='unfiltered')

        # Read MT from 1kgenome and keep only locus defined in interval
        mt_1kg = get_1kg_mt(args.default_reference)

        # Joining dataset (inner join). Keep only 'GT' entry field
        mt_joint = (mt.select_entries('GT').union_cols(
            mt_1kg.select_entries('GT'), row_join_type='inner'))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        mt_joint = (mt_joint.filter_rows(
            bi_allelic_expr(mt_joint)
            & hl.is_snp(mt_joint.alleles[0], mt_joint.alleles[1])
            & (hl.agg.mean(mt_joint.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(mt_joint.GT)) > 0.99)).
                    naive_coalesce(1000))

        logger.info(
            "Checkpoint: writing joint filtered MT before LD pruning...")
        mt_joint = mt_joint.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='joint_1kg_high_callrate_common_snp_biallelic'),
                                       overwrite=True)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt_joint.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt_joint.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt_joint = (mt_joint.filter_rows(
            hl.is_defined(pruned_variant_table[mt_joint.row_key])))

        logger.info("Writing filtered joint MT with variants in LD pruned...")
        (mt_joint.write(get_qc_mt_path(
            dataset=args.exome_cohort + '_1kg',
            part='joint_high_callrate_common_snp_biallelic',
            split=True,
            ld_pruned=True),
                        overwrite=args.overwrite))

    logger.info("Importing filtered joint MT...")
    mt_joint = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort + '_1kg',
                       part='joint_high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA with {mt_joint.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt_joint.GT,
                                                      k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")  # TODO: save eigenvalues?

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **
        {'PC' + str(k + 1): pc_scores.scores[k]
         for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info(f"Writing HT with PCA results...")
    # write as HT
    output_ht_path = get_sample_qc_ht_path(dataset=args.exome_cohort,
                                           part='joint_pca_1kg')
    pca_table.write(output=output_ht_path)

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("Done!")
Code Example #21
def main(args):
    # Start Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import adj genotype MT and remove samples failing QC
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='sample_qc_adj_genotypes',
                       split=True))

    # keep samples passing QC filtering
    mt = (mt.filter_cols(mt.pass_filters).select_cols().select_rows())

    # import variant info fields (vcf info)
    variant_info_ht = (get_vep_annotation_ht().drop('vep'))

    # Add useful annotation for variant hard filter
    ht = (
        mt.annotate_rows(
            inbreeding_coeff=variant_info_ht[mt.row_key].info.InbreedingCoeff,
            vqsr_filter=variant_info_ht[mt.row_key].filters,
            VQSLOD=variant_info_ht[mt.row_key].info.VQSLOD,
            gt_counts=hl.agg.count_where(hl.is_defined(
                mt.GT))  # expected MT filtered to high-quality GT
        ).rows())

    # 1. Apply variant hard filters
    # hard filter expression
    variant_hard_filter_expr = {
        'fail_inbreeding_coeff':
        ht.inbreeding_coeff < INBREEDING_COEFFICIENT_CUTOFF,
        'AC0': ht.gt_counts == 0
    }

    ht = (ht.annotate(**variant_hard_filter_expr))

    # 2. Apply VQSR filter
    ht = (ht.annotate(fail_vqsr=hl.len(ht.vqsr_filter) != 0))

    # 3. Apply RF filter

    # import/parse rf final HT
    ht_rf = hl.read_table(get_variant_qc_ht_path(part='rf_result'))

    ht_rf = (ht_rf.select(rf_probability_tp=ht_rf.rf_probability['TP'],
                          variant_type=ht_rf.variant_type))

    ht = (ht.annotate(**ht_rf[ht.key]))

    ht = (ht.annotate(fail_rf=hl.case().when(
        (ht.rf_probability_tp < RF_PROBABILITY_SNV_CUTOFF)
        & (ht.variant_type == 'snv'), True).when(
            (ht.rf_probability_tp < RF_PROBABILITY_INDEL_CUTOFF)
            & (ht.variant_type == 'indel'), True).default(False)))

    # 5. Apply coverage/capture interval filters

    ## gnomad genome coverage
    gnomad_coverage_ht = get_gnomad_genomes_coverage_ht().key_by()
    gnomad_coverage_ht = (gnomad_coverage_ht.annotate(locus=hl.parse_locus(
        gnomad_coverage_ht.locus, reference_genome='GRCh38')).key_by('locus'))
    ht = (ht.annotate(gnomad_cov_10X=gnomad_coverage_ht[ht.locus].over_10))
    ht = (ht.annotate(is_coveraged_gnomad_genomes=ht.gnomad_cov_10X >= 0.9))

    ## defined in capture intervals

    # filter to capture intervals (intersect)
    ht_defined_intervals = filter_capture_intervals(ht)
    ht = (ht.annotate(is_defined_capture_intervals=hl.is_defined(
        ht_defined_intervals[ht.key])))

    # 6. Summary final variant QC

    # final variant qc filter joint expression
    final_variant_qc_ann_expr = {
        'pass_variant_qc_filters':
        hl.cond(
            ~ht.fail_inbreeding_coeff & ~ht.AC0 & ~ht.fail_vqsr & ~ht.fail_rf
            & ht.is_coveraged_gnomad_genomes & ht.is_defined_capture_intervals,
            True, False)
    }
    ht = (ht.annotate(**final_variant_qc_ann_expr))

    # Count the number of variants (SNVs and indels) affected by each filter and add the counts as a global field
    filter_flags = [
        'fail_inbreeding_coeff', 'AC0', 'fail_vqsr', 'fail_rf',
        'is_coveraged_gnomad_genomes', 'is_defined_capture_intervals',
        'pass_variant_qc_filters'
    ]

    summary_filter_expr = {
        v: hl.struct(
            **{
                f: hl.agg.filter(ht.variant_type == v, hl.agg.counter(ht[f]))
                for f in filter_flags
            })
        for v in ['snv', 'indel']
    }

    ht = ht.annotate_globals(
        summary_filter=ht.aggregate(summary_filter_expr, _localize=False))

    # write HT variant QC final table
    output_path = get_variant_qc_ht_path(dataset=args.exome_cohort,
                                         part='final_qc')
    ht = ht.checkpoint(output_path, overwrite=args.overwrite)

    # print filter summary
    logger.info(f'Variant QC filter summary: {ht.summary_filter.collect()}')

    # export HT to file
    if args.write_to_file:
        ht.export(f'{output_path}.tsv.bgz')

    # Stop Hail
    hl.stop()

    print("Finished!")
Code Example #22
File: test_context.py Project: tuyanglin/hail
 def test_init_hail_context_twice(self):
     hl.init(idempotent=True)  # Should be no error
     hl.stop()
     hl.init(idempotent=True)  # Should be no error
     hl.init(hl.spark_context(), idempotent=True)  # Should be no error
Code Example #23
File: pca.py Project: enriquea/wes_chd_ukbb
def main(args):

    # Start Hail
    hl.init(default_reference=args.default_reference)

    if not args.skip_filter_step:
        logger.info("Importing data...")

        # import unfiltered MT
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=args.exome_cohort,
                           part='unphase_adj_genotypes',
                           split=True))

        # filter to samples passing QC filters
        logger.info(
            "Filtering MT to samples passing QC filters (hard filters, relatedness, european ancestries)..."
        )
        sample_qc_ht = hl.read_table(get_sample_qc_ht_path(part='final_qc'))
        sample_qc_ht = (sample_qc_ht.filter(sample_qc_ht.pass_filters))
        mt = (mt.filter_cols(hl.is_defined(sample_qc_ht[mt.col_key])))

        logger.info(
            "Filtering joint MT to bi-allelic, high-callrate, common SNPs...")
        maf = args.maf_threshold
        mt = (mt.filter_rows(
            bi_allelic_expr(mt) & hl.is_snp(mt.alleles[0], mt.alleles[1])
            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > maf)
            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99)).naive_coalesce(
                500))

        logger.info("Checkpoint: writing filtered MT before LD pruning...")
        mt = mt.checkpoint(get_mt_checkpoint_path(
            dataset=args.exome_cohort,
            part='high_callrate_common_snp_biallelic'),
                           overwrite=args.overwrite)

        logger.info(
            f"Running ld_prune with r2 = {args.ld_prune_r2} on MT with {mt.count_rows()} variants..."
        )
        # remove correlated variants
        pruned_variant_table = hl.ld_prune(mt.GT,
                                           r2=args.ld_prune_r2,
                                           bp_window_size=500000,
                                           memory_per_core=512)
        mt = (mt.filter_rows(hl.is_defined(pruned_variant_table[mt.row_key])))

        logger.info("Writing filtered MT with ld-pruned variants...")
        (mt.write(get_qc_mt_path(dataset=args.exome_cohort,
                                 part='high_callrate_common_snp_biallelic',
                                 split=True,
                                 ld_pruned=True),
                  overwrite=args.overwrite))

    logger.info("Importing filtered ld-pruned MT...")
    mt = hl.read_matrix_table(
        get_qc_mt_path(dataset=args.exome_cohort,
                       part='high_callrate_common_snp_biallelic',
                       split=True,
                       ld_pruned=True))

    logger.info(f"Running PCA on {mt.count_rows()} variants...")
    # run pca on merged dataset
    eigenvalues, pc_scores, _ = hl.hwe_normalized_pca(mt.GT, k=args.n_pcs)

    logger.info(f"Eigenvalues: {eigenvalues}")

    # Annotate eigenvalues as global field
    pc_scores = (pc_scores.annotate_globals(**{'eigenvalues': eigenvalues}))

    # Annotate PC array as independent fields.
    pca_table = (pc_scores.annotate(
        **
        {'PC' + str(k + 1): pc_scores.scores[k]
         for k in range(0, args.n_pcs)}).drop('scores'))

    logger.info(f"Writing HT with PCA results...")
    # write as HT
    output_ht_path = args.output_ht
    pca_table = (pca_table.checkpoint(output=output_ht_path,
                                      overwrite=args.overwrite))

    if args.write_to_file:
        (pca_table.export(f'{output_ht_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()

    print("PCA pipeline finalised...")
Code Example #24
def main(args):
    hl.init(default_reference=args.default_ref_genome)

    if args.run_test_mode:
        logger.info('Running pipeline on test data...')
        mt = (get_mt_data(part='raw_chr20').sample_rows(0.1))
    else:
        logger.info(
            'Running pipeline on MatrixTable with adjusted genotypes...')
        ds = args.exome_cohort
        mt = hl.read_matrix_table(
            get_qc_mt_path(dataset=ds,
                           part='unphase_adj_genotypes',
                           split=True))

    # 1. Sample-QC filtering
    if not args.skip_sample_qc_filtering:
        logger.info('Applying per sample QC filtering...')

        mt = apply_sample_qc_filtering(mt)

        logger.info(
            'Writing sample qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt', overwrite=True)

    # 2. Variant-QC filtering
    if not args.skip_variant_qc_filtering:

        logger.info('Applying per variant QC filtering...')

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt/_SUCCESS'):
            logger.info('Reading pre-existing sample qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.sample_qc_filtered.mt')
        mt = apply_variant_qc_filtering(mt)

        # write hard filtered MT to disk
        logger.info(
            'Writing variant qc-filtered mt with rare variants (internal maf 0.01) to disk...'
        )
        mt.write(f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt', overwrite=True)

    # 3. Annotate AFs

    # allelic frequency cut-off
    maf_cutoff = args.af_max_threshold

    if not args.skip_af_filtering:

        if hl.hadoop_is_file(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt/_SUCCESS'):
            logger.info(
                'Reading pre-existing sample/variant qc-filtered MT...')
            mt = hl.read_matrix_table(
                f'{hdfs_dir}/chd_ukbb.variant_qc_filtered.mt')

        # Annotate allelic frequencies from external source,
        # and compute internal AF on samples passing QC
        af_ht = get_af_annotation_ht()

        mt = (mt.annotate_rows(**af_ht[mt.row_key]))

        filter_expressions = [
            af_filter_expr(mt, 'internal_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomad_genomes_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'gnomAD_AF', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'ger_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'rumc_af', af_cutoff=maf_cutoff),
            af_filter_expr(mt, 'bonn_af', af_cutoff=maf_cutoff)
        ]

        mt = (mt.filter_rows(functools.reduce(operator.iand,
                                              filter_expressions),
                             keep=True))

        logger.info(
            'Writing qc-filtered MT filtered to external MAF to disk...')
        mt.write(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt', overwrite=True)

    # 4. ##### Burden Test ######

    logger.info('Running burden test...')

    if hl.hadoop_is_file(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt/_SUCCESS'):
        logger.info(
            'Reading pre-existing sample/variant qc-filtered MT with rare variants...'
        )
        mt = hl.read_matrix_table(f'{hdfs_dir}/chd_ukbb.qc_final.rare.mt')

    ## Add VEP-annotated fields
    vep_ht = get_vep_annotation_ht()

    mt = (mt.annotate_rows(LoF=vep_ht[mt.row_key].vep.LoF,
                           Consequence=vep_ht[mt.row_key].vep.Consequence,
                           DOMAINS=vep_ht[mt.row_key].vep.DOMAINS,
                           SYMBOL=vep_ht[mt.row_key].vep.SYMBOL))

    ## Filter to bi-allelic variants
    if args.filter_biallelic:
        logger.info('Running burden test on biallelic variants...')
        mt = mt.filter_rows(bi_allelic_expr(mt))

    ## Filter to variants within protein domain(s)
    if args.filter_protein_domain:
        logger.info(
            'Running burden test on variants within protein domain(s)...')
        mt = mt.filter_rows(vep_protein_domain_filter_expr(mt.DOMAINS),
                            keep=True)

    ## Add cases/controls sample annotations
    tb_sample = get_sample_meta_data()
    mt = (mt.annotate_cols(**tb_sample[mt.s]))

    mt = (mt.filter_cols(mt['phe.is_case'] | mt['phe.is_control']))

    ## Annotate pathogenic scores
    ht_scores = get_vep_scores_ht()
    mt = mt.annotate_rows(**ht_scores[mt.row_key])

    ## Classify variant into (major) consequence groups
    score_expr_ann = {
        'hcLOF': mt.LoF == 'HC',
        'syn': mt.Consequence == 'synonymous_variant',
        'miss': mt.Consequence == 'missense_variant'
    }

    # Update dict expr annotations with combinations of variant consequences categories
    score_expr_ann.update({
        'missC': (hl.sum([(mt['vep.MVP_score'] >= MVP_THRESHOLD),
                          (mt['vep.REVEL_score'] >= REVEL_THRESHOLD),
                          (mt['vep.CADD_PHRED'] >= CADD_THRESHOLD)]) >= 2)
        & score_expr_ann.get('miss')
    })

    score_expr_ann.update({
        'hcLOF_missC':
        score_expr_ann.get('hcLOF') | score_expr_ann.get('missC')
    })

    mt = (mt.annotate_rows(csq_group=score_expr_ann))

    # Transmute csq_group and convert dict to set where the group is defined
    # (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    mt = (mt.filter_rows(hl.len(mt.csq_group) > 0))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # print('Number of samples/variants: ')
    # print(mt.count())

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(mt['SYMBOL'], mt['csq_group']).aggregate(
        hets=hl.agg.any(mt.GT.is_het()),
        homs=hl.agg.any(mt.GT.is_hom_var()),
        chets=hl.agg.count_where(mt.GT.is_het()) >= 2,
        homs_chets=(hl.agg.count_where(mt.GT.is_het()) >= 2) |
        (hl.agg.any(mt.GT.is_hom_var()))).repartition(100).persist())
    mts = []

    if args.homs:
        # select homs genotypes.

        mt_homs = (mt_grouped.select_entries(
            mac=mt_grouped.homs).annotate_rows(agg_genotype='homs'))

        mts.append(mt_homs)

    if args.chets:
        # select compound hets (chets) genotypes.
        mt_chets = (mt_grouped.select_entries(
            mac=mt_grouped.chets).annotate_rows(agg_genotype='chets'))

        mts.append(mt_chets)

    if args.homs_chets:
        # select chets and/or homs genotypes.
        mt_homs_chets = (mt_grouped.select_entries(
            mac=mt_grouped.homs_chets).annotate_rows(
                agg_genotype='homs_chets'))

        mts.append(mt_homs_chets)

    if args.hets:
        # select hets genotypes
        mt_hets = (mt_grouped.select_entries(
            mac=mt_grouped.hets).annotate_rows(agg_genotype='hets'))

        mts.append(mt_hets)

    ## Joint MatrixTables
    mt_grouped = hl.MatrixTable.union_rows(*mts)

    # Generate table of counts
    tb_gene = (mt_grouped.annotate_rows(
        n_cases=hl.agg.filter(mt_grouped['phe.is_case'],
                              hl.agg.sum(mt_grouped.mac)),
        n_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                  hl.agg.sum(mt_grouped.mac)),
        n_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                     hl.agg.sum(mt_grouped.mac)),
        n_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                 hl.agg.sum(mt_grouped.mac)),
        n_total_cases=hl.agg.filter(mt_grouped['phe.is_case'], hl.agg.count()),
        n_total_syndromic=hl.agg.filter(mt_grouped['phe.is_syndromic'],
                                        hl.agg.count()),
        n_total_nonsyndromic=hl.agg.filter(mt_grouped['phe.is_nonsyndromic'],
                                           hl.agg.count()),
        n_total_controls=hl.agg.filter(mt_grouped['phe.is_control'],
                                       hl.agg.count())).rows())

    # run fet stratified by proband type
    analysis = ['all_cases', 'syndromic', 'nonsyndromic']

    tbs = []
    for proband in analysis:
        logger.info(f'Running test for {proband}...')
        colCases = None
        colTotalCases = None
        colControls = 'n_controls'
        colTotalControls = 'n_total_controls'
        if proband == 'all_cases':
            colCases = 'n_cases'
            colTotalCases = 'n_total_cases'
        if proband == 'syndromic':
            colCases = 'n_syndromic'
            colTotalCases = 'n_total_syndromic'
        if proband == 'nonsyndromic':
            colCases = 'n_nonsyndromic'
            colTotalCases = 'n_total_nonsyndromic'

        tb_fet = compute_fisher_exact(tb=tb_gene,
                                      n_cases_col=colCases,
                                      n_control_col=colControls,
                                      total_cases_col=colTotalCases,
                                      total_controls_col=colTotalControls,
                                      correct_total_counts=True,
                                      root_col_name='fet',
                                      extra_fields={
                                          'analysis': proband,
                                          'maf': maf_cutoff
                                      })

        # filter out zero-count genes
        tb_fet = (tb_fet.filter(
            hl.sum([tb_fet[colCases], tb_fet[colControls]]) > 0, keep=True))

        tbs.append(tb_fet)

    tb_final = hl.Table.union(*tbs)

    tb_final.describe()

    # export results
    date = current_date()
    run_hash = str(uuid.uuid4())[:6]
    output_path = f'{args.output_dir}/{date}/{args.exome_cohort}.fet_burden.{run_hash}.ht'

    tb_final = (tb_final.checkpoint(output=output_path))

    if args.write_to_file:
        # write table to disk as TSV file
        (tb_final.export(f'{output_path}.tsv'))

    hl.stop()
Code Example #25
def main(args):

    # Initializing Hail on cluster mode
    hl.init()

    # 1- Aggregate MatrixTable per gene/consequences creating gene/csq X sample matrix
    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variants
    # Define consequences variant rules with hail expressions
    # TODO: check if fields exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD':
            (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })

    # Annotate groups per variants
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to set (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # force to eval all aggregation operation by writing mt to disk
    # mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)
        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        pass  # TODO: implement gene-based Fisher Exact burden test

    hl.stop()
Code Example #26
File: utils.py Project: saponas/hail
def stop():
    global _initialized
    _initialized = False
    hl.stop()
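The stop() helper above clears a module-level _initialized flag; a matching init wrapper guarded by the same flag might look like the sketch below (an illustration of the pattern, not the actual benchmark utility).

import hail as hl

_initialized = False


def init(**kwargs):
    # Start Hail at most once per process and record that it is running,
    # so the stop() helper above can reset the flag later.
    global _initialized
    if not _initialized:
        hl.init(**kwargs)
        _initialized = True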
Code Example #27
def main(args):
    # Init Hail with hg38 genome build as default
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=vcf_path, force_bgz=args.force_bgz)

    # getting annotated VEP fields names from VCF-header
    vep_fields = get_vep_fields(vcf_path=vcf_path,
                                vep_csq_field=args.csq_field)

    if args.exclude_multi_allelic:
        # TODO: This option should skip the split_multi step...
        # Filter out multi-allelic variants. Keep only bi-allelic
        mt = filter_biallelic(mt)

    # split multi-allelic variants
    mt = hl.split_multi_hts(mt)

    # flatten nested structure (e.g. 'info') and get a HailTable with all rows fields
    tb_csq = (mt.rows().flatten().key_by('locus', 'alleles'))

    # rename info[CSQ] field to 'csq_array'.
    # Simpler field names are easier to parse later...
    tb_csq = (tb_csq.rename({'info.' + args.csq_field: 'csq_array'}))

    # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>.
    # The transcript(s) are represented as a dict<k,v>, the keys are the field names extracted from the VCF header, the
    # values are the current annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_array=tb_csq.csq_array.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching with the allele index.
    # It requires having the flag "ALLELE_NUM" annotated by VEP
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    tb_csq = (tb_csq.annotate(csq_array=hl.cond(
        tb_csq.was_split,
        tb_csq.csq_array.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.
                                           a_index)), tb_csq.csq_array)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(ht=tb_csq, csq_array='csq_array')

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx')

    # Parse the "Consequence" field. Keep only the most severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.transmute(Consequence=tb_csq.Consequence.split('&')[0]))

    # print fields overview
    tb_csq.describe()

    # drop unnecessary fields
    tb_csq = (tb_csq.drop('csq_array', 'tx'))

    # write table as HailTable to disk
    (tb_csq.write(output=args.tb_output_path))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(args.tb_output_path + '.tsv.bgz'))

    # Stop Hail
    hl.stop()
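The parser above depends on get_vep_fields to recover the CSQ sub-field names from the VCF header. Below is a minimal sketch of such a helper, assuming the field list can be read from the INFO/CSQ Description that VEP writes as "Consequence annotations from Ensembl VEP. Format: Allele|Consequence|..."; the project's actual helper may parse the header differently.

import hail as hl

def get_vep_fields(vcf_path, vep_csq_field='CSQ'):
    # Read the VCF header metadata and pull the Description of the CSQ INFO field.
    meta = hl.get_vcf_metadata(vcf_path)
    description = meta['info'][vep_csq_field]['Description']
    # Everything after "Format: " is the pipe-delimited list of VEP field names.
    return description.split('Format: ')[-1].strip('" ').split('|')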
Code example #28
File: utils.py Project: saponas/hail
def handler(signum, frame):
    global _timeout_state
    _timeout_state = True
    hl.stop()
    hl.init(**_init_args)
    raise BenchmarkTimeoutError()
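A sketch of how such a handler is typically installed with the standard-library signal module; the exact wiring in the benchmark harness (timeout value, where the context manager is entered) is an assumption.

import signal
from contextlib import contextmanager

@contextmanager
def run_with_timeout(seconds):
    # Deliver SIGALRM after `seconds`; the handler above stops/re-inits Hail
    # and raises BenchmarkTimeoutError, unwinding the benchmark being timed.
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)  # cancel the pending alarm on normal completion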
Code example #29
def main(args):
    # Initialize Hail in cluster mode
    init_hail_on_cluster(tmp_dir=HAIL_TMP_DIR,
                         log_file=HAIL_LOG_PATH,
                         local_mode=True)

    # 1- Aggregate the MatrixTable per gene/consequence, creating a (gene, csq) x sample matrix

    # Read MatrixTable
    mt = hl.read_matrix_table(args.mt_input_path)

    # Annotate csq group info per variant
    # Define consequence-based variant rules with Hail expressions
    # TODO: check if field exist in dataset
    csq_group_rules = {}
    if args.ptv:
        csq_group_rules.update({'PTV': mt.csq_type == 'PTV'})
    if args.pav:
        csq_group_rules.update({'PAV': mt.csq_type == 'PAV'})
    if args.syn:
        csq_group_rules.update({'SYN': mt.csq_type == 'SYN'})
    if args.cadd:
        csq_group_rules.update({
            'CADD':
            (mt.csq_type == 'PAV') & (mt.cadd_phred >= args.cadd_threshold)
        })
    if args.mpc:
        csq_group_rules.update(
            {'MPC': (mt.csq_type == 'PAV') & (mt.mpc >= args.mpc_threshold)})

    # Annotate groups per variant
    mt = (mt.annotate_rows(csq_group=csq_group_rules))

    # Transmute csq_group and convert to a set (easier to explode and group later)
    mt = (mt.transmute_rows(csq_group=hl.set(
        hl.filter(lambda x: mt.csq_group.get(x), mt.csq_group.keys()))))

    # Explode nested csq_group before grouping
    mt = (mt.explode_rows(mt.csq_group))

    # Group mt by gene/csq_group.
    mt_grouped = (mt.group_rows_by(
        mt.csq_group, mt.symbol).partition_hint(100).aggregate(
            n_het=hl.agg.count_where(mt.GT.is_het())))

    # 2- Annotate gene set information

    # Import and parse the gene cluster table
    clusters = hl.import_table(args.gene_set_path, no_header=True)

    # parsing gene set column
    clusters = (clusters.transmute(genes=hl.set(clusters['f1'].split(
        delim='[|]'))))

    clusters = (clusters.explode(clusters.genes))

    clusters = (clusters.group_by('genes').partition_hint(100).aggregate(
        cluster_name=hl.agg.collect_as_set(clusters['f0'])).key_by('genes'))

    # annotate gene set info
    mt_grouped = (mt_grouped.annotate_rows(
        cluster_name=clusters[mt_grouped.symbol].cluster_name))

    # 3- Aggregate per gene set and consequences

    # Group mt by gene set/csq_group.
    mt_grouped = (mt_grouped.explode_rows(mt_grouped.cluster_name))
    mt_grouped = (mt_grouped.group_rows_by(
        mt_grouped.cluster_name,
        mt_grouped.csq_group).partition_hint(100).aggregate(
            n_het=hl.agg.sum(mt_grouped.n_het)))

    # force evaluation of all aggregation operations by persisting the mt to disk
    mt_grouped = mt_grouped.persist(storage_level='DISK_ONLY')

    if args.logistic_regression:
        # covariates list
        covs = list(args.covs_list)

        # Define x expression (entries/genotype)
        x_expr = 'n_het'

        extra_annotations = {'analysis': 'all_cases', 'covariates': covs}

        tb_stats = logistic_regression(mt=mt_grouped,
                                       x_expr=x_expr,
                                       response=args.phenotype_field,
                                       covs=covs,
                                       pass_through=[],
                                       extra_fields=extra_annotations)
        # export table
        tb_stats.export(args.output_path)

    if args.fet:
        pass  # TODO: implement the Fisher exact burden test over gene sets (see the sketch after this example)

    hl.stop()
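The Fisher exact burden test left as a TODO above could be shaped as follows: count carrier and non-carrier samples per gene set/consequence group in cases and controls, then apply hl.fisher_exact_test. This is a sketch only; `is_case` is an assumed boolean column field, the carrier definition (n_het > 0) is a simplification, and the output path suffix is illustrative.

fet_ht = (mt_grouped
          .annotate_rows(
              case_car=hl.agg.count_where(mt_grouped.is_case & (mt_grouped.n_het > 0)),
              case_non=hl.agg.count_where(mt_grouped.is_case & (mt_grouped.n_het == 0)),
              ctrl_car=hl.agg.count_where(~mt_grouped.is_case & (mt_grouped.n_het > 0)),
              ctrl_non=hl.agg.count_where(~mt_grouped.is_case & (mt_grouped.n_het == 0)))
          .rows())

# hl.fisher_exact_test expects int32 counts; aggregation counts are int64.
fet_ht = fet_ht.annotate(fet=hl.fisher_exact_test(hl.int32(fet_ht.case_car),
                                                  hl.int32(fet_ht.case_non),
                                                  hl.int32(fet_ht.ctrl_car),
                                                  hl.int32(fet_ht.ctrl_non)))
fet_ht.export(args.output_path + '.fet.tsv.bgz')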
Code example #30
File: vep_parser.py Project: enriquea/wes_chd_ukbb
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    # vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # get the annotated VEP field names from the VCF header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate the INFO fields using the allele index
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1]
               for field in INFO_FIELDS}))

    # parse/annotate the CSQ field into a different structure
    tb_csq = mt.rows()
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field]))

    # Convert/annotate all transcripts per variant with a structure of type array<dict<str, str>>.
    # Each transcript is represented as a dict<k, v>, where the keys are the field names extracted
    # from the VCF header and the values are the corresponding values annotated in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep the transcript(s) matching the allele index (only used if variants were split with split_multi_hts).
    # This requires the "ALLELE_NUM" flag to be annotated by VEP.
    # Apply only where the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all(
        [x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]):
        tb_csq = (tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.
                                             a_index)), tb_csq.csq_raw)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(
        ht=tb_csq,
        csq_array='csq_raw',
    )

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the more severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0])))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS']))))

    # drop redundant/temp fields
    tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500))

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    # (tb_csq
    # .write(output=args.tb_output_path,
    #        overwrite=args.overwrite)
    # )

    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(f'{output_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()
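Both parsers above rely on annotate_from_dict to expand the selected transcript dict into named fields (top-level fields in code example #27, nested under 'vep' here). Below is a minimal sketch of such a helper, keeping the call-site parameter spelling output_filed; the project's actual implementation may differ.

import hail as hl

def annotate_from_dict(ht, dict_field, output_filed=None):
    # The dict keys (the VEP sub-field names) are identical on every row,
    # so collect them once and promote each entry to a named field.
    keys = ht.aggregate(hl.agg.take(ht[dict_field].key_set(), 1))[0]
    fields = {k: ht[dict_field].get(k) for k in keys}
    if output_filed is None:
        return ht.annotate(**fields)  # expand as top-level fields
    return ht.annotate(**{output_filed: hl.struct(**fields)})  # nest under a struct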
Code example #31
def download_data(data_dir):
    global _data_dir, _mt
    _data_dir = data_dir or os.environ.get(
        'HAIL_BENCHMARK_DIR') or '/tmp/hail_benchmark_data'
    logging.info(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), [
        'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht',
        'table_10M_par_100.ht', 'table_10M_par_10.ht',
        'gnomad_dp_simulation.mt', 'many_strings_table.ht',
        'many_ints_table.ht', 'sim_ukb.bgen'
    ])
    if not all(os.path.exists(file) for file in files):
        hl.init()  # use all cores

        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        logging.info('downloading profile.vcf.bgz...')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
            vcf)
        logging.info('done downloading profile.vcf.bgz.')
        logging.info('importing profile.vcf.bgz...')
        hl.import_vcf(vcf, min_partitions=16).write(os.path.join(
            _data_dir, 'profile.mt'),
                                                    overwrite=True)
        logging.info('done importing profile.vcf.bgz.')

        logging.info('writing 10M row partitioned tables...')

        ht = hl.utils.range_table(
            10_000_000,
            1000).annotate(**{f'f_{i}': hl.rand_unif(0, 1)
                              for i in range(5)})
        ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'),
                           overwrite=True)
        ht = ht.naive_coalesce(100).checkpoint(os.path.join(
            _data_dir, 'table_10M_par_100.ht'),
                                               overwrite=True)
        ht.naive_coalesce(10).write(os.path.join(_data_dir,
                                                 'table_10M_par_10.ht'),
                                    overwrite=True)
        logging.info('done writing 10M row partitioned tables.')

        logging.info('creating gnomad_dp_simulation matrix table...')
        mt = hl.utils.range_matrix_table(n_rows=250_000,
                                         n_cols=1_000,
                                         n_partitions=32)
        mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3))
        mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'),
                 overwrite=True)
        logging.info('done creating gnomad_dp_simulation matrix table.')

        logging.info('downloading many_strings_table.tsv.bgz...')
        mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
        mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz',
            mst_tsv)
        logging.info('done downloading many_strings_table.tsv.bgz.')
        logging.info('importing many_strings_table.tsv.bgz...')
        hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
        logging.info('done importing many_strings_table.tsv.bgz.')

        logging.info('downloading many_ints_table.tsv.bgz...')
        mit_tsv = os.path.join(_data_dir, 'many_ints_table.tsv.bgz')
        mit_ht = os.path.join(_data_dir, 'many_ints_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_ints_table.tsv.bgz',
            mit_tsv)
        logging.info('done downloading many_ints_table.tsv.bgz.')
        logging.info('importing many_ints_table.tsv.bgz...')
        hl.import_table(mit_tsv,
                        types={
                            'idx': 'int',
                            **{f'i{i}': 'int'
                               for i in range(5)},
                            **{f'array{i}': 'array<int>'
                               for i in range(2)}
                        }).write(mit_ht, overwrite=True)
        logging.info('done importing many_ints_table.tsv.bgz.')

        bgen = 'sim_ukb.bgen'
        sample = 'sim_ukb.sample'
        logging.info(f'downloading {bgen}...')
        local_bgen = os.path.join(_data_dir, bgen)
        local_sample = os.path.join(_data_dir, sample)
        urlretrieve(
            f'https://storage.googleapis.com/hail-common/benchmark/{bgen}',
            local_bgen)
        urlretrieve(
            f'https://storage.googleapis.com/hail-common/benchmark/{sample}',
            local_sample)
        logging.info(f'done downloading {bgen}.')
        logging.info(f'indexing {bgen}...')
        hl.index_bgen(local_bgen)
        logging.info(f'done indexing {bgen}.')

        hl.stop()
    else:
        logging.info('all files found.')
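A minimal usage sketch of the downloader; the driver below (the init call and the profile.mt read) is assumed for illustration and is not part of the benchmark module.

if __name__ == '__main__':
    import hail as hl
    download_data(data_dir=None)  # falls back to HAIL_BENCHMARK_DIR, then /tmp
    hl.init()
    mt = hl.read_matrix_table(os.path.join(_data_dir, 'profile.mt'))
    print(f'profile.mt dimensions (rows, cols): {mt.count()}')
    hl.stop()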