Example #1
def startTestHailContext():
    global _initialized
    if not _initialized:
        url = os.environ.get('HAIL_TEST_SERVICE_BACKEND_URL')
        if url:
            hl.init(master='local[2]', min_block_size=0, quiet=True, _backend=hl.backend.ServiceBackend(url))
        else:
            hl.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
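A matching teardown is not shown in the snippet; a minimal sketch, assuming the same module-level _initialized flag and hl import, could look like:

def stopTestHailContext():
    # Hypothetical companion to startTestHailContext: stop the current Hail
    # session and clear the flag so the next call re-initializes cleanly.
    global _initialized
    if _initialized:
        hl.stop()
        _initialized = False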
Example #2
def initialize(cores, log, n_iter):
    assert not _initialized
    hl.init(master=f'local[{cores}]', quiet=True, log=log)

    global _n_iter
    _n_iter = n_iter

    download_data()

    # make JVM do something to ensure that it is fresh
    hl.utils.range_table(1)._force_count()
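As a hedged illustration of how the module state set up here might be consumed (the runner below is assumed, not part of the original module):

def run_benchmark(benchmark_fn):
    # Hypothetical runner: after initialize() has warmed the JVM, time the
    # benchmark _n_iter times and return the fastest wall-clock run in seconds.
    import timeit
    return min(timeit.repeat(benchmark_fn, repeat=_n_iter, number=1))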
Example #3
def main():
    parser = argparse.ArgumentParser(description="Driver for hail's gVCF combiner")
    parser.add_argument('--sample-map', help='path to the sample map (must be filesystem local)',
                        required=True)
    parser.add_argument('--sample-file', help='path to a file containing a line separated list '
                                              'of samples to combine (must be filesystem local)')
    parser.add_argument('--tmp-path', help='path to folder for temp output (can be a cloud bucket)',
                        default='/tmp')
    parser.add_argument('--out-file', '-o', help='path to final combiner output', required=True)
    parser.add_argument('--summarize', help='if defined, run summarize, placing the rows table '
                                            'of the output at the argument value')
    parser.add_argument('--json', help='json to use for the import of the gVCFs '
                                       '(must be filesystem local)', required=True)
    args = parser.parse_args()
    samples = build_sample_list(args.sample_map, args.sample_file)
    with open(args.json) as j:
        json = j.read()
    hl.init(default_reference=DEFAULT_REF,
            log='/hail-joint-caller-' + time.strftime('%Y%m%d-%H%M') + '.log')
    run_combiner(samples, json, args.out_file, args.tmp_path, summary_path=args.summarize,
                 overwrite=True)
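A hypothetical way to drive main() programmatically; every argument value below is a placeholder, not taken from the original project:

import sys

# Patch sys.argv before argparse runs inside main(); all paths are placeholders.
sys.argv = [
    'combiner_driver.py',
    '--sample-map', 'sample_map.tsv',
    '--json', 'import_config.json',
    '--tmp-path', 'gs://my-bucket/tmp',
    '-o', 'gs://my-bucket/combined.mt',
]
main()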
Example #4
def main():
    hl.init()
    data = hl.import_vcf(
        os.path.join(PROJECT_DIR, 'data/chr22_1000_missing.vcf'))
    labels = hl.import_table(os.path.join(PROJECT_DIR,
                                          'data/chr22-labels.csv'),
                             delimiter=',',
                             types={
                                 '22_16050408': 'float64'
                             }).key_by('sample')

    mt = data.annotate_cols(pheno=labels[data.s])
    y = mt.pheno['22_16050408']
    x = mt.GT.n_alt_alleles()
    mt = matrix_table_source('random_forest_model/x', x)
    check_entry_indexed('random_forest_model/x', x)
    mts = mt._select_all(col_exprs=dict(y=y),
                         row_exprs=dict(),
                         col_key=[],
                         entry_exprs=dict(e=x))

    mts.write(os.path.join(
        PROJECT_DIR, 'src/test/data/hail/chr22_1000_missing-22_16050408.vds'),
              overwrite=True)
Example #5
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    gnomad_loadings_path = f'{output}/gnomad_loadings_90k_liftover.ht'

    # liftover and get variants
    ht_gnomad_loadings = hl.read_table(GNOMAD_V2_LOADINGS)
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(
        'gs://hail-common/references/grch37_to_grch38.over.chain.gz', rg38)
    ht_gnomad_loadings_liftover = ht_gnomad_loadings.annotate(
        liftover=hl.liftover(ht_gnomad_loadings.locus,
                             'GRCh38',
                             include_strand=False),
        old_locus=ht_gnomad_loadings.locus,
    )
    ht_gnomad_loadings_liftover = ht_gnomad_loadings_liftover.key_by(
        locus=ht_gnomad_loadings_liftover.liftover)

    # save gnomad loadings
    ht_gnomad_loadings_liftover.write(gnomad_loadings_path, overwrite=True)
Example #6
def main(args):
    hl.init(
        log="/variant_filter.log",
        tmp_dir="gs://ccdg-30day-temp/",
        default_reference="GRCh38",
    )
    # TODO: This flag can be removed if this error is no longer relevant: log4j:ERROR Failed to flush writer,
    #  java.io.IOException: No space left on device when trying to write a densified MT from VDS
    hl._set_flags(distributed_scan_comb_op="1")

    if args.update_ccdg_exome_interval_table:
        ccdg_interval_qc_ht(args.pct_samples_defined, overwrite=True)

    determine_pca_variants(
        autosomes_only=not args.not_autosomes_only,
        bi_allelic_only=not args.not_bi_allelic_only,
        adj_only=not args.not_adj_only,
        snv_only=not args.not_snv_only,
        min_gnomad_v3_ac=args.gnomad_v3_ac_filter,
        high_qual_ccdg_exome_interval_only=not args.not_high_qual_ccdg_interval_only,
        high_qual_ukbb_exome_interval_only=not args.not_high_qual_ukbb_interval_only,
        filter_lcr=not args.not_filter_lcr,
        filter_segdup=not args.not_filter_segdup,
        min_joint_af=args.min_af,
        min_joint_callrate=args.min_callrate,
        min_ccdg_exome_callrate=args.ccdg_exome_callrate_cutoff,
        min_ukbb_exome_callrate=args.ukbb_exome_callrate_cutoff,
        ld_pruning=not args.not_ld_pruning,
        ld_pruning_dataset=args.ld_pruning_dataset,
        ld_r2=args.ld_r2,
        read_per_dataset_checkpoint_if_exists=args.read_per_dataset_checkpoint_if_exists,
        read_pre_ld_prune_ht_checkpoint_if_exists=args.read_pre_ld_prune_ht_checkpoint_if_exists,
        read_pre_ld_prune_mt_checkpoint_if_exists=args.read_pre_ld_prune_mt_checkpoint_if_exists,
        overwrite=args.overwrite,
        filter_washu=args.filter_washu,
    )
Example #7
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0]
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        p = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = f'{output}/loadings_manhattan_plot_pc' + str(
            pc) + '.png'
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        plot_filename_html = 'loadings_pc' + str(pc) + '.html'
        output_file(plot_filename_html)
        save(p)
        subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                       check=False)
Example #8
def main():
    
    use_tabix = True
    
    hl.init(log='/Users/nbaya/Downloads/get_chr_pos.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')
    
    b = hb.Batch(name='get_chr_pos', backend=backend,
                 default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
                 default_storage='2G', default_cpu=1)

    
    paths = get_paths()
    
    for path in paths:
        print(path)
        annotate_chr_pos(b=b,
                         path=path,
                         use_tabix=use_tabix)
    
    b.run(open=True)
    
    backend.close()
Example #9
import hail as hl
import logging
import os
import pandas as pd
import re
import subprocess

logging.basicConfig(format='%(asctime)s %(levelname)-8s %(message)s',
                    level=logging.INFO)
logger = logging.getLogger(__name__)

hl.init(log="/dev/null")

#%%
ht = hl.read_table(
    "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht")
ht = ht.filter(ht.release)
release_samples = ht.s.collect()

#%%

sample_ids_gnomad_v3 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3 = list(
    set(sample_ids_gnomad_v3) & set(release_samples))  # 39285 samples

sample_ids_gnomad_v3_1 = hl.hadoop_open(
    "gs://gnomad-bw2/sample_ids_gnomad_v3_1__20210131.txt").read().split("\n")
release_sample_ids_gnomad_v3_1 = list(
    set(sample_ids_gnomad_v3_1) & set(release_samples))  # 3526 samples
Example #10
import functools as ft
import json
import os
import uvloop
from aiohttp import web

import jwt
import hail as hl
from hail.utils import FatalError
from hail.utils.java import Env, info, scala_object
import hailjwt as hj

uvloop.install()

master = os.environ.get('HAIL_APISERVER_SPARK_MASTER')
hl.init(master=master, min_block_size=0)

app = web.Application()
routes = web.RouteTableDef()


with open(os.environ.get('HAIL_JWT_SECRET_KEY_FILE') or '/jwt-secret/secret-key') as f:
    jwtclient = hj.JWTClient(f.read())


def authenticated_users_only(fun):
    @ft.wraps(fun)
    def wrapped(request, *args, **kwargs):
        encoded_token = request.cookies.get('user')
        if encoded_token is not None:
            try:
Example #11
def main(args):
    # Init Hail
    hl.init(default_reference=args.default_ref_genome)

    # Import VEPed VCF file as MatrixTable and get VCF file meta-data
    # vcf_path = args.vcf_vep_path
    mt = hl.import_vcf(path=get_vep_vqsr_vcf_path(), force_bgz=args.force_bgz)

    # getting annotated VEP fields names from VCF-header
    vep_fields = get_vep_fields(vcf_path=get_vep_vqsr_vcf_path(),
                                vep_csq_field=args.csq_field)

    if args.split_multi_allelic:
        # split multi-allelic variants
        mt = hl.split_multi_hts(mt)

        # split/annotate fields in the info field (use allele index )
        mt = mt.annotate_rows(info=mt.info.annotate(
            **{field: mt.info[field][mt.a_index - 1]
               for field in INFO_FIELDS}))

    # parse/annotate the CSQ field in a different structure
    tb_csq = mt.rows()
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.info[args.csq_field]))

    # Convert/annotate all transcripts per variants with a structure of type array<dict<str, str>>.
    # The transcript(s) are represented as a dict<k,v>, where keys are the field names extracted from the VCF header and
    # the values are the current annotated values in the CSQ field.
    tb_csq = (tb_csq.annotate(csq_raw=tb_csq.csq_raw.map(
        lambda x: hl.dict(hl.zip(vep_fields, x.split('[|]'))))))

    # Keep transcript(s) matching with the allele index (only used if variant were split with split_multi_hts)
    # It requires having the flag "ALLELE_NUM" annotated by VEP
    # Apply only were the alleles were split.
    # TODO: Handle exception when the flag "ALLELE_NUM" is not present
    if all(
        [x in list(tb_csq._fields.keys()) for x in ['was_split', 'a_index']]):
        tb_csq = (tb_csq.annotate(csq_raw=hl.cond(
            tb_csq.was_split,
            tb_csq.csq_raw.filter(lambda x: (hl.int(x["ALLELE_NUM"]) == tb_csq.
                                             a_index)), tb_csq.csq_raw)))

    # select and annotate one transcript per variant based on pre-defined rules
    tb_csq = pick_transcript(
        ht=tb_csq,
        csq_array='csq_raw',
    )

    # Expand selected transcript (dict) annotations adding independent fields.
    tb_csq = annotate_from_dict(ht=tb_csq, dict_field='tx', output_filed='vep')

    # Parse the "Consequence" field. Keep only the more severe consequence.
    # Avoid the notation "consequence_1&consequence_2"
    tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
        Consequence=tb_csq.vep.Consequence.split('&')[0])))

    # Parse the protein DOMAIN field
    if 'DOMAINS' in vep_fields:
        tb_csq = (tb_csq.annotate(vep=tb_csq.vep.annotate(
            DOMAINS=vep_protein_domain_ann_expr(tb_csq.vep['DOMAINS']))))

    # drop redundant/temp fields
    tb_csq = (tb_csq.drop('csq_raw', 'tx').repartition(500))

    # print fields overview
    tb_csq.describe()

    # write table as HailTable to disk
    # (tb_csq
    # .write(output=args.tb_output_path,
    #        overwrite=args.overwrite)
    # )

    output_path = get_variant_qc_ht_path(part='vep_vqsr',
                                         split=args.split_multi_allelic)
    tb_csq = (tb_csq.checkpoint(output=output_path, overwrite=args.overwrite))

    if args.write_to_file:
        # write table to disk as a BGZ-compressed TSV file
        (tb_csq.export(f'{output_path}.tsv.bgz'))

    # Stop Hail
    hl.stop()
Example #12
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(cohort_sample_codes=hl.if_else(
        scores.s.contains('snp_chip'), 'snp_chip', 'tob_wgs'))
    labels = scores.cohort_sample_codes
    hover_fields = dict([('s', scores.s)])

    # get percent variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        print(f'PC{pc1 + 1} vs PC{pc2 + 1}')
        p = hl.plot.scatter(
            scores.scores[pc1],
            scores.scores[pc2],
            label=labels,
            title='TOB-WGS + TOB SNP Chip',
            xlabel='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) + '%)',
            ylabel='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) + '%)',
            hover_fields=hover_fields,
        )
        plot_filename = output_path('pc' + str(pc2) + '.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        html = file_html(p, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Get partner sample information
    sample_names = scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # save as html
    labels = list(map(sample_type, sample_names))
    html = pd.DataFrame({
        'sample_name': sample_names,
        'sample_tech': labels
    }).to_html()
    plot_filename_html = output_path('sample_technology.html', 'web')
    with hl.hadoop_open(plot_filename_html, 'w') as f:
        f.write(html)

    # plot
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Reprocessed Sample Projection',
            x_axis_label='PC' + str(pc1 + 1) + ' (' + str(variance[pc1]) +
            '%)',
            y_axis_label='PC' + str(pc2 + 1) + ' (' + str(variance[pc2]) +
            '%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path('technology_type_pc' + str(pc2) + '.png',
                                    'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'technology_type_pc{pc2}.html',
                                         'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #13
#!/usr/bin/env python3

import hail as hl
hl.init(tmp_dir='/net/scratch/people/plggosborcz', spark_conf={'spark.driver.memory': '90G', 'spark.executor.memory': '90G'}, default_reference='GRCh38') 

europeans = hl.import_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/europeans', delimiter='\t', no_header=True)
to_keep = europeans['f0'].collect()



controls = hl.read_matrix_table('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/gnomad.genomes.v3.1.2.hgdp_1kg_subset_sparse.mt')
controls = controls.filter_cols(hl.literal(to_keep).contains(controls.s))
controls.write('/net/archive/groups/plggneuromol/imdik-zekanowski-sportwgs/data/external-data/1kg/1kg-europeans-sparse.ht')
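A hypothetical alternative for the same column filter is a semi-join against the keyed europeans table instead of a literal list:

# Assumes the imported europeans table keeps its default 'f0' sample-ID column.
europeans_keyed = europeans.key_by('f0')
controls = controls.filter_cols(hl.is_defined(europeans_keyed[controls.s]))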
Example #14
def main(args):
    hl.init(default_reference="GRCh38", log="/variant_histograms.log")

    logger.info("Loading ANNOTATIONS_HISTS dictionary...")
    if not file_exists(annotation_hists_path()):
        raise DataException(
            "Annotation hists JSON file not found. Need to create this JSON before running script!"
        )

    with hl.hadoop_open(annotation_hists_path()) as a:
        ANNOTATIONS_HISTS = json.loads(a.read())

    # NOTE: histogram aggregations on these metrics are done on the entire callset (not just PASS variants), on raw data
    ht = hl.read_table(release_ht_path(public=False))
    ht = ht.select(freq=ht.freq, info=ht.info.select(*ANNOTATIONS_HISTS))

    inbreeding_bin_ranges = ANNOTATIONS_HISTS["InbreedingCoeff"]

    # Remove InbreedingCoeff from ANNOTATIONS_HISTS. It requires different ranges by allele frequency and needs to be
    # handled differently. It is stored as a dictionary in annotation_hists_path
    ANNOTATIONS_HISTS.pop("InbreedingCoeff")

    logger.info("Getting info annotation histograms...")
    hist_ranges_expr = get_annotations_hists(ht, ANNOTATIONS_HISTS, LOG10_ANNOTATIONS)

    # Evaluate minimum and maximum values for each metric of interest to help determine the bounds of the hists
    # NOTE: Run this first, then update values in annotation_hists_path JSON as necessary
    if args.determine_bounds:
        logger.info(
            "Evaluating minimum and maximum values for each metric of interest. Maximum values capped at 1e10."
        )
        minmax_dict = {}
        for metric in ANNOTATIONS_HISTS:
            minmax_dict[metric] = hl.struct(
                min=hl.agg.min(ht.info[metric]),
                max=hl.if_else(
                    hl.agg.max(ht.info[metric]) < 1e10,
                    hl.agg.max(ht.info[metric]),
                    1e10,
                ),
            )
        minmax = ht.aggregate(hl.struct(**minmax_dict))
        logger.info(f"Metrics bounds: {minmax}")
    else:
        logger.info(
            "Aggregating hists over ranges defined in the annotation_hists_path JSON file. --determine_bounds can "
            "be used to help define these ranges..."
        )
        hists = ht.aggregate(
            hl.array(
                [
                    hist_expr.annotate(metric=hist_metric)
                    for hist_metric, hist_expr in hist_ranges_expr.items()
                ]
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.QUALapprox),
                            *ANNOTATIONS_HISTS["QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="QUALapprox-" + x[0]))
            )
            .extend(
                hl.array(
                    hl.agg.group_by(
                        create_frequency_bins_expr(AC=ht.freq[1].AC, AF=ht.freq[1].AF),
                        hl.agg.hist(
                            hl.log10(ht.info.AS_QUALapprox),
                            *ANNOTATIONS_HISTS["AS_QUALapprox"],
                        ),
                    )
                ).map(lambda x: x[1].annotate(metric="AS_QUALapprox-" + x[0]))
            ),
            _localize=False,
        )

        # Defining hist range and bins for allele frequency groups because they needed different ranges
        ht = ht.annotate(af_bin=create_frequency_bins_expr_inbreeding(AF=ht.freq[1].AF))
        inbreeding_hists = [
            ht.aggregate(
                hl.agg.filter(
                    ht.af_bin == x,
                    hl.agg.hist(ht.info.InbreedingCoeff, *inbreeding_bin_ranges[x],),
                )
            ).annotate(metric="InbreedingCoeff" + "-" + x)
            for x in inbreeding_bin_ranges
        ]

        hists = hl.eval(hl.json(hists))
        inbreeding_hists = hl.eval(hl.json(inbreeding_hists))

        # Note: The following removes '}' from the JSON stored in hists and '{' from the JSON stored in
        # inbreeding_hists then joins them together to be written out as a single JSON
        hists = hists[:-1] + "," + inbreeding_hists[1:]

        logger.info("Writing output")
        with hl.hadoop_open(qual_hists_json_path(), "w") as f:
            f.write(hists)
Example #15
        #ht_out = ht_out.annotate(**covs[ht_out.key])
        ht_comb = ht_out.select(*p_max.keys(),
                                age=ht_out.phenotypes.age,
                                sex=ht_out.phenotypes.sex,
                                pheno=ht_out.phenotypes[pheno])

        output_location = args.ss_clump_prefix + pheno + '_apcdr_PRS'
        #ht_comb.describe()
        #ht_comb.write(output_location + '.ht', overwrite=args.overwrite)
        #ht_comb = hl.read_table(output_location + '.ht')
        ht_comb.export(output_location + '.txt.bgz')

    end = time.time()
    print("Success! Job was completed in %s" %
          time.strftime("%H:%M:%S", time.gmtime(end - start)))


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--ss_clump_prefix',
        default='gs://apcdr/prs_sumstats_clump/apcdr_ukb_10k_eur_holdout_meta/'
    )
    parser.add_argument('--ss_suffix', default='.meta.bgz')
    parser.add_argument('--chr_pos_ref_alt_p_beta',
                        default='CHR,POS,A1,A2,P,BETA')
    parser.add_argument('--overwrite', action='store_true')
    args = parser.parse_args()

    hl.init(log='/prs.log')
    main(args)
Example #16
    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)


if __name__ == "__main__":
    # need to create the Spark cluster first before initialising Hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory
    hl.init(sc=sc, tmp_dir=tmp_dir, default_reference="GRCh38")
    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment;
    # you may use your own here from your .s3fg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    #####################################################################
    ###################### INPUT DATA  ##############################
    #####################################################################
    parser = argparse.ArgumentParser()
    # Read the matrixtable, chrX and chrY should be included
    input_params = parser.add_argument_group("Input parameters")
    input_params.add_argument(
        "--matrixtable",
Example #17
def hailthread(cond1, q, cond2, qcm, inputDir, outputDir, qaws_size):

    #Load id_conversion file
    #table_idconv=hl.import_table('id_conversion')

    #Load markers files
    #table_makers_pos=hl.import_table('800k_to_extract_indexed2.txt',delimiter=':',no_header=True,impute=True)
    #table_markers_all=hl.import_table('800k_to_extract_indexed_alleles_gt2.txt',delimiter=':',no_header=True,impute=True)

    #cut -f 1 -d',' 800k_to_extract_indexed2.txt > interval_table
    #awk -F':' '{print $1"\t"$2"\t"$2}' interval_table > interval_table2

    hl.init()
    cond1.acquire()
    while not an_item_is_available(q):
        #print("Thread hail to sleep")
        #time.sleep(300)
        print("Thread hail to wait")

        cond1.wait()

    file = get_an_available_item(q)
    print("Thread hail get item " + file)
    qaws_size = qaws_size - 1
    cond1.release()

    interval_table = hl.import_locus_intervals('interval_table2',
                                               reference_genome='GRCh38')

    while file != "END":
        fileParts = file.split("/")[-1]
        fileName = fileParts.replace(".vcf.gz", "").replace(".gvcf.gz", "")
        chrName = fileName.split("_")[-3]
        #myFNAL=fileName.split("\\.")
        #myTempId=myFNAL[0]
        #Load gVCF file
        #data=hl.import_vcf("/mnt/vol1/java/gel_test.vcf",force_bgz=True,reference_genome='GRCh38')
        #data=hl.import_vcf("/mnt/vol1/java/gel_mainProgramme_aggV2_chr10_129040437_131178399.vcf.gz",force_bgz=True,reference_genome='GRCh38')
        try:

            #Extract INFO fields

            data = hl.import_vcf(inputDir + "/" + fileParts,
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 drop_samples=True)
            #Filters PASS
            if chrName != "chrY":
                data = data.filter_rows(data.filters.size() > 0, keep=False)
            #Multiallelic
            data = hl.split_multi_hts(data)
            #Join with markers
            data_filtered = data.filter_rows(
                hl.is_defined(interval_table[data.locus]))

            data_sr = data_filtered.select_rows(
                data_filtered.info.medianDepthAll,
                data_filtered.info.medianDepthNonMiss,
                data_filtered.info.medianGQ, data_filtered.info.missingness,
                data_filtered.info.completeGTRatio, data_filtered.info.ABratio,
                data_filtered.info.MendelSite, data_filtered.info.AN,
                data_filtered.info.AC, data_filtered.info.AC_Hom,
                data_filtered.info.AC_Het)

            ht = data_sr.make_table()
            ht.export(outputDir + "/" + fileName + "_INFO.tsv")
            os.system("sed -i 's/\\[//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("sed -i 's/]//g' " + outputDir + "/" + fileName +
                      "_INFO.tsv")
            os.system("cat " + outputDir + "/" + fileName +
                      "_INFO.tsv | grep -v locus " + " >> " + outputDir +
                      "/INFO_" + chrName)
            os.system("rm " + inputDir + "/" + fileParts)

            cond2.acquire()
            print("Thread hail make item available " + fileName)
            make_an_item_available(qcm, file)
            cond2.notify_all()
            cond2.release()
        except FatalError as e:
            print("Exception2 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except AssertionError as e:
            print("Exception3 in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

        except Exception as e:
            print("Exception in file:" + file)
            os.system("rm " + inputDir + "/" + fileParts)

            #raise Exception
        cond1.acquire()
        while not an_item_is_available(q):
            #print("Thread hail to sleep")
            #time.sleep(300)
            print("Thread hail to wait")
            cond1.wait()

        file = get_an_available_item(q)
        print("Thread hail get item " + file)
        qaws_size = qaws_size - 1
        cond1.release()
    time.sleep(300)
    cond2.acquire()
    print("Thread hail make END available")
    make_an_item_available(qcm, "END")
    cond2.notify_all()
    cond2.release()
Example #18
def startTestHailContext():
    global _initialized
    if not _initialized:
        hail.init(master='local[2]', min_block_size=0, quiet=True)
        _initialized = True
Example #19
import hail as hl

from hail.utils.java import Env, info

import logging
import flask

hl.init()

app = flask.Flask('hail-apiserver')

@app.route('/execute', methods=['POST'])
def execute():
    code = flask.request.json
    
    info(f'execute: {code}')
    
    jir = Env.hail().expr.ir.IRParser.parse_value_ir(code, {}, {})
    
    typ = hl.HailType._from_java(jir.typ())
    value = Env.hail().expr.ir.Interpret.interpretPyIR(code, {}, {})

    result = {
        'type': str(typ),
        'value': value
    }
    
    info(f'result: {result}')
    
    return flask.jsonify(result)
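A hedged sketch of a client for the /execute route above; the host, port, and the posted IR string are placeholders, not taken from the original service:

import requests

# POST a value-IR string to the endpoint and print the {'type', 'value'} reply.
response = requests.post('http://localhost:5000/execute', json='(I32 5)')
print(response.json())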
Example #20
import json
import hail as hl

gvcfs = ['gs://hail-ci/gvcfs/HG00096.g.vcf.gz',
         'gs://hail-ci/gvcfs/HG00268.g.vcf.gz']
hl.init(default_reference='GRCh38')
parts = [
    {'start': {'locus': {'contig': 'chr20', 'position': 17821257}},
     'end': {'locus': {'contig': 'chr20', 'position': 18708366}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 18708367}},
     'end': {'locus': {'contig': 'chr20', 'position': 19776611}},
     'includeStart': True,
     'includeEnd': True},
    {'start': {'locus': {'contig': 'chr20', 'position': 19776612}},
     'end': {'locus': {'contig': 'chr20', 'position': 21144633}},
     'includeStart': True,
     'includeEnd': True},
]
parts_str = json.dumps(parts)
vcfs = hl.import_vcfs(gvcfs, parts_str)
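The partition list is plain JSON built by hand; a small helper (hypothetical, not in the original snippet) can generate the same structure from (contig, start, end) tuples:

def make_partitions(intervals):
    # Build the partition JSON consumed by hl.import_vcfs from
    # (contig, start, end) tuples; both endpoints are inclusive,
    # matching the hand-written list above.
    return json.dumps([
        {'start': {'locus': {'contig': contig, 'position': start}},
         'end': {'locus': {'contig': contig, 'position': end}},
         'includeStart': True,
         'includeEnd': True}
        for contig, start, end in intervals
    ])

# e.g. make_partitions([('chr20', 17821257, 18708366)]) reproduces the first
# partition above.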
Example #21
 def handler(signum, frame):
     global _timeout_state
     _timeout_state = True
     hl.stop()
     hl.init(**_init_args)
     raise BenchmarkTimeoutError()
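For context, a minimal sketch (assumed wiring, not from the original harness) of registering that handler so a run is aborted and Hail re-initialized after a deadline:

import signal

def run_with_timeout(fn, timeout_seconds):
    # Hypothetical wrapper: arm SIGALRM so `handler` fires (re-initializing
    # Hail and raising BenchmarkTimeoutError) if fn runs past the deadline.
    signal.signal(signal.SIGALRM, handler)
    signal.alarm(timeout_seconds)
    try:
        return fn()
    finally:
        signal.alarm(0)  # always clear any pending alarm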
Example #22
def main():

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={
                          "01": "1",
                          "02": "2",
                          "03": "3",
                          "04": "4",
                          "05": "5",
                          "06": "6",
                          "07": "7",
                          "08": "8",
                          "09": "9"
                      },
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={
                                          'f0': hl.tstr
                                      }).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Example #23
 def test_init_hail_context_twice(self):
     hl.init(hl.spark_context(), idempotent=True) # Should be no error
Example #24
# coding: utf-8
import hail as hl
import hail.expr.aggregators as agg
import numpy as np
import matplotlib.pyplot as plt
from math import log, isnan
from pprint import pprint
import time
hl.init()  # Initialize Hail and Spark.

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# ## key step
# ### 1. extract pca info, transform it to dataframe
# ### 2. build linear regression model, predict y and get y residuals
# ### 3. store y residuals in hail MatrixTable
# ### 4. run gwas and compare time

# Import a PLINK dataset (BED, BIM, FAM) as a MatrixTable
vds = hl.import_plink('gs://ukb_testdata/maf_0.01_10.bed',
                      'gs://ukb_testdata/maf_0.01_10.bim',
                      'gs://ukb_testdata/maf_0.01_10.fam')

# Import delimited text file (text table) as Table
# import phenotype
table = (hl.import_table('gs://ukb_testdata/sleep_duration.tsv',
                         delimiter='\t',
Example #25
def main(args):
    hl.init(log="/variant_qc_evaluation.log")

    if args.create_bin_ht:
        create_bin_ht(
            args.model_id,
            args.n_bins,
        ).write(
            get_score_bins(args.model_id, aggregated=False).path,
            overwrite=args.overwrite,
        )

    if args.run_sanity_checks:
        ht = get_score_bins(args.model_id, aggregated=False).ht()
        logger.info("Running sanity checks...")
        print(
            ht.aggregate(
                hl.struct(
                    was_biallelic=hl.agg.counter(~ht.was_split),
                    has_biallelic_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_bin)),
                    was_singleton=hl.agg.counter(ht.singleton),
                    has_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.singleton_bin)),
                    was_biallelic_singleton=hl.agg.counter(ht.singleton
                                                           & ~ht.was_split),
                    has_biallelic_singleton_rank=hl.agg.counter(
                        hl.is_defined(ht.biallelic_singleton_bin)),
                )))

    if args.create_aggregated_bin_ht:
        logger.warning(
            "Use only workers, it typically crashes with preemptibles")
        create_aggregated_bin_ht(args.model_id).write(
            get_score_bins(args.model_id, aggregated=True).path,
            overwrite=args.overwrite,
        )

    if args.extract_truth_samples:
        logger.info(f"Extracting truth samples from MT...")
        mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True,
                              remove_hard_filtered_samples=False)

        mt = mt.filter_cols(
            hl.literal([v["s"]
                        for k, v in TRUTH_SAMPLES.items()]).contains(mt.s))
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

        # Checkpoint to prevent needing to go through the large table a second time
        mt = mt.checkpoint(
            get_checkpoint_path("truth_samples", mt=True),
            overwrite=args.overwrite,
        )

        for truth_sample in TRUTH_SAMPLES:
            truth_sample_mt = mt.filter_cols(
                mt.s == TRUTH_SAMPLES[truth_sample]["s"])
            # Filter to variants in truth data
            truth_sample_mt = truth_sample_mt.filter_rows(
                hl.agg.any(truth_sample_mt.GT.is_non_ref()))
            truth_sample_mt.naive_coalesce(args.n_partitions).write(
                get_callset_truth_data(truth_sample).path,
                overwrite=args.overwrite,
            )

    if args.merge_with_truth_data:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating a merged table with callset truth sample and truth data for {truth_sample}..."
            )

            # Load truth data
            mt = get_callset_truth_data(truth_sample).mt()
            truth_hc_intervals = TRUTH_SAMPLES[truth_sample][
                "hc_intervals"].ht()
            truth_mt = TRUTH_SAMPLES[truth_sample]["truth_mt"].mt()
            truth_mt = truth_mt.key_cols_by(
                s=hl.str(TRUTH_SAMPLES[truth_sample]["s"]))

            # Remove low quality sites
            info_ht = get_info(split=True).ht()
            mt = mt.filter_rows(~info_ht[mt.row_key].AS_lowqual)

            ht = create_truth_sample_ht(mt, truth_mt, truth_hc_intervals)
            ht.write(
                get_callset_truth_data(truth_sample, mt=False).path,
                overwrite=args.overwrite,
            )

    if args.bin_truth_sample_concordance:
        for truth_sample in TRUTH_SAMPLES:
            logger.info(
                f"Creating binned concordance table for {truth_sample} for model {args.model_id}"
            )
            ht = get_callset_truth_data(truth_sample, mt=False).ht()

            info_ht = get_info(split=True).ht()
            ht = ht.filter(
                ~info_ht[ht.key].AS_lowqual
                & ~hl.is_defined(telomeres_and_centromeres.ht()[ht.locus]))

            logger.info("Filtering out low confidence regions and segdups...")
            ht = filter_low_conf_regions(
                ht,
                filter_lcr=True,
                # TODO: Uncomment when we have decoy path
                filter_decoy=False,  # True,
                filter_segdup=True,
            )

            logger.info(
                "Loading HT containing RF or VQSR scores annotated with a bin based on the rank of score..."
            )
            metric_ht = get_score_bins(args.model_id, aggregated=False).ht()
            ht = ht.filter(hl.is_defined(metric_ht[ht.key]))

            ht = ht.annotate(score=metric_ht[ht.key].score)

            ht = compute_binned_truth_sample_concordance(
                ht, metric_ht, args.n_bins)
            ht.write(
                get_binned_concordance(args.model_id, truth_sample).path,
                overwrite=args.overwrite,
            )
Example #26
    get_expr_for_variant_loftee_flag_flag,
    get_expr_for_genes_with_loftee_flag_flag,
    get_expr_for_ref_allele,
    get_expr_for_start_pos,
    get_expr_for_variant_id,
    get_expr_for_vep_sorted_transcript_consequences_array,
    get_expr_for_xpos,
)

p = argparse.ArgumentParser()
p.add_argument("--input-url", help="URL of gnomAD 2.1 flattened Hail table to export", required=True)
p.add_argument("--output-url", help="URL to write shaped Hail table to", required=True)
p.add_argument("--subset", help="Filter variants to this chrom:start-end range")
args = p.parse_args()

hl.init(log="/tmp/hail.log")

ds = hl.read_table(args.input_url)

# The globals in the flattened Hail table cause a serialization error during export to ES.
ds = ds.select_globals()

if args.subset:
    subset_interval = hl.parse_locus_interval(args.subset)
    ds = ds.filter(subset_interval.contains(ds.locus))

####################
# Top level fields #
####################

# These fields remain at the top level
Example #27
def main(args):
    hl.init(log='/frequency_data_generation.log', default_reference='GRCh38')

    logger.info("Reading sparse MT and metadata table...")
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    meta_ht = meta.ht().select('pop', 'sex', 'project_id', 'release', 'sample_filters')

    if args.test:
        logger.info("Filtering to chr20:1-1000000")
        mt = hl.filter_intervals(mt, [hl.parse_locus_interval('chr20:1-1000000')])

    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    logger.info("Annotating sparse MT with metadata...")
    mt = mt.annotate_cols(meta=meta_ht[mt.s])
    mt = mt.filter_cols(mt.meta.release)
    samples = mt.count_cols()
    logger.info(f"Running frequency table prep and generation pipeline on {samples} samples")

    logger.info("Computing adj and sex adjusted genotypes.")
    mt = mt.annotate_entries(
        GT=adjusted_sex_ploidy_expr(mt.locus, mt.GT, mt.meta.sex),
        adj=get_adj_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Densify-ing...")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) > 1)

    logger.info("Setting het genotypes at sites with >1% AF (using v3.0 frequencies) and > 0.9 AB to homalt...")
    # hotfix for depletion of homozygous alternate genotypes
    # Using v3.0 AF to avoid an extra frequency calculation
    # TODO: Using previous callset AF works for small incremental changes to a callset, but we need to revisit for large increments
    freq_ht = freq.versions["3"].ht()
    freq_ht = freq_ht.select(AF=freq_ht.freq[0].AF)

    mt = mt.annotate_entries(
        GT=hl.cond(
            (freq_ht[mt.row_key].AF > 0.01)
            & mt.GT.is_het()
            & (mt.AD[1] / mt.DP > 0.9),
            hl.call(1, 1),
            mt.GT,
        )
    )

    logger.info("Calculating InbreedingCoefficient...")
    # NOTE: This is not the ideal location to calculate this, but added here to avoid another densify
    mt = mt.annotate_rows(InbreedingCoeff=bi_allelic_site_inbreeding_expr(mt.GT))

    logger.info("Generating frequency data...")
    mt = annotate_freq(
        mt,
        sex_expr=mt.meta.sex,
        pop_expr=mt.meta.pop
    )

    # Select freq, FAF and popmax
    faf, faf_meta = faf_expr(mt.freq, mt.freq_meta, mt.locus, POPS_TO_REMOVE_FOR_POPMAX)
    mt = mt.select_rows(
        'InbreedingCoeff',
        'freq',
        faf=faf,
        popmax=pop_max_expr(mt.freq, mt.freq_meta, POPS_TO_REMOVE_FOR_POPMAX)
    )
    mt = mt.annotate_globals(faf_meta=faf_meta)

    # Annotate quality metrics histograms, as these also require densifying
    mt = mt.annotate_rows(
        **qual_hist_expr(mt.GT, mt.GQ, mt.DP, mt.AD)
    )

    logger.info("Writing out frequency data...")
    if args.test:
        mt.rows().write("gs://gnomad-tmp/gnomad_freq/chr20_1_1000000_freq.ht", overwrite=True)
    else:
        mt.rows().write(freq.path, overwrite=args.overwrite)
Example #28
#conda activate hail
#cd /Users/mzekavat/opt/anaconda3/envs/hail
#hailctl dataproc start mz02 --master-machine-type n1-highmem-16 --worker-machine-type n1-highmem-16 --worker-boot-disk-size 200 --num-workers 3 --num-preemptible-workers 3 --master-boot-disk-size 100 --region us-east1 --zone us-east1-d --requester-pays-allow-all --properties "spark:spark.driver.memory=90G,spark:spark.driver.maxResultSize=50G,spark:spark.kryoserializer.buffer.max=1G,spark:spark.task.maxFailures=20,spark:spark.driver.extraJavaOptions=-Xss4M,spark:spark.executor.extraJavaOptions=-Xss4M,spark:spark.speculation=true"
#hailctl dataproc connect mz02 notebook --zone us-east1-d --region us-east1
#hailctl dataproc submit --zone us-east1-d --region us-east1 mz02 ~/Documents/Broad_2015_17/Python_Scripts_Hail/CHIP/Merge_SomaticVCFS_15000_30000.py
import hail as hl
import hail.expr.aggregators as agg
hl.init(default_reference="GRCh38")
import numpy as np
import pandas as pd
from collections import Counter
from math import log, isnan
from pprint import pprint
import time
from bokeh.io import show, output_notebook
from bokeh.layouts import gridplot
output_notebook()

recoding_dict = {f"{i + 1}": f"chr{i + 1}" for i in range(22)}
recoding_dict['X'] = 'chrX'
recoding_dict['Y'] = 'chrY'

files = hl.import_table('gs://maryam_lipids/UKBB_CHIP/filenames.txt',
                        impute=True,
                        no_header=True)
files_list = [row['f0'] for row in files.select(files.f0).collect()]

for num in range(1, 10000):
    print(num)
    filenamev2 = files_list[num].strip()
    mt = hl.import_vcf(filenamev2,
Example #29
def main(args):
    hl.init(log="/variant_qc_random_forest.log")

    if args.list_rf_runs:
        logger.info(f"RF runs:")
        pretty_print_runs(get_rf_runs(rf_run_path()))

    if args.annotate_for_rf:
        ht = create_rf_ht(
            impute_features=args.impute_features,
            adj=args.adj,
            n_partitions=args.n_partitions,
            checkpoint_path=get_checkpoint_path("rf_annotation"),
        )
        ht.write(
            get_rf_annotations(args.adj).path, overwrite=args.overwrite,
        )
        logger.info(f"Completed annotation wrangling for random forests model training")

    if args.train_rf:
        model_id = f"rf_{str(uuid.uuid4())[:8]}"
        rf_runs = get_rf_runs(rf_run_path())
        while model_id in rf_runs:
            model_id = f"rf_{str(uuid.uuid4())[:8]}"

        ht, rf_model = train_rf(
            get_rf_annotations(args.adj).ht(),
            fp_to_tp=args.fp_to_tp,
            num_trees=args.num_trees,
            max_depth=args.max_depth,
            no_transmitted_singletons=args.no_transmitted_singletons,
            no_inbreeding_coeff=args.no_inbreeding_coeff,
            vqsr_training=args.vqsr_training,
            vqsr_model_id=args.vqsr_model_id,
            filter_centromere_telomere=args.filter_centromere_telomere,
            test_intervals=args.test_intervals,
        )

        ht = ht.checkpoint(
            get_rf_training(model_id=model_id).path, overwrite=args.overwrite,
        )

        logger.info("Adding run to RF run list")
        rf_runs[model_id] = get_run_data(
            input_args={
                "transmitted_singletons": None
                if args.vqsr_training
                else not args.no_transmitted_singletons,
                "adj": args.adj,
                "vqsr_training": args.vqsr_training,
                "filter_centromere_telomere": args.filter_centromere_telomere,
            },
            test_intervals=args.test_intervals,
            features_importance=hl.eval(ht.features_importance),
            test_results=hl.eval(ht.test_results),
        )

        with hl.hadoop_open(rf_run_path(), "w") as f:
            json.dump(rf_runs, f)

        logger.info("Saving RF model")
        save_model(
            rf_model, get_rf_model_path(model_id=model_id), overwrite=args.overwrite,
        )

    else:
        model_id = args.model_id

    if args.apply_rf:
        logger.info(f"Applying RF model {model_id}...")
        rf_model = load_model(get_rf_model_path(model_id=model_id))
        ht = get_rf_training(model_id=model_id).ht()
        features = hl.eval(ht.features)
        ht = apply_rf_model(ht, rf_model, features, label=LABEL_COL)

        logger.info("Finished applying RF model")
        ht = ht.annotate_globals(rf_model_id=model_id)
        ht = ht.checkpoint(
            get_rf_result(model_id=model_id).path, overwrite=args.overwrite,
        )

        ht_summary = ht.group_by(
            "tp", "fp", TRAIN_COL, LABEL_COL, PREDICTION_COL
        ).aggregate(n=hl.agg.count())
        ht_summary.show(n=20)
Example #30
def main(args):
    data_type = "exomes" if args.exomes else "genomes"
    hl.init(log=f"/ccdg_sample_qc_{data_type}.log")
    # gcloud compute scp wlu-m:/hard_filter_genomes.log .
    if args.sample_qc:
        compute_sample_qc(data_type).write(
            get_ccdg_results_path(data_type=data_type, result="sample_qc_all"),
            overwrite=args.overwrite,
        )

    if args.impute_sex:
        compute_sex(data_type).write(
            get_ccdg_results_path(data_type=data_type, result="sex"),
            overwrite=args.overwrite,
        )
    # elif args.reannotate_sex:
    #     reannotate_sex(
    #         args.min_cov,
    #         (args.upper_x, (args.lower_xx, args.upper_xx), args.lower_xxx),
    #         ((args.lower_y, args.upper_y), args.lower_yy),
    #     ).write(
    #         get_ccdg_results_path(data_type=data_type, result="sex"),
    #         overwrite=args.overwrite,
    #     )
    ##### Wait for more information
    # if args.compute_hard_filters:
    #     compute_hard_filters(args.min_cov).write(
    #         hard_filtered_samples.path, overwrite=args.overwrite
    #     )

    if args.run_pc_relate or args.reannotate_relatedness:
        if args.run_pc_relate:
            logger.warning(
                "PC-relate requires SSDs and doesn't work with preemptible workers!"
            )
            relatedness_ht = compute_relatedness(
                data_type,
                overwrite=args.overwrite,
            )
        else:
            relatedness_ht = hl.read_table(
                get_ccdg_results_path(data_type=data_type, result="relatedness")
            ).checkpoint(
                "gs://ccdg/tmp/relatedness_ht_checkpoint.ht", overwrite=True
            )  # Copy HT to temp location to overwrite annotation
        relatedness_ht = annotate_relatedness(
            relatedness_ht,
            first_degree_kin_thresholds=tuple(args.first_degree_kin_thresholds),
            second_degree_min_kin=args.second_degree_kin_cutoff,
            ibd0_0_max=args.ibd0_0_max,
        )
        relatedness_ht.write(
            get_ccdg_results_path(data_type=data_type, result="relatedness"),
            overwrite=args.overwrite,
        )

    if args.compute_related_samples_to_drop:
        relatedness_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="relatedness")
        )
        related_samples_to_remove = hl.maximal_independent_set(
            relatedness_ht.i, relatedness_ht.j, False
        ).checkpoint(
            get_ccdg_results_path(data_type=data_type, result="related_samples"),
            overwrite=args.overwrite,
        )

    if args.update_variant_filtered_pca_mt:
        pca_var_ht = hl.read_table(get_pca_variants_path())
        mt = hl.vds.to_dense_mt(get_qc_vds(data_type, split=True))
        mt = mt.filter_rows(hl.is_defined(pca_var_ht[mt.row_key])).checkpoint(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True),
            overwrite=args.overwrite,
            _read_if_exists=(not args.overwrite),
        )

    if args.run_pc_project:
        ## TODO: Rank samples and hard filter samples
        mt = hl.read_matrix_table(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True)
        )

        pca_loadings = hl.read_table(path_to_gnomad_loadings)

        pca_ht = hl.experimental.pc_project(
            mt.GT,
            pca_loadings.loadings,
            pca_loadings.pca_af,
        )

        pca_ht.checkpoint(
            get_ccdg_results_path(
                data_type=data_type, result="gnomad_pc_project_scores"
            ),
            overwrite=args.overwrite,
        )

        # related_ht = hl.read_table(
        #     get_ccdg_results_path(data_type=data_type, result="related_samples")
        # )
        #
        # related_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=True)
        # pca_mt = mt.filter_cols(hl.is_defined(related_mt[mt.col_key]), keep=False)

        # pca_ht = hl.experimental.pc_project(
        #     pca_mt.GT, pca_loadings.loadings, pca_loadings.pca_af
        # )
        # pca_mt = pca_mt.annotate_cols(scores=pca_ht[pca_mt.col_key].scores)
        #
        # related_ht = hl.experimental.pc_project(
        #     related_mt.GT, pca_loadings.loadings, pca_loadings.pca_af
        # )
        # related_mt = related_mt.annotate_cols(
        #     scores=related_ht[related_mt.col_key].scores
        # )

    if args.assign_pops:
        with hl.hadoop_open(
            path_to_gnomad_rf,
            "rb",
        ) as f:
            fit = pickle.load(f)

        # Reduce the scores to only those used in the RF model, this was 6 for v2 and 16 for v3.1
        n_pcs = fit.n_features_
        pca_ht = hl.read_table(
            get_ccdg_results_path(
                data_type=data_type, result="gnomad_pc_project_scores"
            )
        )
        pca_ht = pca_ht.annotate(scores=pca_ht.scores[:n_pcs])
        pop_ht, rf_model = assign_population_pcs(
            pca_ht,
            pc_cols=pca_ht.scores,
            fit=fit,
        )

        pop_ht = pop_ht.checkpoint(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
            overwrite=args.overwrite,
            _read_if_exists=not args.overwrite,
        )
        pop_ht.transmute(
            **{f"PC{i + 1}": pop_ht.pca_scores[i] for i in range(n_pcs)}
        ).export(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment")[:-2]
            + "tsv"
        )

        with hl.hadoop_open(
            get_ccdg_results_path(data_type=data_type, result="pop_RF_fit")[:-2]
            + "pickle",
            "wb",
        ) as out:
            pickle.dump(rf_model, out)

    if args.calculate_inbreeding:
        qc_mt = hl.read_matrix_table(
            get_pca_variants_path(ld_pruned=True, data=f"ccdg_{data_type}", mt=True)
        )
        pop_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
        )
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop)
        qc_mt = qc_mt.annotate_rows(
            call_stats_by_pop=hl.agg.group_by(
                qc_mt.pop, hl.agg.call_stats(qc_mt.GT, qc_mt.alleles)
            )
        )
        inbreeding_ht = (
            qc_mt.annotate_cols(
                inbreeding=hl.agg.inbreeding(
                    qc_mt.GT, qc_mt.call_stats_by_pop[qc_mt.pop].AF[1]
                )
            )
            .cols()
            .select("inbreeding")
        )
        inbreeding_ht.write(
            get_ccdg_results_path(data_type=data_type, result="inbreeding"),
            overwrite=args.overwrite,
        )

    if args.apply_stratified_filters or args.apply_regressed_filters:
        filtering_qc_metrics = args.filtering_qc_metrics.split(",")
        sample_qc_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="sample_qc_bi_allelic")
        )
        pc_scores = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pc_scores")
        )
        sample_qc_ht = sample_qc_ht.select(
            scores=pc_scores[sample_qc_ht.key]["scores"],
        )
        pop_ht = hl.read_table(
            get_ccdg_results_path(data_type=data_type, result="pop_assignment"),
        )

        if "inbreeding" in filtering_qc_metrics:
            inbreeding_ht = hl.read_table(
                get_ccdg_results_path(data_type=data_type, result="inbreeding")
            )[sample_qc_ht.key]
            sample_qc_ht = sample_qc_ht.annotate(
                inbreeding=inbreeding_ht.inbreeding.f_stat
            )

        if args.apply_regressed_filters:
            n_pcs = args.regress_n_pcs
            residuals_ht = compute_qc_metrics_residuals(
                ht=sample_qc_ht,
                pc_scores=sample_qc_ht.scores[:n_pcs],
                qc_metrics={
                    metric: sample_qc_ht[metric] for metric in filtering_qc_metrics
                },
            )
            residuals_ht = residuals_ht.filter(
                hl.is_missing(hard_filtered_samples.ht()[residuals_ht.key])
            )
            stratified_metrics_ht = compute_stratified_metrics_filter(
                ht=residuals_ht,
                qc_metrics=dict(residuals_ht.row_value),
                metric_threshold={
                    "n_singleton_residual": (math.inf, 8.0),
                    "r_het_hom_var_residual": (math.inf, 4.0),
                },
            )

            residuals_ht = residuals_ht.annotate(
                **stratified_metrics_ht[residuals_ht.key]
            )
            residuals_ht = residuals_ht.annotate_globals(
                **stratified_metrics_ht.index_globals(),
                n_pcs=n_pcs,
            )
        else:
            logger.info(
                "Computing stratified QC metrics filters using metrics: "
                + ", ".join(filtering_qc_metrics)
            )
            sample_qc_ht = sample_qc_ht.annotate(qc_pop=pop_ht[sample_qc_ht.key].pop)
            # TODO: compute hard-filtered samples
            sample_qc_ht = sample_qc_ht.filter(
                hl.is_missing(hard_filtered_samples.ht()[sample_qc_ht.key])
            )
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht,
                qc_metrics={
                    metric: sample_qc_ht[metric] for metric in filtering_qc_metrics
                },
                strata={"qc_pop": sample_qc_ht.qc_pop},
                metric_threshold={"n_singleton": (4.0, 8.0)},
            )
Beispiel #31
0
def tabix(b, ss_path, out_dir):
    r'''
    Runs tabix on a bgzipped file at gcloud path `ss_path` using Batch `b`
    '''
    fname = ss_path.split('/')[-1]
    f = b.read_input(ss_path)
    j = b.new_job(name=fname.split('.')[0])
    # -c chr: treat lines beginning with "chr" (i.e. the header) as comments
    j.command(f'tabix -s 1 -b 2 -e 2 -c chr {f}')
    j.command(f'mv {f}.tbi {j.ofile}')
    b.write_output(j.ofile, f'{out_dir}/{fname}.tbi')


if __name__ == "__main__":
    hl.init(log='/Users/nbaya/Downloads/tabix_sumstats.log')
    backend = hb.ServiceBackend(billing_project='ukb_diverse_pops',
                                bucket='ukbb-diverse-temp-30day/nb-batch-tmp')

    b = hb.Batch(
        name='tabix',
        backend=backend,
        default_image='gcr.io/ukbb-diversepops-neale/nbaya_tabix:latest',
        default_storage='100M',  # works with 2G
        default_cpu=1)

    #    sumstats_dir = f'{bucket}/sumstats_flat_files'
    #    sumstats_dir = f'{ldprune_dir}/export_results/update'
    #    sumstats_dir = f'{ldprune_dir}/loo/sumstats/batch1'
    sumstats_dir = f'{ldprune_dir}/variant_qc'
    print(f'\nUsing sumstats from {sumstats_dir}')
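    # A minimal sketch of how the jobs might be submitted (not part of the
    # original snippet): assume the summary-statistics files are bgzipped
    # `.bgz` files listed with hl.hadoop_ls, and that indexes are written back
    # next to the inputs (the output directory choice is an assumption).
    out_dir = sumstats_dir
    for entry in hl.hadoop_ls(sumstats_dir):
        if entry['path'].endswith('.bgz'):
            tabix(b, entry['path'], out_dir)

    b.run()
    backend.close()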
Beispiel #32
0
from hail.expr.expressions import *
from hail.typecheck import *
from hail import Table
import hail

from google.cloud import storage
client = storage.Client()
import gcsfs
fs = gcsfs.GCSFileSystem(project='your-project')
bucket = client.get_bucket('your-bucket')

import hail as hl
import hail.expr.aggregators as agg
hl.init()

#read mt file
mt = hl.read_matrix_table(
    "gs://1k_genome/1000-genomes/VDS-of-all/ALL.chr.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.mt"
)
#print(mt.count()) (39706715, 1092)

#filter MAF
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
#print(mt.count()) (13404583, 1092)

#filter only SNPs
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
#print(mt.count()) (12194564, 1092)
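# Note (not part of the original snippet): the AF[1] > 0.01 filter above keeps
# variants whose ALT allele is common, but a strict MAF filter would threshold
# the minor allele frequency instead. A sketch, equivalent only for biallelic
# variants:
mt_maf = mt.filter_rows(hl.min(mt.variant_qc.AF) > 0.01)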
Beispiel #33
0
def main():
    parser = argparse.ArgumentParser()
    # reference args
    parser.add_argument(
        '--ref-dirname',
        default=
        'gs://hgdp-1kg/hgdp_tgp/datasets_for_others/lindo/ds_without_outliers/'
    )
    parser.add_argument('--ref-basename', default='unrelated')
    parser.add_argument(
        '--ref-info',
        default=
        'gs://hgdp-1kg/hgdp_tgp/gwaspy_pca_ref/hgdp_1kg_sample_info.unrelateds.pca_outliers_removed.with_project.tsv'
    )
    parser.add_argument('--reference', type=str, default='GRCh38')
    parser.add_argument('--pca-type',
                        type=str,
                        default='normal',
                        choices=['normal', 'project', 'joint'])

    # data args
    parser.add_argument('--data-dirname', type=str, required=True)
    parser.add_argument('--data-basename', type=str, required=True)
    parser.add_argument('--input-type',
                        type=str,
                        required=True,
                        choices=['vcf', 'plink', 'hail'])

    # filter args
    parser.add_argument('--maf',
                        type=float,
                        default=0.05,
                        help='include only SNPs with MAF >= NUM in PCA')
    parser.add_argument('--hwe',
                        type=float,
                        default=1e-3,
                        help='include only SNPs with HWE >= NUM in PCA')
    parser.add_argument('--geno',
                        type=float,
                        default=0.98,
                        help='include only SNPs with call-rate > NUM')
    # argparse `choices` cannot express a float range, so the bound is documented in the help text
    parser.add_argument(
        '--ld-cor',
        type=float,
        default=0.2,
        metavar="[0.0-1.0]",
        help=
        'Squared correlation threshold (exclusive upper bound). Must be in the range [0.0, 1.0]'
    )
    parser.add_argument(
        '--ld-window',
        type=int,
        default=250000,
        help='Window size in base pairs (inclusive upper bound)')
    parser.add_argument('--npcs',
                        type=int,
                        default=20,
                        help='Number of PCs to use')
    parser.add_argument('--relatedness-method',
                        type=str,
                        default='pc_relate',
                        choices=['pc_relate', 'ibd', 'king'],
                        help='Method to use for the inference of relatedness')
    parser.add_argument('--relatedness-thresh',
                        type=float,
                        default=0.98,
                        help='Threshold value to use in relatedness checks')
    parser.add_argument(
        '--prob',
        type=float,
        default=0.8,
        help=
        'Minimum probability of belonging to a given population for the population to be set'
    )
    parser.add_argument('--out-dir', type=str, required=True)

    args = parser.parse_args()

    if not args.prob:
        print(f'No prob value specified, {args.prob} will be used')

    hl.init(default_reference=args.reference)

    pca(ref_dirname=args.ref_dirname,
        ref_basename=args.ref_basename,
        ref_info=args.ref_info,
        reference=args.reference,
        pca_type=args.pca_type,
        input_type=args.input_type,
        data_dirname=args.data_dirname,
        data_basename=args.data_basename,
        maf=args.maf,
        hwe=args.hwe,
        call_rate=args.geno,
        ld_cor=args.ld_cor,
        ld_window=args.ld_window,
        n_pcs=args.npcs,
        relatedness_method=args.relatedness_method,
        relatedness_thresh=args.relatedness_thresh,
        prob_threshold=args.prob,
        out_dir=args.out_dir)

    print('\nDone running PCA')
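# A usage sketch (not in the original snippet): guard the entry point so the
# script can be run directly, e.g.
#   python pca.py --data-dirname gs://my-bucket/genotypes/ --data-basename my_cohort \
#       --input-type vcf --out-dir gs://my-bucket/pca_output/
# (the script name and paths above are hypothetical)
if __name__ == '__main__':
    main()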
Beispiel #34
0
    )
    ht = ht.annotate(
        validated_denovo_inheritance=ht_val_filtered[ht.key].inheritance)

    ht.write(
        f'{lustre_dir}/variant_qc/models/{run_hash}_rf_result_FINAL_for_RANKING_100_trios.ht',
        overwrite=True)


if __name__ == "__main__":
    # need to create the Spark cluster first before initialising Hail
    sc = pyspark.SparkContext()
    # Define the hail persistent storage directory

    hl.init(sc=sc,
            tmp_dir=lustre_dir,
            local_tmpdir=lustre_dir,
            default_reference="GRCh38")

    # S3 credentials are required for the user to access the datasets in the farm flexible compute S3 environment
    # you may use your own here, taken from the .s3cfg file in your home directory
    hadoop_config = sc._jsc.hadoopConfiguration()

    hadoop_config.set("fs.s3a.access.key", credentials["mer"]["access_key"])
    hadoop_config.set("fs.s3a.secret.key", credentials["mer"]["secret_key"])

    ################################

    #################################

    main()
Beispiel #35
0
import hail as hl
import sys
import timeit

start = timeit.default_timer()

chrom = str(sys.argv[1])

hl.init(log='/hail.log', min_block_size=2048, default_reference='GRCh38')

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# define files
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# input
vds_ldpruned_common_file = 'gs://ccdg-qc-multi/vds/qced/' + chrom + '/ldpruned_common.vds'
vds_1kg_file = 'gs://ccdg-qc-multi/data/1000genomes/vds/hail2_ALL.GRCh38.genotypes.20170504.vds'
mhc_chr8inv_file = 'gs://ccdg-qc-multi/data/MHC_invchr8_longLDreg_liftover_to_GRCh38.txt'
rel_exclusion_file = 'gs://ccdg-qc-multi/out/king/' + chrom + '/ibd_greater_0884_' + chrom + '.txt'
samples_to_keep_file = 'gs://ccdg-qc-multi/qc_measures/' + chrom + '/01_sample_qc_keep.txt'

# output
pca_value_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_values.tsv'
pca_score_file = 'gs://ccdg-qc-multi/qc_measures/pca/' + chrom + '/pca_scores.tsv'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# read data
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

## interval list
#mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True).key_by('f0')
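# A minimal sketch of how the excluded regions might be applied (not part of the
# original snippet): assume the file holds one GRCh38 interval string per line
# in column f0, and that the .vds path is readable as a Hail 0.2 MatrixTable.
mhc_chr8inv = hl.import_table(mhc_chr8inv_file, no_header=True)
exclude_intervals = mhc_chr8inv.aggregate(
    hl.agg.collect(hl.parse_locus_interval(mhc_chr8inv.f0, reference_genome='GRCh38')))
mt_common = hl.read_matrix_table(vds_ldpruned_common_file)
mt_common = hl.filter_intervals(mt_common, exclude_intervals, keep=False)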
Beispiel #36
0
    sys.exit(1)
else:
    filter_constraint = True

if not os.path.isfile(args.vep_config):
    logger.error(f"Could not find vep config file {args.vep_config}")
    sys.exit(1)

# Prepare output path
if not os.path.exists(os.path.abspath(os.path.dirname(args.output))):
    os.makedirs(os.path.abspath(os.path.dirname(args.output)))

# Set hail temporary path
hl.init(
    idempotent=True,
    tmp_dir=args.tmp_dir,
    log=os.path.join(args.tmp_dir, 'hail.log'),
)

##
# Main script
#
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf,
                     call_fields=['GT'],
                     skip_invalid_loci=True,
                     force_bgz=True)
data = hl.split_multi_hts(data)
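# A possible next step, not part of the original snippet: build per-trio columns
# from the pedigree so proband and parental genotypes can be compared
# (hl.trio_matrix is standard Hail; its use at this point is an assumption).
trio_mt = hl.trio_matrix(data, pedigree, complete_trios=True)
logger.info(f"Found {trio_mt.count_cols()} complete trios")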
Beispiel #37
0
"""

Annotate a variant Hail Table with allele frequencies from different (external)
sources (e.g., gnomAD exomes and genomes).

"""

import hail as hl

from utils.data_utils import (get_gnomad_genomes_v3_af_ht, get_bonn_af_ht,
                              get_germ_af_ht, get_rum_af_ht,
                              get_vep_annotation_ht)

from utils.generic import current_date

hl.init(default_reference='GRCh38')

nfs_dir = 'file:///home/ubuntu/data'
nfs_tmp = 'file:///home/ubuntu/data/tmp'
hdfs_dir = 'hdfs://spark-master:9820/dir/hail_data'

## import variant table
variant_ht = get_vep_annotation_ht()

## import af tables
# In-house German allele frequencies (Tuebingen)
ht_ger_af = get_germ_af_ht()

# In-house German allele frequencies (Bonn)
bonn_af = get_bonn_af_ht()
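# A minimal sketch of the annotation join described in the module docstring,
# assuming each AF table is keyed like variant_ht (locus, alleles) and exposes
# an `AF` field; the field names and the output path are assumptions.
variant_ht = variant_ht.annotate(
    ger_af=ht_ger_af[variant_ht.key].AF,
    bonn_af=bonn_af[variant_ht.key].AF,
)
output_path = f'{nfs_dir}/hail_data/variant_ht.af_annotated.{current_date()}.ht'
variant_ht.write(output_path, overwrite=True)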
Beispiel #38
0
def main(args):
    hl.init()
    data_type = "genomes" if args.genomes else "exomes"

    if not args.skip_write_qc_mt:
        logger.info("Importing data...")
        # 1h40 for exomes, 3h20 for genomes
        mt = get_gnomad_data(
            data_type, raw=True, split=False
        )  # NOTE: using full calls since hardcalls doesn't exist at this stage
        logger.info(
            "Filtering to bi-allelic, high-callrate, common SNPs for sample QC..."
        )
        mt = mt.filter_rows((hl.len(mt.alleles) == 2)
                            & hl.is_snp(mt.alleles[0], mt.alleles[1])
                            & (hl.agg.mean(mt.GT.n_alt_alleles()) / 2 > 0.001)
                            & (hl.agg.fraction(hl.is_defined(mt.GT)) > 0.99))
        mt.annotate_cols(callrate=hl.agg.fraction(hl.is_defined(
            mt.GT))).naive_coalesce(5000).write(qc_mt_path(data_type),
                                                overwrite=args.overwrite)
    qc_mt = hl.read_matrix_table(qc_mt_path(data_type))

    logger.info("Importing metadata...")
    meta_ht = hl.import_table(qc_meta_path(data_type),
                              impute=True,
                              types={
                                  'age': hl.tfloat64
                              }).key_by('s')
    qc_mt = qc_mt.annotate_cols(**meta_ht[qc_mt.s])

    logger.info("Inferring sex...")
    qc_ht = annotate_sex(qc_mt,
                         qc_temp_data_prefix(data_type),
                         male_threshold=0.8 if args.genomes else 0.6).cols()
    # Flag Klinefelter individuals and samples with sex aneuploidies
    if args.exomes:
        qc_ht = qc_ht.annotate(
            ambiguous_sex=((qc_ht.f_stat >= 0.5) &
                           (hl.is_defined(qc_ht.normalized_y_coverage) &
                            (qc_ht.normalized_y_coverage <= 0.1))) |
            (hl.is_missing(qc_ht.f_stat)) |
            ((qc_ht.f_stat >= 0.4) & (qc_ht.f_stat <= 0.6) &
             (hl.is_defined(qc_ht.normalized_y_coverage) &
              (qc_ht.normalized_y_coverage > 0.1))),
            sex_aneuploidy=(qc_ht.f_stat < 0.4)
            & hl.is_defined(qc_ht.normalized_y_coverage) &
            (qc_ht.normalized_y_coverage > 0.1))
    else:
        qc_ht = qc_ht.annotate(ambiguous_sex=hl.is_missing(qc_ht.is_female))

    logger.info("Annotating samples failing hard filters...")
    if args.exomes:
        sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when(
            qc_ht.sex_aneuploidy,
            "sex_aneuploidy").when(qc_ht.is_female, "female").default("male"))
    else:
        sex_expr = (hl.case().when(qc_ht.ambiguous_sex, "ambiguous_sex").when(
            qc_ht.is_female, "female").default("male"))
    qc_ht = qc_ht.annotate(
        hard_filters=make_hard_filters_expr(qc_ht, data_type),
        perm_filters=make_perm_filters_expr(qc_ht, data_type),
        sex=sex_expr,
        data_type=data_type).key_by('data_type', 's')
    qc_ht.write(qc_ht_path(data_type, 'hard_filters'),
                overwrite=args.overwrite)

    # Export annotations to make rank list for relatedness (in final sample QC)
    if args.exomes:
        colnames = ['internal', 'project_id', 'pct_bases_20x', 'perm_filters']
    else:
        colnames = ['pcr_free', 'mean_dp', 'perm_filters']
    rank_ht = qc_ht.filter(hl.len(qc_ht.hard_filters) == 0,
                           keep=True).select(*colnames)
    (rank_ht.annotate(releasable=(
        hl.len(rank_ht.perm_filters) == 0)).drop('perm_filters').export(
            rank_annotations_path(data_type)))

    # Check numbers:
    qc_ht = hl.read_table(qc_ht_path(data_type, 'hard_filters'))
    sample_count = qc_ht.count()
    checkpoint1a = qc_ht.aggregate(
        hl.agg.count_where(hl.len(qc_ht['hard_filters']) == 0))
    checkpoint1b = qc_ht.aggregate(
        hl.agg.count_where((hl.len(qc_ht['hard_filters']) == 0)
                           & (hl.len(qc_ht.perm_filters) == 0)))
    logger.info('{} samples found before filtering'.format(sample_count))
    logger.info('{} samples found after checkpoint 1a (hard filters)'.format(
        checkpoint1a))
    logger.info(
        '{} samples found after checkpoint 1b (hard filters + permissions)'.
        format(checkpoint1b))