Example #1
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

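    # Downsample variants to speed up PCA; the top PCs serve as ancestry covariates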
    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

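    # Per-variant linear regression of the phenotype on alternate-allele count,
    # with an intercept and the first three PCs as covariates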
    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Example #2
def export_qced_file(mt: hl.MatrixTable,
                     out_dir: str,
                     basename: str,
                     export_type='hail'):
    outname = basename + '_qced'

    if export_type == 'hail':
        mt.write('{}GWASpy/Preimp_QC/{}.mt'.format(out_dir, outname),
                 overwrite=True)

    elif export_type == 'plink':
        hl.export_plink(dataset=mt,
                        output='{}GWASpy/Preimp_QC/{}'.format(
                            out_dir, outname),
                        fam_id=mt.fam_id,
                        ind_id=mt.s,
                        pat_id=mt.pat_id,
                        mat_id=mt.mat_id,
                        is_female=mt.is_female,
                        pheno=mt.is_case,
                        varid=mt.rsid)

    else:
        hl.export_vcf(mt,
                      '{}GWASpy/Preimp_QC/{}.vcf.bgz'.format(out_dir, outname))
Example #3
    def test_import_plink_a1_major(self):
        mt = get_dataset()
        bfile = '/tmp/sample_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s)

        def get_data(a2_reference):
            mt_imported = hl.import_plink(bfile + '.bed',
                                          bfile + '.bim',
                                          bfile + '.fam',
                                          a2_reference=a2_reference)
            return (hl.variant_qc(mt_imported).rows().key_by('rsid'))

        a2 = get_data(a2_reference=True)
        a1 = get_data(a2_reference=False)

        j = (a2.annotate(a1_alleles=a1[a2.rsid].alleles,
                         a1_vqc=a1[a2.rsid].variant_qc)
             .rename({'variant_qc': 'a2_vqc', 'alleles': 'a2_alleles'}))

        self.assertTrue(
            j.all((j.a1_alleles[0] == j.a2_alleles[1])
                  & (j.a1_alleles[1] == j.a2_alleles[0])
                  & (j.a1_vqc.n_not_called == j.a2_vqc.n_not_called)
                  & (j.a1_vqc.n_het == j.a2_vqc.n_het)
                  & (j.a1_vqc.homozygote_count[0] ==
                     j.a2_vqc.homozygote_count[1])
                  & (j.a1_vqc.homozygote_count[1] ==
                     j.a2_vqc.homozygote_count[0])))
Example #4
    def test_import_plink_empty_bim(self):
        mt = get_dataset().drop_rows()
        bfile = '/tmp/test_empty_bim'
        hl.export_plink(mt, bfile, ind_id=mt.s)
        with self.assertRaisesRegex(FatalError,
                                    ".bim file does not contain any variants"):
            hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam')
Example #5
def to_plink(pops: list,
             subsets_dir,
             mt,
             ht_sample,
             bfile_path,
             export_varid: bool = True,
             overwrite=False):
    r'''
    Exports matrix table to PLINK2 files.
    NOTE: These files will need to be split up by chromosome before
    plink_clump.py can be run.
    '''
    assert 'GT' in mt.entry, "mt must have 'GT' as an entry field"
    assert mt.GT.dtype == hl.tcall, "entry field 'GT' must be of type `Call`"

    if not overwrite and all([
            hl.hadoop_exists(f'{bfile_path}.{suffix}')
            for suffix in ['bed', 'bim']
    ]):
        print(f'\nPLINK .bed and .bim files already exist for {bfile_path}')
        print(bfile_path)
    else:
        print(f'Saving to bfile prefix {bfile_path}')
        mt_sample = mt.annotate_rows(varid=hl.str(mt.locus) + ':' +
                                     mt.alleles[0] + ':' + mt.alleles[1])
        mt_sample = mt_sample.filter_cols(hl.is_defined(
            ht_sample[mt_sample.s]))
        hl.export_plink(dataset=mt_sample,
                        output=bfile_path,
                        ind_id=mt_sample.s,
                        varid=mt_sample.varid)  # varid used to be rsid
Example #6
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
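    # Densify the sparse matrix table and split multiallelic sites before export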
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = hl.split_multi_hts(tob_wgs)
    tob_wgs_path = output_path('tob_wgs_plink')
    hl.export_plink(tob_wgs, tob_wgs_path, ind_id=tob_wgs.s)
Example #7
def main():

    # Parse args
    args = parse_args()

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(args.chainfile, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    # Load plink
    mt = hl.import_plink(bed=args.in_plink + '.bed',
                         bim=args.in_plink + '.bim',
                         fam=args.in_plink + '.fam',
                         reference_genome='GRCh37',
                         min_partitions=args.min_partitions)

    # # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=args.out_plink)

    return 0
Example #8
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr), pat_id=hl.null(hl.tstr), mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool), is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                                      a2_reference=True, reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Example #9
    def test_import_plink_contig_recoding_w_reference(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #10
    def test_import_plink(self):
        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, '/tmp/sample_plink')

        bfile = '/tmp/sample_plink'
        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
Example #11
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        # permute columns so not in alphabetical order!
        import random
        indices = list(range(mt.count_cols()))
        random.shuffle(indices)
        mt = mt.choose_cols(indices)

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command(["plink", "--vcf", split_vcf_file,
                     "--make-bed", "--out", plink_output,
                     "--const-fid", "--keep-allele-order"])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

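        # Merge mode 6 performs no merge: it reports mismatching nonmissing
        # calls to a .diff file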
        run_command(["plink", "--bfile", plink_output,
                     "--bmerge", hl_output, "--merge-mode",
                     "6", "--out", merge_output])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #12
    def test_export_plink(self):
        vcf_file = resource('sample.vcf')
        mt = hl.split_multi_hts(hl.import_vcf(vcf_file, min_partitions=10))

        split_vcf_file = uri_path(new_temp_file())
        hl_output = uri_path(new_temp_file())
        plink_output = uri_path(new_temp_file())
        merge_output = uri_path(new_temp_file())

        hl.export_vcf(mt, split_vcf_file)
        hl.export_plink(mt, hl_output)

        run_command([
            "plink", "--vcf", split_vcf_file, "--make-bed", "--out",
            plink_output, "--const-fid", "--keep-allele-order"
        ])

        data = []
        with open(uri_path(plink_output + ".bim")) as file:
            for line in file:
                row = line.strip().split()
                row[1] = ":".join([row[0], row[3], row[5], row[4]])
                data.append("\t".join(row) + "\n")

        with open(plink_output + ".bim", 'w') as f:
            f.writelines(data)

        run_command([
            "plink", "--bfile", plink_output, "--bmerge", hl_output,
            "--merge-mode", "6", "--out", merge_output
        ])

        same = True
        with open(merge_output + ".diff") as f:
            for line in f:
                row = line.strip().split()
                if row != ["SNP", "FID", "IID", "NEW", "OLD"]:
                    same = False
                    break

        self.assertTrue(same)
Example #13
    def test_export_plink(self):
        ds = self.get_dataset()

        hl.export_plink(ds, '/tmp/plink_example', id=ds.s)

        hl.export_plink(ds, '/tmp/plink_example2', id=ds.s, fam_id=ds.s, pat_id="nope",
                        mat_id="nada", is_female=True, is_case=False)

        hl.export_plink(ds, '/tmp/plink_example3', id=ds.s, fam_id=ds.s, pat_id="nope",
                        mat_id="nada", is_female=True, quant_pheno=hl.float64(hl.len(ds.s)))

        self.assertRaises(ValueError,
                          lambda: hl.export_plink(ds, '/tmp/plink_example', is_case=True, quant_pheno=0.0))

        self.assertRaises(ValueError, lambda: hl.export_plink(ds, '/tmp/plink_example', foo=0.0))

        self.assertRaises(TypeError, lambda: hl.export_plink(ds, '/tmp/plink_example', is_case=0.0))
Example #14
    def test_import_plink_a1_major(self):
        mt = get_dataset()
        bfile = '/tmp/sample_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s)

        def get_data(a2_reference):
            mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim',
                                          bfile + '.fam', a2_reference=a2_reference)
            return (hl.variant_qc(mt_imported)
                    .rows()
                    .key_by('rsid'))

        a2 = get_data(a2_reference=True)
        a1 = get_data(a2_reference=False)

        j = (a2.annotate(a1_alleles=a1[a2.rsid].alleles, a1_vqc=a1[a2.rsid].variant_qc)
             .rename({'variant_qc': 'a2_vqc', 'alleles': 'a2_alleles'}))

        self.assertTrue(j.all((j.a1_alleles[0] == j.a2_alleles[1]) &
                              (j.a1_alleles[1] == j.a2_alleles[0]) &
                              (j.a1_vqc.n_not_called == j.a2_vqc.n_not_called) &
                              (j.a1_vqc.n_het == j.a2_vqc.n_het) &
                              (j.a1_vqc.homozygote_count[0] == j.a2_vqc.homozygote_count[1]) &
                              (j.a1_vqc.homozygote_count[1] == j.a2_vqc.homozygote_count[0])))
Example #15
    def test_export_import_plink_same(self):
        mt = get_dataset()
        mt = mt.select_rows(
            rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position),
                             mt.alleles[0], mt.alleles[1]], ':'),
            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr),
                            pat_id=hl.null(hl.tstr),
                            mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool),
                            is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed',
                                      bfile + '.bim',
                                      bfile + '.fam',
                                      a2_reference=True,
                                      reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        self.assertTrue(mt.aggregate_rows(hl.agg.all(mt.cm_position == 15.0)))
Example #16
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])
    mt = hl.sample_qc(mt)
    mt = mt.filter_cols((mt.sample_qc.dp_stats.mean >= 4)
                        & (mt.sample_qc.call_rate >= 0.97))
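    # Allele balance: fraction of reads supporting the alternate allele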
    ab = mt.AD[1] / hl.sum(mt.AD)
    filter_condition_ab = ((mt.GT.is_hom_ref() & (ab <= 0.1))
                           | (mt.GT.is_het() & (ab >= 0.25) & (ab <= 0.75))
                           | (mt.GT.is_hom_var() & (ab >= 0.9)))
    mt = mt.filter_entries(filter_condition_ab)
    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)

    eigenvalues, pcs, _ = hl.hwe_normalized_pca(mt.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(y=mt.pheno.CaffeineConsumption,
                                     x=mt.GT.n_alt_alleles(),
                                     covariates=[
                                         1.0, mt.pheno.isFemale, mt.scores[0],
                                         mt.scores[1], mt.scores[2]
                                     ])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Example #17
                    (mt['gnomad2_popmax_gnomad.AF'] <= rare_threshold) &
                    (mt['exac_af_adj'] <= rare_threshold) &
                    (mt.symbol == 'HEY2') & (mt.csq_type != 'SYN'),
                    keep=True)

# Annotate sex info
mt = mt.annotate_cols(is_female=hl.case()
                      .when(mt.f_stat <= 0.4, True)
                      .when(mt.f_stat >= 0.6, False)
                      .or_missing())

# Export plink files
date = time.strftime("%d-%m-%Y")
PLINK_OUTPUT_PATH = f'/mnt/nfs/mdatanode/wes10k_resources/wes1k/plink_output/hey2_{date}'

hl.export_plink(dataset=mt,
                output=PLINK_OUTPUT_PATH,
                ind_id=mt.s,
                pheno=mt.isCase,
                is_female=mt.is_female)

# Export useful info (e.g. covariates, annotation)
delimiter = '|'
sample_expr_annotations = dict(
    cases_het=hl.delimit(hl.agg.filter(mt.GT.is_het() & mt.isCase,
                                       hl.agg.collect(mt.s)),
                         delimiter=delimiter),
    cases_hom=hl.delimit(hl.agg.filter(mt.GT.is_hom_var() & mt.isCase,
                                       hl.agg.collect(mt.s)),
                         delimiter=delimiter),
    controls_het=hl.delimit(hl.agg.filter(mt.GT.is_het() & ~mt.isCase,
                                          hl.agg.collect(mt.s)),
                            delimiter=delimiter),
Example #18
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("importing vds...")
vds = hl.read_matrix_table(vds_splitmulti_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Remove rare variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing rare variants...")
vds = vds.filter_rows((vds.info.AF[0] > 0.01) & (vds.info.AF[0] < 0.99))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IV. Remove indels
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("removing indels...")

vds = vds.filter_rows(~hl.is_indel(vds.alleles[0], vds.alleles[1]))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# V. Write output
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print("writing out...")
hl.export_plink(vds, plink_files_out)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stop = timeit.default_timer()

print("runtime: " + str(stop - start) + " seconds")
Example #19
#! /usr/bin/python

import sys
import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

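# Simulate one population and replace GT with random unphased hom-ref/hom-alt calls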
mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))
mt = mt.annotate_entries(GT=hl.unphased_diploid_gt_index_call(hl.rand_bool(0.5) * 2))
hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)
Example #20
import hail as hl
hl.set_global_seed(0)
mt = hl.balding_nichols_model(n_populations=3, n_variants=(1 << 10), n_samples=4)
mt = mt.key_cols_by(s='s' + hl.str(mt.sample_idx))
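# Knock out ~1% of genotypes at random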
mt = mt.annotate_entries(GT=hl.or_missing(hl.rand_bool(0.99), mt.GT))
hl.export_plink(mt, 'balding-nichols-1024-variants-4-samples-3-populations', fam_id='f' + mt.s)
Example #21
def main(args):
    mt = hl.read_matrix_table(args.matrixtable)
    # ld pruning
    pruned_ht = hl.ld_prune(mt.GT, r2=0.1)
    pruned_mt = mt.filter_rows(hl.is_defined(pruned_ht[mt.row_key]))
    pruned_mt.write(f"{args.output_dir}/mt_ldpruned.mt", overwrite=True)

    # PC relate
    pruned_mt = pruned_mt.select_entries(
        GT=hl.unphased_diploid_gt_index_call(pruned_mt.GT.n_alt_alleles()))

    eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                           k=10,
                                           compute_loadings=False)
    scores.write(f"{args.output_dir}/mt_pruned.pca_scores.ht", overwrite=True)

    relatedness_ht = hl.pc_relate(pruned_mt.GT,
                                  min_individual_maf=0.05,
                                  scores_expr=scores[pruned_mt.col_key].scores,
                                  block_size=4096,
                                  min_kinship=0.05,
                                  statistics='kin2')
    relatedness_ht.write(f"{args.output_dir}/mt_relatedness.ht",
                         overwrite=True)
    pairs = relatedness_ht.filter(relatedness_ht['kin'] > 0.125)
    related_samples_to_remove = hl.maximal_independent_set(pairs.i,
                                                           pairs.j,
                                                           keep=False)
    related_samples_to_remove.write(
        f"{args.output_dir}/mt_related_samples_to_remove.ht", overwrite=True)

    pca_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]),
        keep=False)
    related_mt = pruned_mt.filter_cols(
        hl.is_defined(related_samples_to_remove[pruned_mt.col_key]),
        keep=True)

    variants, samples = pca_mt.count()

    print(f"{samples} samples after relatedness step.")

    # Population pca

    plink_mt = pca_mt.annotate_cols(uid=pca_mt.s).key_cols_by('uid')
    hl.export_plink(plink_mt,
                    f"{args.output_dir}/mt_unrelated.plink",
                    fam_id=plink_mt.uid,
                    ind_id=plink_mt.uid)
    pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
        pca_mt.GT, k=20, compute_loadings=True)
    pca_af_ht = pca_mt.annotate_rows(
        pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
    pca_loadings = pca_loadings.annotate(
        pca_af=pca_af_ht[pca_loadings.key].pca_af)
    pca_scores.write(f"{args.output_dir}/mt_pca_scores.ht", overwrite=True)
    pca_loadings.write(f"{args.output_dir}/mt_pca_loadings.ht", overwrite=True)

    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    print(
        'Projecting population PCs for {} related samples...'.format(samples))
    #related_scores = pc_project(related_mt, pca_loadings)
    #relateds = related_mt.cols()
    #relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    pca_mt.write(f"{args.output_dir}/mt_pca.mt", overwrite=True)
    p = hl.plot.scatter(pca_mt.scores[0],
                        pca_mt.scores[1],
                        title='PCA',
                        xlabel='PC1',
                        ylabel='PC2')
    output_file(f"{args.plot_dir}/pca.html")
    save(p)
Example #22
    def test_export_plink_exprs(self):
        ds = get_dataset()
        fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                       'f4': 'is_female', 'f5': 'pheno'}
        bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                       'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                                 (fam1.mat_id == "0") & (fam1.is_female == "0") &
                                 (fam1.pheno == "NA")))
        self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                                 (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                        mat_id="nada", is_female=True, pheno=False)
        fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

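        # In .fam output, female sex is encoded as "2" and a False
        # case/control phenotype (control) as "1"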
        self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                                 (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                                 (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

        self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                                 (fam3.mat_id == "0") & (fam3.is_female == "0") &
                                 (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
Example #23
#! /usr/bin/python

import sys
import hail as hl

n_samples = int(sys.argv[1])
n_variants = int(sys.argv[2])
path = sys.argv[3]

mt = hl.balding_nichols_model(1, n_samples, n_variants)
mt = mt.key_cols_by(s=hl.str(mt.sample_idx))
mt = mt.annotate_entries(
    GT=hl.unphased_diploid_gt_index_call(hl.rand_bool(0.5) * 2))

hl.export_vcf(mt, path + ".vcf")
hl.export_plink(mt, path)

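# Build a chimeric sample "0": first half of the variants from sample 0,
# second half from sample 1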
chimera0 = mt.filter_rows(mt.locus.position < n_variants / 2)
chimera0 = chimera0.filter_cols(chimera0.s == "0")

chimera1 = mt.filter_rows(mt.locus.position >= n_variants / 2)
chimera1 = chimera1.filter_cols(chimera1.s == "1")
chimera1 = chimera1.key_cols_by(s="0")

mt2 = chimera0.union_rows(chimera1)
hl.export_vcf(mt2, path + "-chimera.vcf")
hl.export_plink(mt2, path + "-chimera")
Example #24
def main(args):
    hl.init(log='/sample_qc.log', tmp_dir='hdfs:///pc_relate.tmp/')

    if not args.load_joint_pruned_qc_mt:
        logger.info('Joining exomes and genomes...')
        exome_qc_mt = read_and_pre_process_data(
            qc_mt_path('exomes'), qc_ht_path('exomes', 'hard_filters'))
        genome_qc_mt = read_and_pre_process_data(
            qc_mt_path('genomes'), qc_ht_path('genomes', 'hard_filters'))

        joint_qc_mt = exome_qc_mt.union_cols(
            genome_qc_mt)  # NOTE: this is an inner join on rows
        joint_qc_mt = joint_qc_mt.filter_rows(
            (hl.agg.mean(joint_qc_mt.GT.n_alt_alleles()) / 2 > 0.001)
            & (hl.agg.fraction(hl.is_defined(joint_qc_mt.GT)) > 0.99))
        joint_qc_mt.write(qc_mt_path('joint'), args.overwrite)

        logger.info('LD-pruning joint mt of exomes and genomes...')
        joint_qc_mt = hl.read_matrix_table(qc_mt_path('joint'))
        variants, samples = joint_qc_mt.count()
        logger.info('Pruning {0} variants in {1} samples'.format(
            variants, samples))
        joint_qc_pruned_ht = hl.ld_prune(joint_qc_mt.GT, r2=0.1)
        # Note writing the LD-pruned MT is probably overkill
        # vs using `filter_rows` to filter sites based on the LD-pruned HT.
        joint_qc_pruned_mt = joint_qc_mt.filter_rows(
            hl.is_defined(joint_qc_pruned_ht[joint_qc_mt.row_key]))
        joint_qc_pruned_mt.write(qc_mt_path('joint', ld_pruned=True),
                                 args.overwrite)

    pruned_mt = hl.read_matrix_table(qc_mt_path('joint', ld_pruned=True))
    variants, samples = pruned_mt.count()
    logger.info('{0} samples, {1} variants found in LD-pruned joint MT'.format(
        samples, variants))

    if not args.skip_pc_relate:
        logger.info('Running PCA for PC-Relate...')
        eig, scores, _ = hl.hwe_normalized_pca(pruned_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht',
            args.overwrite)

        logger.info('Running PC-Relate...')
        scores = hl.read_table(
            qc_temp_data_prefix('joint') + '.pruned.pca_scores.ht')
        # NOTE: This needs SSDs on your workers (for the temp files) and no pre-emptibles while the BlockMatrix writes
        relatedness_ht = hl.pc_relate(
            pruned_mt.GT,
            min_individual_maf=0.05,
            scores_expr=scores[pruned_mt.col_key].scores,
            block_size=4096,
            min_kinship=0.05,
            statistics='kin2')
        relatedness_ht.write(relatedness_ht_path, args.overwrite)

    relatedness_ht = hl.read_table(relatedness_ht_path)

    if not args.skip_relatedness:
        infer_ped(GnomADRelatedData('exomes'))
        infer_ped(GnomADRelatedData('genomes'))

        logger.info('Making rank file...')
        rank_table = make_rank_file(rank_annotations_path('joint'))
        logger.info('Finished making rank file...')

        related_samples_to_drop_ranked = get_related_samples_to_drop(
            rank_table, relatedness_ht)
        related_samples_to_drop_ranked.write(
            qc_temp_data_prefix('joint') + '.related_samples_to_drop.ht',
            args.overwrite)

    pca_mt, related_mt = split_mt_by_relatedness(pruned_mt)

    if not args.skip_pop_pca:
        variants, samples = pca_mt.count()
        logger.info('{} samples after removing relateds'.format(samples))
        # TODO: Check that there are no longer any 2nd-degree relateds in the callset by running KING on the output file below
        plink_mt = pca_mt.annotate_cols(
            uid=pca_mt.data_type + '_' +
            pca_mt.s.replace(" ", "_").replace("/", "_")).key_cols_by('uid')
        hl.export_plink(plink_mt,
                        qc_temp_data_prefix('joint') + '.unrelated.plink',
                        fam_id=plink_mt.uid,
                        ind_id=plink_mt.uid)

        logger.info(
            'Computing population PCs and annotating with known population labels...'
        )
        pca_evals, pca_scores, pca_loadings = hl.hwe_normalized_pca(
            pca_mt.GT, k=20, compute_loadings=True)
        pca_af_ht = pca_mt.annotate_rows(
            pca_af=hl.agg.mean(pca_mt.GT.n_alt_alleles()) / 2).rows()
        pca_loadings = pca_loadings.annotate(
            pca_af=pca_af_ht[pca_loadings.key].pca_af)
        pca_scores.write(ancestry_pca_scores_ht_path(), args.overwrite)
        pca_loadings.write(ancestry_pca_loadings_ht_path(), args.overwrite)

    pca_scores = hl.read_table(ancestry_pca_scores_ht_path())
    pca_loadings = hl.read_table(ancestry_pca_loadings_ht_path())
    pca_mt = pca_mt.annotate_cols(scores=pca_scores[pca_mt.col_key].scores)

    variants, samples = related_mt.count()
    logger.info(
        'Projecting population PCs for {} related samples...'.format(samples))
    related_scores = pc_project(related_mt, pca_loadings)
    relateds = related_mt.cols()
    relateds = relateds.annotate(scores=related_scores[relateds.key].scores)

    logger.info('Assigning population annotations...')
    pop_colnames = ['related', 'known_pop', 'scores']
    pop_annots_ht = hl.import_table(known_population_annotations,
                                    impute=True).key_by('combined_sample')

    joint_ht = pca_mt.cols().union(relateds)
    joint_ht = joint_ht.annotate(
        known_pop=pop_annots_ht[joint_ht.data_type.replace('s', '') + '_' +
                                joint_ht.s.replace(' ', '_')].known_pop
    )  # FIXME: temporarily doing the underscore thing until known_population_annotations is fixed
    joint_pca_ht = joint_ht.select(*pop_colnames)
    joint_pca_ht, joint_pca_fit = run_assign_population_pcs(
        joint_pca_ht,
        qc_temp_data_prefix('joint') + '.RF_pop_assignments.txt.bgz',
        qc_temp_data_prefix('joint') + '.RF_fit.pkl',
        pcs=list(range(1, 7)))
    joint_ht = joint_ht.annotate(pop=joint_pca_ht[joint_ht.key].pop).select(
        'pop', *pop_colnames)

    # Add special Estonian pop category for genomes
    estonian_ht = (hl.import_table(estonian_batches, impute=True).annotate(
        data_type='genomes').key_by('data_type', 'sample'))
    joint_ht = joint_ht.annotate(batch=estonian_ht[joint_ht.key].batch)
    joint_ht = joint_ht.annotate(qc_pop=hl.case(missing_false=True).when(
        hl.is_defined(joint_ht.pop) & (joint_ht.batch == 1), 'est_b1'
    ).when(hl.is_defined(joint_ht.pop)
           & (joint_ht.batch == 2), 'est_b2').default(joint_ht.pop)).persist()

    # These are keyed by only `s`
    genome_mt = get_gnomad_data('genomes',
                                adj=False,
                                split=False,
                                meta_root=None).select_cols()
    exome_mt = get_gnomad_data('exomes',
                               adj=False,
                               split=False,
                               meta_root=None).select_cols()

    # Population-specific filtering
    if not args.skip_calculate_sample_metrics:
        logger.info(
            'Running mini sample QC for platform- and population-specific filtering...'
        )
        gnomad_sample_qc(exome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('exomes') + '.sample_qc.ht', args.overwrite)
        gnomad_sample_qc(genome_mt).cols().select('sample_qc').write(
            qc_temp_data_prefix('genomes') + '.sample_qc.ht', args.overwrite)
        # TODO: check that the pcr_free annotations are complete once samples are updated from Jessica's spreadsheet

    logger.info('Annotating population and platform assignments...')
    platform_ht = hl.read_table(qc_ht_path('exomes', 'platforms'))
    exome_ht = exome_mt.cols()
    exome_ht = exome_ht.annotate(
        qc_platform=platform_ht.key_by('s')[exome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'exomes').key_by('s')[exome_ht.s])

    genome_meta_ht = hl.read_table(qc_ht_path('genomes', 'hard_filters'))
    genome_ht = genome_mt.cols()
    genome_ht = genome_ht.annotate(
        qc_platform=genome_meta_ht.key_by('s')[genome_ht.s].qc_platform,
        **joint_ht.filter(
            joint_ht.data_type == 'genomes').key_by('s')[genome_ht.s])

    exome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('exomes') + '.sample_qc.ht')
    genome_sample_qc_ht = hl.read_table(
        qc_temp_data_prefix('genomes') + '.sample_qc.ht')

    exome_ht = exome_ht.annotate(**exome_sample_qc_ht[exome_ht.s])
    genome_ht = genome_ht.annotate(**genome_sample_qc_ht[genome_ht.s])

    # For each population, aggregate sample QC metrics and calculate the MAD/mean/stdev
    logger.info(
        'Calculating platform- and population-specific sample QC thresholds...'
    )
    exome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    exome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        exome_ht, exome_qc_metrics, ['qc_pop', 'qc_platform'])
    exome_ht = exome_ht.annotate_globals(
        **hl.eval(exome_pop_platform_filter_ht.globals))
    exome_ht = exome_ht.annotate(
        **exome_pop_platform_filter_ht[exome_ht.key]).persist()

    genome_qc_metrics = [
        'n_snp', 'r_ti_tv', 'r_insertion_deletion', 'n_insertion',
        'n_deletion', 'r_het_hom_var'
    ]
    genome_pop_platform_filter_ht = compute_stratified_metrics_filter(
        genome_ht, genome_qc_metrics, ['qc_pop', 'qc_platform'])
    genome_ht = genome_ht.annotate_globals(
        **hl.eval(genome_pop_platform_filter_ht.globals))
    genome_ht = genome_ht.annotate(
        **genome_pop_platform_filter_ht[genome_ht.key]).persist()

    # Annotate samples that fail their respective filters
    checkpoint = exome_ht.aggregate(
        hl.agg.count_where(hl.len(exome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} exome samples found passing pop/platform-specific filtering'
    )
    exome_ht.key_by(data_type='exomes',
                    s=exome_ht.s).write(qc_ht_path('exomes', 'pop_platform'),
                                        args.overwrite)

    checkpoint = genome_ht.aggregate(
        hl.agg.count_where(hl.len(genome_ht.pop_platform_filters) == 0))
    logger.info(
        f'{checkpoint} genome samples found passing pop/platform-specific filtering'
    )
    genome_ht.key_by(data_type='genomes', s=genome_ht.s).write(
        qc_ht_path('genomes', 'pop_platform'), args.overwrite)
Example #25
    def test_import_plink_empty_fam(self):
        mt = get_dataset().drop_cols()
        bfile = '/tmp/test_empty_fam'
        hl.export_plink(mt, bfile, ind_id=mt.s)
        with self.assertRaisesRegex(FatalError, "Empty .fam file"):
            hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam')
Example #26
def run(
    n_variants: int,
    n_samples: int,
    n_contigs: int,
    n_covars: int,
    n_traits: int,
    output_dir: str,
):
    hl.init()
    mt = get_plink_sim_dataset(n_variants=n_variants,
                               n_samples=n_samples,
                               n_contigs=n_contigs)
    gt = hl.linalg.BlockMatrix.from_entry_expr(
        mt.GT.n_alt_alleles()).to_numpy()
    logger.info(f"Created calls w/ shape {gt.shape}")

    sample_ids = mt.s.collect()
    logger.info(f"Num samples: {len(sample_ids)}")
    logger.info(f"First samples: {sample_ids[:5]}")

    def get_covariates(n, sample_ids, seed=0):
        rs = np.random.RandomState(seed)
        df = pd.DataFrame(
            rs.normal(size=(len(sample_ids), n)),
            columns=[f"X{i:03d}" for i in range(n)],
        )
        df = df.assign(sample_id=sample_ids).set_index("sample_id")
        return df

    df_cov = get_covariates(n_covars, sample_ids)
    logger.info(f"Covariate info:\n{_info(df_cov)}")
    logger.info(f"Covariate head:\n{df_cov.head()}")

    def get_betas(n_traits, gt, df_cov, seed=0):
        rs = np.random.RandomState(seed)
        n_covars = df_cov.shape[1]
        n_variants = gt.shape[0]
        traits = [f"Y{i:04d}" for i in range(n_traits)]

        beta_cov = rs.normal(loc=2.0, scale=1, size=(n_covars, n_traits))
        beta_var = rs.normal(loc=-2.0, scale=1, size=(n_variants, n_traits))
        # Set last half of all betas to 0
        beta_cov[(beta_cov.shape[0] // 2):, :] = 0
        beta_var[(beta_var.shape[0] // 2):, :] = 0

        df_beta_cov = pd.DataFrame(beta_cov,
                                   index=[f"B-{c}" for c in df_cov.columns],
                                   columns=traits)
        df_beta_var = pd.DataFrame(
            beta_var,
            index=[f"B-V{i:07d}" for i in range(n_variants)],
            columns=traits)
        return df_beta_cov, df_beta_var

    df_beta_cov, df_beta_var = get_betas(n_traits, gt, df_cov)

    logger.info(f"Beta cov info:\n{_info(df_beta_cov)}")
    logger.info(f"Beta cov head:\n{df_beta_cov.head()}")

    logger.info(f"Beta var info:\n{_info(df_beta_var)}")
    logger.info(f"Beta var head:\n{df_beta_var.head()}")

    def get_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=0.001, seed=0):
        n_variants, n_samples = gt.shape
        assert gt.shape[1] == df_cov.shape[0]
        assert df_beta_var.shape[1] == df_beta_cov.shape[1]
        n_traits = df_beta_var.shape[1]
        rs = np.random.RandomState(seed)
        noise = rs.normal(scale=scale, loc=0, size=(n_samples, n_traits))
        Y = gt.T @ df_beta_var.values + df_cov.values @ df_beta_cov.values + noise
        df_trait = pd.DataFrame(Y,
                                index=df_cov.index,
                                columns=df_beta_cov.columns)
        assert df_trait.notnull().all().all()
        return df_trait

    df_trait = get_traits(gt, df_cov, df_beta_var, df_beta_cov, scale=0.001)
    logger.info(f"Trait info: {_info(df_trait)}")
    logger.info(f"Trait head:\n{df_trait.head()}")

    output_path = Path(output_dir)
    if output_path.exists():
        logger.info(f"Clearing old output path at {output_path}")
        shutil.rmtree(output_path)
    output_path.mkdir(parents=True)
    logger.info(f"Writing results to {output_path}")

    path = str(output_path / "genotypes")
    hl.export_plink(mt, path)
    logger.info(f"PLINK written to {path}")

    path = str(output_path / "covariates.csv")
    df_cov.reset_index().to_csv(path, index=False)
    logger.info(f"Covariates written to {path}")

    path = str(output_path / "traits.csv")
    df_trait.reset_index().to_csv(path, index=False)
    logger.info(f"Traits written to {path}")

    path = str(output_path / "beta_covariate.csv")
    df_beta_cov.to_csv(path, index=True)
    logger.info(f"Covariate betas written to {path}")

    path = str(output_path / "beta_variant.csv")
    df_beta_var.to_csv(path, index=True)
    logger.info(f"Variant betas written to {path}")

    logger.info("Simulated data generation complete")
Example #27
    def test_import_plink_empty_bim(self):
        mt = get_dataset().filter_rows(False)
        bfile = '/tmp/test_empty_bim'
        hl.export_plink(mt, bfile, ind_id=mt.s)
        with self.assertRaisesRegex(FatalError, ".bim file does not contain any variants"):
            hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam')
Example #28
(ct.select(ID=ct.s, sexFstat=ct.f_stat, isFemale=ct.is_female,
           ydp=ct.ydp).export(sample_sex_fstat_file))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# ld pruning
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("LD pruning...")
vds5_ldp = hl.ld_prune(vds5, n_cores=1600, r2=0.1)
#vds5_ldp = hl.ld_prune(vds5, n_cores=60, r2=0.2, window=1000000, memory_per_core=512)

print("writing LD pruned VDS...")
vds5_ldp.write(vds_ldpruned_common_file, overwrite=True)
hl.export_plink(vds5_ldp,
                vds_ldpruned_common_plink,
                fam_id=vds5_ldp.s,
                id=vds5_ldp.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IBD analysis
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# use king until pcrelate works

#vds.ibd(min=0.1).flatten().rename({'ibd.Z0': 'Z0', 'ibd.Z1': 'Z1', 'ibd.Z2': 'Z2', 'ibd.PI_HAT': 'PI_HAT'}).export(ibd_results_file)

# print runtime
stop = timeit.default_timer()
print("runtime: " + str(stop - start) + " seconds")
Example #29
import hail as hl

target_samples = hl.import_table('gs://apcdr/ukb_holdout/ukb31063.gwas_samples.holdout_and_target.txt', key='s')

contig = 'autosomes'
contig_expr = 'chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}'

ht_variants = hl.read_table('gs://ukb31063/ukb31063.neale_gwas_variants.ht')

mt = hl.import_bgen(
        path=f'gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_{contig_expr}_v3.bgen',
        sample_file=f'gs://ukb31063/ukb31063.{contig}.sample',
        entry_fields=['dosage'],
        variants=ht_variants)

mt_target = mt.filter_cols(hl.is_defined(target_samples[mt.s]))  # target

hl.export_plink(mt_target, 'gs://apcdr/ukb_holdout/ukb31063.holdout.target_individuals',
                ind_id=mt_target.s, varid=mt_target.rsid)
Example #30
# ld prune
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

onekg = hl.ld_prune(onekg, n_cores=800, r2=0.2)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write vds
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

onekg.write(onekg_ldpruned_file, overwrite=True)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# write plink
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
print('export plink')
hl.export_plink(onekg, onekg_plink_prefix, fam_id=onekg.s, id=onekg.s)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pca
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

# print('PCA...')
# eigenvalues, scores, loadings = hl.hwe_normalized_pca(onekg, k=10)

# with hl.utils.hadoop_open(pca_value_prefix + '_all.txt', 'w') as f:
# 	for val in eigenvalues:
# 		f.write(str(val) + '\n')
# scores.flatten().export(pca_score_prefix + '_all.txt')

# onekgeur = onekg.filter_cols(onekg.super == 'EUR', keep=True)
Example #31
    def test_export_plink_exprs(self):
        ds = get_dataset()
        fam_mapping = {
            'f0': 'fam_id',
            'f1': 'ind_id',
            'f2': 'pat_id',
            'f3': 'mat_id',
            'f4': 'is_female',
            'f5': 'pheno'
        }
        bim_mapping = {
            'f0': 'contig',
            'f1': 'varid',
            'f2': 'cm_position',
            'f3': 'position',
            'f4': 'a1',
            'f5': 'a2'
        }

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0")
                     & (fam1.mat_id == "0") & (fam1.is_female == "0")
                     & (fam1.pheno == "NA")))
        self.assertTrue(
            bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" +
                      bim1.a2 + ":" + bim1.a1) & (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds,
                        out2,
                        ind_id=ds.s,
                        fam_id=ds.s,
                        pat_id="nope",
                        mat_id="nada",
                        is_female=True,
                        pheno=False)
        fam2 = (hl.import_table(out2 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        self.assertTrue(
            fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope")
                     & (fam2.mat_id == "nada") & (fam2.is_female == "2")
                     & (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        self.assertTrue(
            fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0")
                     & (fam3.mat_id == "0") & (fam3.is_female == "0")
                     & (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim',
                                         out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(
            hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError,
                                    "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
Example #32
def main():

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Set the maximum number of cores
    hl.init(master="local[{}]".format(cores))

    # Prepare liftover
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Create my own rg38 with altered names
    rg38_custom_contigs = [
        contig.replace('chr', '') for contig in rg38.contigs
    ]
    rg38_custom_lens = {}
    for contig in rg38.lengths:
        rg38_custom_lens[contig.replace('chr', '')] = rg38.lengths[contig]
    rg38_custom = hl.ReferenceGenome('rg38_custom', rg38_custom_contigs,
                                     rg38_custom_lens)

    print('Processing chromosome {0}'.format(chrom))

    # Index bgen if not existing
    if not hl.hadoop_exists(in_bgen.format(chrom=chrom) + '.idx2'):
        hl.index_bgen(in_bgen.format(chrom=chrom),
                      contig_recoding={
                          "01": "1",
                          "02": "2",
                          "03": "3",
                          "04": "4",
                          "05": "5",
                          "06": "6",
                          "07": "7",
                          "08": "8",
                          "09": "9"
                      },
                      reference_genome='GRCh37')

    # Load bgen
    mt = hl.import_bgen(in_bgen.format(chrom=chrom),
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Load list samples to keep
    samples_to_keep = hl.import_table(to_keep_list,
                                      no_header=True,
                                      impute=False,
                                      types={
                                          'f0': hl.tstr
                                      }).key_by('f0')

    # Downsample to required subset of samples
    mt = mt.filter_cols(hl.is_defined(samples_to_keep[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Filter on MAF
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(variant_qc=mt.variant_qc.annotate(
        MAF=hl.min(mt.variant_qc.AF)))
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Liftover
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Strip chr from contig name (causes problems with GCTA)
    mt = mt.annotate_rows(
        contig_GRCh38=mt.locus_GRCh38.contig.replace('chr', ''))

    # Swap GRCh37 locus for GRCh38 (but have to use rg38_custom)
    mt = mt.key_rows_by()
    mt = mt.annotate_rows(locus=hl.locus(mt.contig_GRCh38,
                                         mt.locus_GRCh38.position,
                                         reference_genome=rg38_custom))
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Remove rows with missing locus (after liftover)
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Write plink format
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Example #33
print_count(mt_clean)

# Changing sex annotation so it will properly be output when converted to plink format
mt_clean = mt_clean.annotate_cols(
    sex=hl.if_else((mt_clean.reported_sex == 'F'), True, False))

# Sanity check sex count after changing format
mt_clean.aggregate_cols(hl.agg.counter(mt_clean.sex))

# Reading in csv with FID info
fid = hl.import_table('gs://neurogap/Pilot_Data_HailQC/fid_info.csv',
                      delimiter=',')

# Changing keys to match mt_clean
fid = fid.key_by(fid.siteID)

# Adding on proper FIDs to dataset (for plink output)
mt_clean = mt_clean.annotate_cols(fid=fid[mt_clean.siteID].FID)

# Changing row key to what we want for plink output
mt_clean = mt_clean.key_rows_by(locus=mt_clean['locus'],
                                alleles=mt_clean['alleles'])

# Output to plink, specifying desired fields
hl.export_plink(mt_clean,
                'gs://neurogap-pilot-clean/NeuroGAP_pilot_clean',
                ind_id=mt_clean.collab_PID,
                fam_id=mt_clean.fid,
                is_female=mt_clean.sex,
                varid=mt_clean.rsid)
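
# A quick sanity check (sketch): re-import the exported fileset and confirm
# its dimensions. The reference genome is not shown in this snippet, so
# GRCh38 below is an assumption.
bfile = 'gs://neurogap-pilot-clean/NeuroGAP_pilot_clean'
mt_check = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                           reference_genome='GRCh38')
print(mt_check.count())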
Example No. 34
    def test_grm(self):
        tolerance = 0.001

        def load_id_file(path):
            ids = []
            with hl.hadoop_open(path) as f:
                for l in f:
                    r = l.strip().split('\t')
                    self.assertEqual(len(r), 2)
                    ids.append(r[1])
            return ids

        def load_rel(ns, path):
            rel = np.zeros((ns, ns))
            with hl.hadoop_open(path) as f:
                for i, l in enumerate(f):
                    for j, n in enumerate(map(float, l.strip().split('\t'))):
                        rel[i, j] = n
                    self.assertEqual(j, i)
                self.assertEqual(i, ns - 1)
            return rel

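        # GCTA's text GRM format stores the lower triangle as tab-separated
        # rows: the two 1-based sample indices, the variant count, and the
        # relatedness value.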
        def load_grm(ns, nv, path):
            m = np.zeros((ns, ns))
            with hl.hadoop_open(path) as f:
                i = 0
                for l in f:
                    row = l.strip().split('\t')
                    self.assertEqual(int(row[2]), nv)
                    m[int(row[0]) - 1, int(row[1]) - 1] = float(row[3])
                    i += 1

                self.assertEqual(i, ns * (ns + 1) / 2)
            return m

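        # GCTA's binary GRM stores the same lower triangle as consecutive
        # little-endian float32 values, read here with struct.unpack('<f', ...).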
        def load_bin(ns, path):
            m = np.zeros((ns, ns))
            with hl.hadoop_open(path, 'rb') as f:
                for i in range(ns):
                    for j in range(i + 1):
                        b = f.read(4)
                        self.assertEqual(len(b), 4)
                        m[i, j] = unpack('<f', bytearray(b))[0]
                left = f.read()
                self.assertEqual(len(left), 0)
            return m

        b_file = utils.new_temp_file(prefix="plink")
        rel_file = utils.new_temp_file(prefix="test", suffix="rel")
        rel_id_file = utils.new_temp_file(prefix="test", suffix="rel.id")
        grm_file = utils.new_temp_file(prefix="test", suffix="grm")
        grm_bin_file = utils.new_temp_file(prefix="test", suffix="grm.bin")
        grm_nbin_file = utils.new_temp_file(prefix="test", suffix="grm.N.bin")

        dataset = self.get_dataset()
        n_samples = dataset.count_cols()
        dataset = dataset.annotate_rows(AC=agg.sum(dataset.GT.n_alt_alleles()),
                                        n_called=agg.count_where(hl.is_defined(dataset.GT)))
        dataset = dataset.filter_rows((dataset.AC > 0) & (dataset.AC < 2 * dataset.n_called))
        dataset = dataset.filter_rows(dataset.n_called == n_samples).persist()

        hl.export_plink(dataset, b_file, ind_id=dataset.s)

        sample_ids = [row.s for row in dataset.cols().select('s').collect()]
        n_variants = dataset.count_rows()
        self.assertGreater(n_variants, 0)

        grm = hl.genetic_relatedness_matrix(dataset.GT)
        grm.export_id_file(rel_id_file)

        ############
        ### rel

        p_file = utils.new_temp_file(prefix="plink")
        syscall('''plink --bfile {} --make-rel --out {}'''
                .format(utils.uri_path(b_file), utils.uri_path(p_file)), shell=True, stdout=DEVNULL, stderr=DEVNULL)
        self.assertEqual(load_id_file(p_file + ".rel.id"), sample_ids)

        grm.export_rel(rel_file)
        self.assertEqual(load_id_file(rel_id_file), sample_ids)
        self.assertTrue(np.allclose(load_rel(n_samples, p_file + ".rel"),
                                    load_rel(n_samples, rel_file),
                                    atol=tolerance))

        ############
        ### gcta-grm

        p_file = utils.new_temp_file(prefix="plink")
        syscall('''plink --bfile {} --make-grm-gz --out {}'''
                .format(utils.uri_path(b_file), utils.uri_path(p_file)), shell=True, stdout=DEVNULL, stderr=DEVNULL)
        self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

        grm.export_gcta_grm(grm_file)
        self.assertTrue(np.allclose(load_grm(n_samples, n_variants, p_file + ".grm.gz"),
                                    load_grm(n_samples, n_variants, grm_file),
                                    atol=tolerance))

        ############
        ### gcta-grm-bin

        p_file = utils.new_temp_file(prefix="plink")
        syscall('''plink --bfile {} --make-grm-bin --out {}'''
                .format(utils.uri_path(b_file), utils.uri_path(p_file)), shell=True, stdout=DEVNULL, stderr=DEVNULL)

        self.assertEqual(load_id_file(p_file + ".grm.id"), sample_ids)

        grm.export_gcta_grm_bin(grm_bin_file, grm_nbin_file)

        self.assertTrue(np.allclose(load_bin(n_samples, p_file + ".grm.bin"),
                                    load_bin(n_samples, grm_bin_file),
                                    atol=tolerance))
        self.assertTrue(np.allclose(load_bin(n_samples, p_file + ".grm.N.bin"),
                                    load_bin(n_samples, grm_nbin_file),
                                    atol=tolerance))
Example No. 35
mt.count()

# def rename_samples(mt, mapping):
#     return mt.key_cols_by(s = hl.literal(mapping).get(mt.s, default=mt.s))

# mt = rename_samples(mt, {'431-BG00852 D':'431-BG00852_D'})

for x in range(1, 23):

    mt_chr = hl.filter_intervals(mt, [
        hl.parse_locus_interval('chr' + str(x), reference_genome='GRCh38')
    ])
    n_chr = mt_chr.count_rows()

    print('\nn variants in chr{}: {}'.format(x, n_chr))

    hl.export_plink(mt_chr, PLINK_FILES + '.chr' + str(x))

mt_chr = hl.filter_intervals(
    mt, [hl.parse_locus_interval('chrX', reference_genome='GRCh38')])
n_chr = mt_chr.count_rows()

print('\nn variants in chrX: {}'.format(n_chr))

hl.export_plink(mt_chr, PLINK_FILES + '.chrX')
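
# A more compact equivalent (sketch): handle the autosomes and X in one loop.
# for c in [str(i) for i in range(1, 23)] + ['X']:
#     mt_c = hl.filter_intervals(
#         mt, [hl.parse_locus_interval('chr' + c, reference_genome='GRCh38')])
#     print('\nn variants in chr{}: {}'.format(c, mt_c.count_rows()))
#     hl.export_plink(mt_c, PLINK_FILES + '.chr' + c)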
Example No. 36
    def test_import_plink_empty_fam(self):
        mt = get_dataset().filter_cols(False)
        bfile = '/tmp/test_empty_fam'
        hl.export_plink(mt, bfile, ind_id=mt.s)
        with self.assertRaisesRegex(FatalError, "Empty .fam file"):
            hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam')
Example No. 37
def run_qc(mt: hl.MatrixTable,
           dirname: str,
           basename: str,
           input_type: str,
           pre_geno: float,
           mind: float,
           fhet_y: int,
           fhet_x: int,
           geno: float,
           midi: float,
           maf: float,
           hwe_th_co: float,
           hwe_th_ca: float,
           qc_round: int,
           withpna: int = 0) -> tuple:
    """
    Run pre-imputation QC and export the QCed dataset to PLINK format.

    :param mt: Hail MatrixTable
    :param dirname: output directory
    :param basename: output file prefix
    :param input_type: input format, passed to the sex-violation/warning checks
    :param pre_geno: include only SNPs with missing rate below this threshold (step 1)
    :param mind: include only samples with missing rate below this threshold (step 2)
    :param fhet_y: F-statistic sex-check threshold (step 3)
    :param fhet_x: F-statistic sex-check threshold (step 3)
    :param geno: include only SNPs with missing rate below this threshold (step 6)
    :param midi: case/control missingness-difference threshold (step 7)
    :param maf: minor allele frequency threshold (step 9)
    :param hwe_th_co: HWE p-value threshold in controls (step 10)
    :param hwe_th_ca: HWE p-value threshold in cases (step 11)
    :param qc_round: QC round number, used in the output filename
    :param withpna: if 0 (default), exclude invariant SNPs (step 8)
    :return: (qc_tables_list, qc_plots_list) with HTML tables and base64-encoded plots
    """

    # compute qc metrics
    mt = qc.compute_qc_metrics(mt)

    # Pre-qc counts
    pre_qc_counts = qc.collect_counts(mt)

    # pre-qc plots
    print("Generating pre-QC plots")
    pre_cas_var_base64, pre_con_var_base64 = plt.cr_var_plts(mt, geno)
    pre_cas_id_base64, pre_con_id_base64 = plt.cr_id_plts(mt, mind)

    pre_man_qq_base64 = plt.man_qq_plts(mt)

    # 1. SNP QC: call rate ≥ 0.95
    print("1. SNP QC: call rate ≥ 0.95")
    mt, var_pre_filter = qc.filter_var_cr(mt, pre_geno)
    print("Pre QC call rate < 0.95: {}".format(var_pre_filter['geno_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 2. Sample QC: call rate in cases or controls ≥ 0.98
    print("2. Sample QC: call rate in cases or controls ≥ 0.98")
    mt, id_cr_filter = qc.filter_sample_cr(mt, mind)
    print("Sample QC < 0.98: {}".format(id_cr_filter['sample_miss_cases'] +
                                        id_cr_filter['sample_miss_controls']))
    print("Samples: {}".format(mt.count_cols()))

    # 3. Sample QC: F_stats
    print("3. Sample QC: F_stats")

    mt, f_stat_results = qc.filter_sex_check(mt, fhet_y, fhet_x)
    print("Sex check filtered: {}".format(f_stat_results['sex_check_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 4. Sample QC: Sex violations (excluded) - genetic sex does not match pedigree sex
    print(
        "4. Sample QC: Sex violations (excluded) - genetic sex does not match pedigree sex"
    )
    mt, sex_violations = qc.sex_violations(mt, input_type)
    print("Sex violations: {}".format(sex_violations['sex_excluded']))
    print("Samples: {}".format(mt.count_cols()))

    # 5. Sample QC: Sex warnings (not excluded) - undefined phenotype / ambiguous genotypes
    print(
        "# 5. Sample QC: Sex warnings (not excluded) - undefined phenotype / ambiguous genotypes"
    )
    sex_warnings_count = qc.sex_warnings(mt, input_type)
    print("Sex warning: {}".format(sex_warnings_count))
    print("Samples: {}".format(mt.count_cols()))

    # 6. SNP QC: call rate ≥ 0.98
    print("# 6. SNP QC: call rate ≥ 0.98")
    mt, var_filter = qc.filter_var_cr(mt, geno)
    print("SNP QC call rate < 0.98: {}".format(var_filter['geno_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 7. SNP QC: missing difference > 0.02
    print("# 7. SNP QC: missing difference > 0.02")
    # NOTE: no missingness-difference filter is applied at this step in this
    # snippet; the `midi` threshold is accepted but unused.

    # 8. SNP QC: SNPs with no valid association p value are excluded (i.e., invariant SNP)
    print(
        "# 8. SNP QC: SNPs with no valid association p value are excluded (i.e., invariant SNP)"
    )
    invariant_snps = {'monomorphic_snps': 0}  # default so the step-8 count exists when withpna != 0
    if withpna == 0:
        mt, invariant_snps = qc.filter_invariant_snps(mt)
        print("Monomorphic SNPs: {}".format(
            invariant_snps['monomorphic_snps']))
        print("Samples: {}".format(mt.count_cols()))

    # 9. SNP QC: with MAF ≥ 0.01
    print("# 9. SNP QC: with MAF ≥ 0.01")
    mt, maf_results = qc.filter_maf(mt, maf)
    print("MAF: {}".format(maf_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 10. SNP QC: Hardy-Weinberg equilibrium (HWE) in controls p value ≥ 1e-06
    print(
        "# 10. SNP QC: Hardy-Weinberg equilibrium (HWE) in controls p value ≥ 1e-06"
    )
    mt, hwe_con_results = qc.filter_hwe(mt, 'Control', hwe_th_co)
    print("HWE Controls: {}".format(hwe_con_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # 11. SNP QC: Hardy-Weinberg equilibrium (HWE) in cases p value ≥ 1e-10
    print(
        "# 11. SNP QC: Hardy-Weinberg equilibrium (HWE) in cases p value ≥ 1e-10"
    )
    mt, hwe_cas_results = qc.filter_hwe(mt, 'Case', hwe_th_ca)
    print("HWE Cases: {}".format(hwe_cas_results['maf_removed']))
    print("Samples: {}".format(mt.count_cols()))

    # Post-qc counts
    post_qc_counts = qc.collect_counts(mt)

    # Post-QC plots
    print("Generating post-QC plots")
    print("Generating variant call rate plots")
    pos_cas_var_base64, pos_con_var_base64 = plt.cr_var_plts(mt, geno)
    print("Generating sample call rate plots")
    pos_cas_id_base64, pos_con_id_base64 = plt.cr_id_plts(mt, mind)
    print("Generating Manhattand & QQ plots")
    pos_man_qq_base64 = plt.man_qq_plts(mt)

    qc_plots_list = [
        pre_man_qq_base64, pos_man_qq_base64, pre_con_id_base64,
        pre_cas_id_base64, pos_con_id_base64, pos_cas_id_base64,
        f_stat_results['sex_check_plot'], pre_con_var_base64,
        pre_cas_var_base64, pos_con_var_base64, pos_cas_var_base64
    ]

    # Tables
    filter_counts_list = [
        var_pre_filter['geno_removed'], id_cr_filter['sample_miss_cases'] +
        id_cr_filter['sample_miss_controls'],
        f_stat_results['sex_check_removed'], sex_violations['sex_excluded'],
        sex_warnings_count, var_filter['geno_removed'],
        invariant_snps['monomorphic_snps'], hwe_con_results['maf_removed'],
        hwe_cas_results['maf_removed']
    ]
    size_of_sample_html, exclusion_overview_html = generate_tables(
        pre_qc_counts, post_qc_counts, filter_counts_list)
    qc_tables_list = [size_of_sample_html, exclusion_overview_html]

    outplink = dirname + basename + '_qc{}'.format(qc_round)
    hl.export_plink(mt, outplink)

    return qc_tables_list, qc_plots_list
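
# A hypothetical invocation (sketch). The thresholds mirror the step
# descriptions above; the paths, basename, and fhet values are illustrative.
# qc_tables, qc_plots = run_qc(mt, dirname='gs://my-bucket/', basename='mydata',
#                              input_type='plink', pre_geno=0.05, mind=0.02,
#                              fhet_y=0.5, fhet_x=0.5, geno=0.02, midi=0.02,
#                              maf=0.01, hwe_th_co=1e-6, hwe_th_ca=1e-10,
#                              qc_round=1)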