Example #1
    def test_sample_qc(self):
        data = [
            {'v': '1:1:A:T', 's': '1', 'GT': hl.Call([0, 0]), 'GQ': 10, 'DP': 0},
            {'v': '1:2:A:T,C', 's': '1', 'GT': hl.Call([1]), 'GQ': 15, 'DP': 5},
            {'v': '1:3:A:G,C', 's': '1', 'GT': hl.Call([2, 2]), 'GQ': 10, 'DP': 4},
            {'v': '1:4:G:A', 's': '1', 'GT': hl.Call([0, 1]), 'GQ': None, 'DP': 5},
            {'v': '1:5:C:CG', 's': '1', 'GT': hl.Call([1, 1]), 'GQ': 20, 'DP': 3},
            {'v': '1:6:C:A', 's': '1', 'GT': None, 'GQ': 0, 'DP': None},
        ]

        ht = hl.Table.parallelize(data, hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.sample_qc(mt, 'sqc')
        r = mt.cols().select('sqc').collect()

        self.assertAlmostEqual(r[0].sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(r[0].sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(r[0].sqc.gq_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.gq_stats.max, 20)
        self.assertAlmostEqual(r[0].sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(r[0].sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(r[0].sqc.dp_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.dp_stats.max, 5)
        self.assertAlmostEqual(r[0].sqc.call_rate, 0.8333333333)
        self.assertEqual(r[0].sqc.n_called, 5)
        self.assertEqual(r[0].sqc.n_not_called, 1)
        self.assertEqual(r[0].sqc.n_hom_ref, 1)
        self.assertEqual(r[0].sqc.n_het, 1)
        self.assertEqual(r[0].sqc.n_hom_var, 3)
        self.assertEqual(r[0].sqc.n_insertion, 2)
        self.assertEqual(r[0].sqc.n_deletion, 0)
        self.assertEqual(r[0].sqc.n_singleton, 3)
        self.assertEqual(r[0].sqc.n_transition, 1)
        self.assertEqual(r[0].sqc.n_transversion, 3)
        self.assertEqual(r[0].sqc.n_star, 0)
        self.assertEqual(r[0].sqc.n_non_ref, 4)
        self.assertAlmostEqual(r[0].sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(r[0].sqc.r_het_hom_var, 0.3333333333)
        self.assertAlmostEqual(r[0].sqc.r_insertion_deletion, None)
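
        # Where the expected constants above come from: sample_qc aggregates
        # population (not sample) statistics over the defined entries only, so
        # the GQ/DP numbers can be reproduced with the statistics module
        # (a quick sketch, using the five defined GQ and DP values above):
        import statistics
        gq = [10, 15, 10, 20, 0]  # GQ values, missing entry dropped
        dp = [0, 5, 4, 5, 3]      # DP values, missing entry dropped
        assert statistics.mean(gq) == 11
        assert abs(statistics.pstdev(gq) - 6.6332495807) < 1e-7
        assert abs(statistics.mean(dp) - 3.4) < 1e-7
        assert abs(statistics.pstdev(dp) - 1.8547236990) < 1e-7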
Example #2
def generate_datasets(doctest_namespace):
    doctest_namespace['hl'] = hl

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint('output/example.mt', overwrite=True)

    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    small_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = small_mt.checkpoint('output/small.mt',
                                                        overwrite=True)

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    ld_score_one_pheno_sumstats = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types={
            'locus': hl.tlocus('GRCh37'),
            'alleles': hl.tarray(hl.tstr),
            'chi_squared': hl.tfloat64,
            'n': hl.tint32,
            'ld_score': hl.tfloat64,
            'phenotype': hl.tstr,
            'chi_squared_50_irnt': hl.tfloat64,
            'n_50_irnt': hl.tint32,
            'chi_squared_20160': hl.tfloat64,
            'n_20160': hl.tint32
        },
        key=['locus', 'alleles'])
    doctest_namespace[
        'ld_score_one_pheno_sumstats'] = ld_score_one_pheno_sumstats

    mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields={
            'locus': hl.tstr,
            'alleles': hl.tstr,
            'ld_score': hl.tfloat64
        },
        entry_type=hl.tstr)
    mt = mt.key_cols_by(phenotype=mt.col_id)
    mt = mt.key_rows_by(locus=hl.parse_locus(mt.locus),
                        alleles=mt.alleles.split(','))
    mt = mt.drop('row_id', 'col_id')
    mt = mt.annotate_entries(x=mt.x.split(","))
    mt = mt.transmute_entries(chi_squared=hl.float64(mt.x[0]),
                              n=hl.int32(mt.x[1]))
    mt = mt.annotate_rows(ld_score=hl.float64(mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = mt

    print("finished setting up doctest...")
Example #3
pop_file = 'gs://rcstorage/population/ccdgf2_predicted_ethnicity_PC1-15.tsv'

# define output files
sample_qc_info_preqc_file = 'gs://rcstorage/qced/' + chrom + '/ccdgf2_sample_qc_info_preqc.txt'

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# II. Annotate samples with population
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("annotating ethnicity")
table = hl.import_table(pop_file, impute=True).key_by('Sample')
vds = vds.annotate_cols(**table[vds.s])

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# III. Performing sample QC on remaining variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("sample QC...")
vds = hl.sample_qc(vds)

print("writing sample QC results...")
vds.cols().flatten().export(sample_qc_info_preqc_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stop = timeit.default_timer()

print("runtime: " + str(stop - start) + " seconds")
Example #4
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/chr1_chr20_ldpruned_updated.mt")
    # pca_scores_pop
    pca_scores_pop = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020.ht")

    # pca_scores_superpop
    pca_scores_superpop = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/new_labels/pop_assignments_updated_august2020_superpops.ht")

    # annotate mt with pop and superpop
    mt = mt.annotate_cols(assigned_pop=pca_scores_pop[mt.s].pop)
    mt = mt.annotate_cols(assigned_superpop=pca_scores_superpop[mt.s].pop)

    # do sample_qc
    # calculate and annotate with metric heterozygosity
    mt_with_sampleqc = hl.sample_qc(mt, name='sample_qc')

    mt_with_sampleqc = mt_with_sampleqc.annotate_cols(sample_qc=mt_with_sampleqc.sample_qc.annotate(
        heterozygosity_rate=mt_with_sampleqc.sample_qc.n_het/mt_with_sampleqc.sample_qc.n_called))
    # save sample_qc and heterozygosity table as ht table
    mt_with_sampleqc.write(
        f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.mt", overwrite=True)
    mt_with_sampleqc.cols().write(
        f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht",  overwrite=True)
    pop_ht = hl.read_table(
        f"{tmp_dir}/ddd-elgh-ukbb/mt_pops_superpops_sampleqc.ht")
    # run function on metrics including heterozygosity first for pops:
    qc_metrics = ['heterozygosity_rate', 'n_snp', 'r_ti_tv',
                  'r_insertion_deletion', 'n_insertion', 'n_deletion', 'r_het_hom_var']
    pop_filter_ht = compute_stratified_metrics_filter(
        pop_ht, qc_metrics, ['assigned_pop'])
Example #5
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if os.path.isdir(f):
        shutil.rmtree(f)

ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(
    global_field_1=5,
    global_field_2=10,
    pli={
Example #6
    hl.parse_locus_interval(x, reference_genome='GRCh38') for x in
    ['chr1:START-chr22:END', 'chrX:START-chrX:END', 'chrY:START-chrY:END']
]
mt = hl.filter_intervals(mt, intervals)

# Filter out the invariant rows.
mt = hl.variant_qc(mt, name='qc')
mt = mt.filter_rows((mt.qc.AF[0] > 0.0) & (mt.qc.AF[0] < 1.0))

mt.rows().select().export(PADDED_150_INITIAL_VARIANT_LIST)

n_variants = hl.import_table(PADDED_150_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)

mt = hl.sample_qc(mt, name='qc_150')

mt = mt.filter_rows(mt.not_in_padded_target_intervals_100, keep=False)
mt.rows().select().export(PADDED_100_INITIAL_VARIANT_LIST)
n_variants = hl.import_table(PADDED_100_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)

mt = hl.sample_qc(mt, name='qc_100')

mt = mt.filter_rows(mt.not_in_padded_target_intervals_50, keep=False)
mt.rows().select().export(PADDED_50_INITIAL_VARIANT_LIST)
n_variants = hl.import_table(PADDED_50_INITIAL_VARIANT_LIST).count()
print('n variants after initial filter:')
print(n_variants)
Example #7
num2 = vds_post.count()
print(num2)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# IV. Filtering variants without PASS
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("removing variants...")
vds_post = vds_post.filter_rows(vds_post.label == 'PASS', keep=True)

num3 = vds_post.count()
print(num3)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# V. Performing sample QC on remaining variants
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

print("sample QC...")
vds_post = hl.sample_qc(vds_post)

print("writing sample QC results...")
vds_post.cols().flatten().export(sample_qc_info_postqc_file)

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Print runtime
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

stop = timeit.default_timer()

print("runtime: " + str(stop - start) + " seconds")
Example #8
def sample_qc():
    hl.sample_qc(get_mt()).cols()._force_count()
Example #9
def variant_and_sample_qc():
    mt = get_mt()
    hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
Example #10
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if os.path.isdir(f):
        shutil.rmtree(f)

ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)
ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                      panel_maf=0.1,
                      anno1=5,
                      anno2=0,
                      consequence="LOF",
                      gene="A",
                      score=5.0)
ds = ds.annotate_rows(a_index=1)
ds = hl.sample_qc(hl.variant_qc(ds))
ds = ds.annotate_cols(is_case=True,
                      pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                      is_female=hl.rand_bool(0.5),
                                      age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
                    "Other"))))

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",  overwrite=True)
    print("Finished splitting and writing mt. ")

    intersection_table = hl.import_bed(
        intersection_bed, reference_genome='GRCh38')
    
    union_table = hl.import_bed(union_bed, reference_genome='GRCh38')
    
    mt_intersection = mt.filter_rows(
        hl.is_defined(intersection_table[mt.locus]))
    mt_union = mt.filter_rows(hl.is_defined(union_table[mt.locus]))

    mt_intersection = hl.sample_qc(mt_intersection, name='sample_QC_Hail')
    pandadf1 = mt_intersection.cols().flatten()
    print("Outputting table of sample qc")
    pandadf1.export(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_intersection_BED_sampleQC.tsv.bgz", header=True)

    mt_intersection = mt_intersection.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-intersection_BED.mt", overwrite=True)

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_sex_annotated.mt", overwrite=True)

    mt_union = hl.sample_qc(mt_union, name='sample_QC_Hail')
    pandadf2 = mt_union.cols().flatten()
    print("Outputting table of sample qc")
    pandadf2.export(
Example #12
    ###################### UNFILTERED SAMPLE AND VARIANT QC #############
    #####################################################################

    print('Annotating rows with snp and indel info')
    mt = mt_split.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                    "INDEL", "Other"))))

    # Unfiltered data summary stats:
    print("Finished annotating rows, annotating columns now")
    mt_sqc1_unfiltered = mt.annotate_cols(
        sample_QC_nonHail=sample_QC_nonHail.key_by("ID")[mt.s])
    mt_sqc2_unfiltered = hl.sample_qc(mt_sqc1_unfiltered,
                                      name='sample_QC_Hail')

    panda_df_unfiltered_table = mt_sqc2_unfiltered.cols().flatten()

    print("Outputting table of sample qc")
    panda_df_unfiltered_table.export(
        f"{BUCKET}/output-tables/{CHROMOSOME}/{CHROMOSOME}_sampleQC_unfiltered.tsv.bgz",
        header=True)

    # Variant QC
    mt2 = hl.variant_qc(mt_sqc2_unfiltered, name='variant_QC_Hail')

    print('Exporting variant qc pandas table to disk')
    mt_rows = mt2.rows()
    mt_rows.select(mt_rows.variant_QC_Hail).flatten().export(
        f"{BUCKET}/output-tables/{CHROMOSOME}/{CHROMOSOME}_variantQC_unfiltered.tsv.bgz",
Example #13
# The purpose of this script is to format and write out a matrix table which will be used to create 'table_x'
# for our resource manuscript
# author: Zan Koenig

import hail as hl
hl.init()

# reading in the post QC version of the merged dataset (with metadata)
mt = hl.read_matrix_table('gs://african-seq-data/hgdp_tgp/hgdp_tgp_postQC.mt')

# Running sample_qc to get the n_snp and n_singleton counts
mt = hl.sample_qc(mt, name="new_sample_qc")

# Grabbing only the columns from the matrix table (outputs table of just columns)
col_table = mt.cols()


# writing out a col table with only the columns needed for table x
col_table = col_table.select(col_table.hgdp_tgp_meta.Study.region,
                             col_table.hgdp_tgp_meta.Population,
                             col_table.new_sample_qc.n_snp,
                             col_table.new_sample_qc.n_singleton,
                             col_table.bam_metrics.mean_coverage)


# writing out col_table as a checkpoint to make the downstream steps run faster
col_table.checkpoint('gs://african-seq-data/hgdp_tgp/table_x_checkpoint.ht')


# reading the checkpoint back in: a table containing only the selected column fields
col_table = hl.read_table('gs://african-seq-data/hgdp_tgp/table_x_checkpoint.ht')
Example #14
              global_ADhet_25=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.25),
              global_ADhet_30=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.30),
              global_ADhet_35=hl.agg.mean(mt_het.AD[1] / mt_het.DP < 0.35),
              global_ADhom=hl.agg.stats(
                  (mt_het.AD[1] + mt_het.AD[1]) / mt_het.DP)))

print(het_struct)

mt_hom_var = mt.filter_entries(mt.GT.is_hom_var())
hom_struct = mt_hom_var.aggregate_entries(
    hl.struct(
        global_ADhet=hl.agg.stats(mt_hom_var.AD[1] / mt_hom_var.DP),
        global_ADhom=hl.agg.stats(
            (mt_hom_var.AD[0] + mt_hom_var.AD[1]) / mt_hom_var.DP),
        global_ADhom_80=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.8),
        global_ADhom_85=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.85),
        global_ADhom_90=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.9),
        global_ADhom_95=hl.agg.mean(
            (mt_hom_var.AD[1] + mt_hom_var.AD[1]) / mt_hom_var.DP < 0.95)))
print(hom_struct)
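# Note: hl.agg.mean over a boolean expression, as in the global_ADhet_* and
# global_ADhom_* fields above, averages True/False as 1/0 and therefore gives
# the fraction of entries passing each threshold (hl.agg.fraction would be the
# more explicit spelling).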

# Also, wish to examine the Ti/Tv ratio within the calling intervals (excluding the padding).

mt = mt.filter_rows(~mt.not_in_target_intervals)
mt = hl.sample_qc(mt, name='sample_qc_in_target')
mt.cols().select("imputesex", "sample_qc",
                 "sample_qc_in_target").flatten().export(SAMPLE_QC_IN_TARGET)
Example #15
        'locus').distinct_by_row().key_rows_by('locus', 'alleles')
    mt_split = hl.split_multi_hts(mt_annotated,
                                  keep_star=False,
                                  left_aligned=False)

    mt = mt_split.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                    "INDEL", "Other"))))

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-split-multi_cohorts.mt",
        overwrite=True)
    print("Finished splitting and writing mt. ")

    agilent_table = hl.import_bed(agilent, reference_genome='GRCh38')
    mt_agilent = mt.filter_rows(hl.is_defined(agilent_table[mt.locus]))

    mt_agilent = hl.sample_qc(mt_agilent, name='sample_QC_Hail')
    pandadf1 = mt_agilent.cols().flatten()
    print("Outputting table of sample qc")
    pandadf1.export(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}_agilent_sampleQC.tsv.bgz",
        header=True)

    mt = mt.checkpoint(
        f"{tmp_dir}/ddd-elgh-ukbb/{CHROMOSOME}-sampleqc-unfiltered_annotated.mt",
        overwrite=True)
Example #16
def generate_datasets(doctest_namespace, output_dir):
    doctest_namespace['hl'] = hl

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.import_vcf('data/sample.vcf.bgz')
    ds = ds.sample_rows(0.03)
    ds = ds.annotate_rows(use_as_marker=hl.rand_bool(0.5),
                          panel_maf=0.1,
                          anno1=5,
                          anno2=0,
                          consequence="LOF",
                          gene="A",
                          score=5.0)
    ds = ds.annotate_rows(a_index=1)
    ds = hl.sample_qc(hl.variant_qc(ds))
    ds = ds.annotate_cols(is_case=True,
                          pheno=hl.struct(is_case=hl.rand_bool(0.5),
                                          is_female=hl.rand_bool(0.5),
                                          age=hl.rand_norm(65, 10),
                                          height=hl.rand_norm(70, 10),
                                          blood_pressure=hl.rand_norm(120, 20),
                                          cohort_name="cohort1"),
                          cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                          cov1=hl.rand_norm(0, 1),
                          cov2=hl.rand_norm(0, 1),
                          cohort="SIGMA")
    ds = ds.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={
            'SCN1A': 0.999,
            'SONIC': 0.014
        },
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
    ds = ds.annotate_rows(gene=['TTN'])
    ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
    ds = ds.checkpoint(f'{output_dir.name}/example.vds', overwrite=True)
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(
        consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata
    doctest_namespace['cols_to_keep'] = s_metadata
    doctest_namespace['cols_to_remove'] = s_metadata
    doctest_namespace['rows_to_keep'] = v_metadata
    doctest_namespace['rows_to_remove'] = v_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv',
                             impute=True,
                             types={
                                 'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                 'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                 'E': hl.tstruct(A=hl.tint32, B=hl.tint32)
                             })
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv',
                                   delimiter='\\s+',
                                   types={
                                       'Age': hl.tint32,
                                       'Children': hl.tarray(hl.tstr)
                                   },
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({
        'Alice': 43,
        'Bob': 33,
        'Charles': 44
    })
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval(
        "1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)
    doctest_namespace['nd'] = hl._ndarray([[1, 2], [3, 4]])

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    burden_ds = hl.import_vcf('data/example_burden.vcf')
    burden_kt = hl.import_table('data/example_burden.tsv',
                                key='Sample',
                                impute=True)
    burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
    burden_ds = burden_ds.annotate_rows(
        weight=hl.float64(burden_ds.locus.position))
    burden_ds = hl.variant_qc(burden_ds)
    genekt = hl.import_locus_intervals('data/gene.interval_list')
    burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
    burden_ds = burden_ds.checkpoint(f'{output_dir.name}/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_ds

    print("finished setting up doctest...")
Example #17
import hail as hl

# CategoricalColorMapper and turbo (used below for the PCA plot) are bokeh
# names; the original imports are not shown, so these are an assumption based
# on hl.plot's bokeh backend:
from bokeh.models import CategoricalColorMapper
from bokeh.palettes import turbo

hl.init()

mt = hl.read_matrix_table(
    'gs://gcp-public-data--gnomad/release/3.1/mt/genomes/'
    'gnomad.genomes.v3.1.hgdp_1kg_subset_dense.mt')

# mt exploration - explore rows and columns
mt.count_rows()
mt.count_cols()
mt.cols().show()
mt.rows().show()

# mt qc check
mt_qc = hl.sample_qc(mt)
p = hl.plot.histogram(mt_qc.sample_qc.call_rate,
                      range=(0.88, 1),
                      legend='Call Rate')
p_2 = hl.plot.histogram(mt_qc.sample_qc.gq_stats.mean, legend='Mean Sample GQ')

# PCA
columns = mt.cols()
pca_scores = columns.population_inference.pca_scores
labels = columns.population_inference.pop
pops = list(set(labels.collect()))
mapper = CategoricalColorMapper(palette=turbo(8), factors=pops)

# plot the first 5 PCs
p = hl.plot.scatter(
    pca_scores[0],
Example #18
mt_before = mt_before.annotate_cols(
    phenotype=sample_annotations[mt_before.col_key])
mt_before = mt_before.annotate_cols(
    imputesex=impute_sex_annotations[mt_before.col_key])

mt_before = hl.variant_qc(mt_before, name='qc')

mt_before = mt_before.annotate_rows(qc=mt_before.qc.annotate(
    AC=mt_before.qc.AC[1],
    AF=mt_before.qc.AF[1],
    homozygote_count=mt_before.qc.homozygote_count[1]))

mt_before = mt_before.filter_rows((mt_before.qc.AF > 0)
                                  & (mt_before.qc.AF < 1))
mt_before = hl.sample_qc(mt_before)

n = mt_before.count()

print('n samples:')
print(n[1])
print('n variants:')
print(n[0])

mt_before = mt_before.annotate_cols(sex=hl.case().when(
    mt_before.imputesex.impute_sex.is_female, "Female").default("Male"))

mt_after = mt_before.filter_rows(
    hl.is_defined(ht_final_variants[mt_before.row_key]))
mt_after = hl.sample_qc(mt_after)
Example #19
# This is a Python script loosely based on Kumar and Konrad's effort here: https://github.com/mkveerapen/covid19_sequencing
# Some of the QC at our institution was done by our genome center, so refer to the link above for more thorough QC.
# In particular, variant recalibration should still be done even though it is not shown here; you can discuss with me how to do it using GATK.

import hail as hl

#tmp_dir is where some of the temporary computations are done. I would make sure to assign it to a folder that does not have a strict data cap.
hl.init(spark_conf=None, tmp_dir='/path/to/tmp_dir/')

#import the data and sample QC
hl.import_vcf('/path/to/sequence.file.normID.noChrM.vcf.gz',
              min_partitions=4,
              reference_genome='GRCh38',
              force_bgz=True).write('/hailFiles/hail.full.normID.noChrM.mt',
                                    overwrite=True)

mtAll = hl.read_matrix_table('/hailFiles/hail.full.normID.noChrM.mt')
mtAll = mtAll.annotate_entries(AB=(mtAll.AD[1] / hl.sum(mtAll.AD)))
mtAll = hl.sample_qc(mtAll)
mtAll = mtAll.filter_cols((mtAll.sample_qc.call_rate >= 0.97)
                          & (mtAll.sample_qc.dp_stats.mean >= 20))
mtAll = mtAll.filter_entries((mtAll.GQ >= 20) & (mtAll.DP >= 10) & (
    (mtAll.GT.is_hom_ref() & (mtAll.AB <= 0.1))
    | (mtAll.GT.is_het() & (mtAll.AB >= 0.25) & (mtAll.AB <= 0.75))
    | (mtAll.GT.is_hom_var() & (mtAll.AB >= 0.9))))

hl.export_vcf(mtAll, '/path/to/sequence.file.normID.GTflt.AB.noChrM.vcf.gz')
Example #20
# In[6]:
# MT paths is a list of file paths for each of the datasets to be merged and QC'd
mt_paths = ['file/path1', 'file/path2']

# In[7]:

# Reading in and creating a list of all of the site matrix tables
mt_list = [hl.import_vcf(mt_path, force_bgz=True) for mt_path in mt_paths]

# Importing the metadata file as a hail table
meta = hl.import_table(meta_data)

# In[8]:

# Annotating the matrix tables with sample QC data
mt_list = [hl.sample_qc(mt, name='sample_qc') for mt in mt_list]

# In[89]:

# Annotating the matrix tables with variant QC data
mt_list = [hl.variant_qc(mt, name='variant_qc') for mt in mt_list]

# In[90]:

# Annotating matrix tables with metadata from the meta table (see annotateMeta for details)
mt_list = [annotateMeta(mt, meta, 'chip_well_barcode') for mt in mt_list]

# In[91]:

# Annotating matrix tables with sex filter results (see checkSex for details)
mt_list = [checkSex(mt) for mt in mt_list]
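
# The annotateMeta and checkSex helpers referenced above are defined elsewhere
# in the source project. A minimal sketch of what annotateMeta might look like,
# assuming it joins the metadata table on the given key column (checkSex's
# logic is not recoverable from this snippet, so it is omitted):
def annotateMeta(mt, meta, key_col):
    meta = meta.key_by(key_col)
    return mt.annotate_cols(meta=meta[mt.s])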
Example #21
INITIAL_SAMPLE_QC_FILE_INV_REMOVED = 'gs://dalio_bipolar_w1_w2_hail_02/data/samples/03_initial_sample_qc_b37_callset_inv_removed.tsv'

variants_to_filter = hl.import_table(INITIAL_VARIANT_AUTO_LIST,
                                     types={
                                         'locus': hl.tlocus(),
                                         'alleles': hl.tarray(hl.tstr)
                                     })
variants_to_filter = variants_to_filter.key_by(
    locus=variants_to_filter.locus, alleles=variants_to_filter.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)

pprint(sample_annotations.describe())

mt = hl.read_matrix_table(MT)
mt = mt.filter_rows(hl.is_defined(variants_to_filter[mt.row_key]))
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])
mt_invariant_included = hl.sample_qc(mt, name='qc')

mt_invariant_included.cols().select(
    'phenotype', 'qc').flatten().export(output=INITIAL_SAMPLE_QC_FILE)

mt = hl.variant_qc(mt, name='qc')
mt_invariant_removed = mt.filter_rows((mt.qc.AF[0] > 0.0)
                                      & (mt.qc.AF[0] < 1.0))
mt_invariant_removed = hl.sample_qc(mt_invariant_removed, name='qc_sample')

mt_invariant_removed.cols().select(
    'phenotype',
    'qc_sample').flatten().export(output=INITIAL_SAMPLE_QC_FILE_INV_REMOVED)
Example #22
def sample_qc(mt_path):
    hl.sample_qc(hl.read_matrix_table(mt_path)).cols()._force_count()
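
# _force_count() is Hail-internal: it forces the lazy pipeline to execute
# without keeping the result, which is presumably why these benchmark snippets
# (see also Example #24) use it instead of count(), which Hail can sometimes
# answer without computing every transformed row. A public-API equivalent for
# non-benchmark use:
def sample_qc_count(mt_path):
    hl.sample_qc(hl.read_matrix_table(mt_path)).cols().count()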
Example #23
def samples_qc(mt, mt_to_annotate, args):
    """
    Performs samples QC on a matrix table, flagging samples failing on chimera or contamination %, or lying more
    than the configured number of standard deviations (args.sampleqc_sd_threshold, e.g. 4) from the batch or
    cohort mean on TiTv, het/homvar, and insertion/deletion ratios and n_singletons

    :param mt: matrix table, low-pass failing variants and genotypes filtered out
    :param mt_to_annotate: matrix table to annotate with failing samples information after calculating on filtered mt
    :param args: parsed arguments carrying the QC thresholds and annotation column names used below
    :return: returns annotated, unfiltered matrix table
    """
    datestr = time.strftime("%Y.%m.%d")

    # Run sample QC to get up-to-date sample QC metrics
    mt = hl.sample_qc(mt)

    # Pull data to cols and checkpoint
    mt_cols = mt.cols()
    mt_cols = mt_cols.checkpoint("samples_qc_cols_tmp.ht", overwrite=True)

    # Instantiate empty array for failing samples QC tags
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.empty_array(hl.tstr))

    ############################################################
    # Find samples failing on chimeras or contamination values #
    ############################################################
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.chimeras_col] > args.chimeras_max)
        & hl.is_defined(mt_cols[args.chimeras_col]),
        mt_cols.failing_samples_qc.append(
            "failing_chimeras"), mt_cols.failing_samples_qc))

    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.contamination_col] > args.contamination_max)
        & hl.is_defined(mt_cols[args.contamination_col]),
        mt_cols.failing_samples_qc.append(
            "failing_contamination"), mt_cols.failing_samples_qc))

    failing_chim = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_chimeras")))
    miss_chim = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.chimeras_col]))))
    failing_contam = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_contamination")))
    miss_contam = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.contamination_col]))))

    logging.info(
        f"Number of samples failing on chimeras % > {args.chimeras_max}: {failing_chim}"
    )
    logging.info(f"Number of samples missing chimeras %: {miss_chim}")
    logging.info(
        f"Number of samples failing on contamination % > {args.contamination_max}: {failing_contam}"
    )
    logging.info(f"Number of samples missing contamination %: {miss_contam}")

    chim_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.chimeras_col]))
    cont_stats = mt_cols.aggregate(
        hl.agg.stats(mt_cols[args.contamination_col]))
    logging.info(f"Chimeras statistics: {chim_stats}")
    logging.info(f"Contamination statistics: {cont_stats}")

    ###############################################
    # Find samples failing on sex-aware call rate #
    ###############################################
    if args.sample_call_rate is not None:
        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            (mt_cols.sexaware_sample_call_rate < args.sample_call_rate)
            & hl.is_defined(mt_cols.sexaware_sample_call_rate),
            mt_cols.failing_samples_qc.append(
                "failing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sexaware_sample_call_rate)),
            mt_cols.failing_samples_qc.append(
                "missing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        failing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(
                    "failing_sexaware_sample_call_rate")))
        missing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(
                    "missing_sexaware_sample_call_rate")))

        logging.info(
            f"Number of samples failing on sex-aware call rate > {args.sample_call_rate}: {failing_cr}"
        )
        logging.info(
            f"Number of samples missing sex-aware call rate : {missing_cr}")

        cr_stats = mt_cols.aggregate(
            hl.agg.stats(mt_cols.sexaware_sample_call_rate))

        logging.info(f"Sex-aware call rate statistics: {cr_stats}")

    ######################################################################################
    # Find samples failing per-cohort on titv, het_homvar ratio, indel, and # singletons #
    ######################################################################################
    if args.batch_col_name is not None:
        batch_none = mt_cols.aggregate(
            hl.agg.count_where(~(hl.is_defined(mt_cols[args.batch_col_name]))))
        mt_cols = mt_cols.annotate(
            **{
                args.batch_col_name:
                hl.or_else(mt_cols[args.batch_col_name], "no_batch_info")
            })

        if batch_none > 0:
            logging.info(
                f"Warning- {batch_none} samples have batch undefined. These samples will be grouped in one"
                f"batch for sample QC (named no_batch_info).")
            mt_cols.filter(mt_cols[args.batch_col_name] ==
                           "no_batch_info").s.show(batch_none + 1)

        batch_set = mt_cols.aggregate(
            hl.agg.collect_as_set(mt_cols[args.batch_col_name]))
    else:
        args.batch_col_name = "mock_batch_col"
        mt_cols = mt_cols.annotate(mock_batch_col="all")
        batch_set = ["all"]

    # Convert batch strings to numeric values, create label for plotting
    batch_set_numeric = list(range(len(batch_set)))
    batch_key = list(zip(batch_set, batch_set_numeric))

    mt_cols = mt_cols.annotate(plot_batch=0)
    for batch in batch_key:
        mt_cols = mt_cols.annotate(
            plot_batch=hl.cond(mt_cols[args.batch_col_name] == batch[0],
                               batch[1], mt_cols.plot_batch))
    # Jitter only needs to be computed once, after plot_batch is final
    mt_cols = mt_cols.annotate(plot_batch_jitter=mt_cols.plot_batch +
                               hl.rand_unif(-0.3, 0.3))

    batch_thresholds = {}
    batch_statistics = {}
    for measure in [
            'r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton'
    ]:
        logging.info(f"Performing sample QC for measure {measure}")

        # Instantiate/reset box plot label
        mt_cols = mt_cols.annotate(boxplot_label=mt_cols[args.batch_col_name])

        batch_thresholds[measure] = {}
        batch_statistics[measure] = {}

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sample_qc[measure])),
            mt_cols.failing_samples_qc.append(f"missing_{measure}"),
            mt_cols.failing_samples_qc))

        for batch in batch_set:
            # See if values exist at all for all values
            defined_values = mt_cols.aggregate(
                hl.agg.count_where(hl.is_defined(mt_cols.sample_qc[measure])))

            if defined_values > 0:
                # Get mean and standard deviation for each measure, for each batch's samples
                stats = mt_cols.aggregate(
                    hl.agg.filter(mt_cols[args.batch_col_name] == batch,
                                  hl.agg.stats(mt_cols.sample_qc[measure])))

                # Get cutoffs for each measure
                cutoff_upper = stats.mean + (args.sampleqc_sd_threshold *
                                             stats.stdev)
                cutoff_lower = stats.mean - (args.sampleqc_sd_threshold *
                                             stats.stdev)

                if measure == "n_singleton":
                    logging.info(
                        f"Max number of singletons for batch {batch}: {stats.max}"
                    )

                mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    mt_cols.failing_samples_qc.append(
                        f"failing_{measure}"), mt_cols.failing_samples_qc))

                mt_cols = mt_cols.annotate(boxplot_label=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch), "outlier",
                    mt_cols.boxplot_label))

                # Collect thresholds and statistics for each batch
                batch_thresholds[measure][batch] = {
                    'min_thresh': cutoff_lower,
                    'max_thresh': cutoff_upper
                }
                batch_statistics[measure][batch] = stats

            else:
                logging.error(
                    f"Error- no defined values for measure {measure}. NAs can be introduced by division by "
                    f"zero. Samples not filtered on {measure}!")

        # Create plot for measure for each batch
        output_file(f"{datestr}_samples_qc_plots_{measure}.html")
        p = hl.plot.scatter(mt_cols.plot_batch_jitter,
                            mt_cols.sample_qc[measure],
                            label=mt_cols.boxplot_label,
                            title=f"{measure} values split by batch.")
        save(p)

    ##########################
    # Report failing samples #
    ##########################
    for measure in [
            'r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton'
    ]:
        failing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"failing_{measure}")))
        missing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"missing_{measure}")))
        logging.info(
            f"Number of samples failing on {measure}: {failing_count}")
        logging.info(f"Number of samples missing {measure}: {missing_count}")

    failing_any = mt_cols.aggregate(
        hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0))
    logging.info(
        f"Number of samples failing samples QC on any measure: {failing_any}")

    if args.pheno_col is not None:
        cases_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == True,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        controls_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == False,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        logging.info(f"Cases failing QC: {cases_failing}")
        logging.info(f"Controls failing QC: {controls_failing}")

    #######################################################################################################
    # Annotate original (unfiltered) matrix table with failing samples QC information + sample QC measure #
    #######################################################################################################
    mt_to_annotate = mt_to_annotate.annotate_cols(
        sample_qc=mt_cols[mt_to_annotate.s].sample_qc)
    mt_to_annotate = mt_to_annotate.annotate_cols(
        failing_samples_qc=mt_cols[mt_to_annotate.s].failing_samples_qc)

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_batches=batch_statistics)
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_chim_cont={
            'chimeras': chim_stats,
            'contamination': cont_stats
        })
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_thresholds={
            'chimeras_max': str(args.chimeras_max),
            'contamination_max': str(args.contamination_max),
            'deviation_multiplier_threshold': str(args.sampleqc_sd_threshold),
            'batches': str(batch_set),
            'batch_cohort_name': str(args.batch_col_name)
        })

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_batch_thresholds=batch_thresholds)

    return mt_to_annotate
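
# A hedged usage sketch (not from the original pipeline): samples_qc reads only
# a handful of attributes from args, so a SimpleNamespace suffices. The column
# names and the input tables mt_filtered/mt_raw below are hypothetical.
from types import SimpleNamespace

qc_args = SimpleNamespace(
    chimeras_col='pct_chimeras',            # hypothetical metadata columns
    chimeras_max=0.05,
    contamination_col='pct_contamination',
    contamination_max=0.05,
    sample_call_rate=None,                  # skip sex-aware call rate checks
    batch_col_name=None,                    # group all samples in one mock batch
    sampleqc_sd_threshold=4,
    pheno_col=None)

# annotated_mt = samples_qc(mt_filtered, mt_raw, qc_args)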
Example #24
def variant_and_sample_qc(mt_path):
    mt = hl.read_matrix_table(mt_path)
    hl.sample_qc(hl.variant_qc(mt))._force_count_rows()
Example #25
    def test_sample_qc(self):
        data = [
            {
                'v': '1:1:A:T',
                's': '1',
                'GT': hl.Call([0, 0]),
                'GQ': 10,
                'DP': 0
            },
            {
                'v': '1:2:A:T,C',
                's': '1',
                'GT': hl.Call([1]),
                'GQ': 15,
                'DP': 5
            },
            {
                'v': '1:3:A:G,C',
                's': '1',
                'GT': hl.Call([2, 2]),
                'GQ': 10,
                'DP': 4
            },
            {
                'v': '1:4:G:A',
                's': '1',
                'GT': hl.Call([0, 1]),
                'GQ': None,
                'DP': 5
            },
            {
                'v': '1:5:C:CG',
                's': '1',
                'GT': hl.Call([1, 1]),
                'GQ': 20,
                'DP': 3
            },
            {
                'v': '1:6:C:A',
                's': '1',
                'GT': None,
                'GQ': 0,
                'DP': None
            },
        ]

        ht = hl.Table.parallelize(
            data,
            hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}'))
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = ht.to_matrix_table(['locus', 'alleles'], ['s'])
        mt = hl.sample_qc(mt, 'sqc')
        r = mt.cols().select('sqc').collect()

        self.assertAlmostEqual(r[0].sqc.gq_stats.mean, 11)
        self.assertAlmostEqual(r[0].sqc.gq_stats.stdev, 6.6332495807)
        self.assertAlmostEqual(r[0].sqc.gq_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.gq_stats.max, 20)
        self.assertAlmostEqual(r[0].sqc.dp_stats.mean, 3.399999999)
        self.assertAlmostEqual(r[0].sqc.dp_stats.stdev, 1.8547236990)
        self.assertAlmostEqual(r[0].sqc.dp_stats.min, 0)
        self.assertAlmostEqual(r[0].sqc.dp_stats.max, 5)
        self.assertAlmostEqual(r[0].sqc.call_rate, 0.8333333333)
        self.assertEqual(r[0].sqc.n_called, 5)
        self.assertEqual(r[0].sqc.n_not_called, 1)
        self.assertEqual(r[0].sqc.n_hom_ref, 1)
        self.assertEqual(r[0].sqc.n_het, 1)
        self.assertEqual(r[0].sqc.n_hom_var, 3)
        self.assertEqual(r[0].sqc.n_insertion, 2)
        self.assertEqual(r[0].sqc.n_deletion, 0)
        self.assertEqual(r[0].sqc.n_singleton, 3)
        self.assertEqual(r[0].sqc.n_transition, 1)
        self.assertEqual(r[0].sqc.n_transversion, 3)
        self.assertEqual(r[0].sqc.n_star, 0)
        self.assertEqual(r[0].sqc.n_non_ref, 4)
        self.assertAlmostEqual(r[0].sqc.r_ti_tv, 0.333333333)
        self.assertAlmostEqual(r[0].sqc.r_het_hom_var, 0.3333333333)
        self.assertAlmostEqual(r[0].sqc.r_insertion_deletion, None)
Example #26
def main(args):
    global output_prefix
    output_prefix = args.output_dir.rstrip("/") + "/" + splitext(
        basename(args.input_mt))[0]

    if args.compute_qc_mt:
        qc_mt = get_qc_mt(hl.read_matrix_table(args.input_mt))
        qc_mt = qc_mt.repartition(n_partitions=200)
        qc_mt.write(path('qc.mt'), overwrite=args.overwrite)

    if args.compute_qc_metrics:
        logger.info("Computing sample QC")
        mt = filter_to_autosomes(hl.read_matrix_table(args.input_mt))
        strats = {
            'bi_allelic': bi_allelic_expr(mt),
            'multi_allelic': ~bi_allelic_expr(mt)
        }
        for strat, filter_expr in strats.items():
            strat_sample_qc_ht = hl.sample_qc(
                mt.filter_rows(filter_expr)).cols()
            strat_sample_qc_ht.write(path(f'{strat}_sample_qc.ht'),
                                     overwrite=args.overwrite)
        strat_hts = [
            hl.read_table(path(f'{strat}_sample_qc.ht')) for strat in strats
        ]
        sample_qc_ht = strat_hts.pop()
        sample_qc_ht = sample_qc_ht.select(
            sample_qc=merge_sample_qc_expr([sample_qc_ht.sample_qc] + [
                strat_hts[i][sample_qc_ht.key].sample_qc
                for i in range(0, len(strat_hts))
            ]))
        sample_qc_ht.write(path('sample_qc.ht'), overwrite=args.overwrite)

    if args.compute_callrate_mt:
        callrate_mt = compute_callrate_mt(
            hl.read_matrix_table(args.input_mt),
            hl.import_locus_intervals(exome_calling_intervals_path))
        callrate_mt.write(path('callrate.mt'), args.overwrite)

    if args.run_platform_pca:
        eigenvalues, scores_ht, loadings_ht = run_platform_pca(
            hl.read_matrix_table(path('callrate.mt')))
        scores_ht.write(path('platform_pca_scores.ht'),
                        overwrite=args.overwrite)
        loadings_ht.write(path('platform_pca_loadings.ht'),
                          overwrite=args.overwrite)

    if args.assign_platforms:
        platform_ht = assign_platform_from_pcs(
            hl.read_table(path('platform_pca_scores.ht')),
            hdbscan_min_cluster_size=args.hdbscan_min_cluster_size,
            hdbscan_min_samples=args.hdbscan_min_samples)
        platform_ht.write(f'{output_prefix}.platform_pca_results.ht',
                          overwrite=args.overwrite)

    if args.impute_sex:
        sex_ht = infer_sex(hl.read_matrix_table(path('qc.mt')),
                           hl.read_matrix_table(args.input_mt),
                           hl.read_table(path('platform_pca_results.ht')),
                           args.male_threshold, args.female_threshold,
                           args.min_male_y_sites_called,
                           args.max_y_female_call_rate,
                           args.min_y_male_call_rate)
        sex_ht.write(path('sex.ht'), overwrite=args.overwrite)

    if args.run_pc_relate:
        logger.info('Running PCA for PC-Relate')
        qc_mt = hl.read_matrix_table(path('qc.mt')).unfilter_entries()
        eig, scores, _ = hl.hwe_normalized_pca(qc_mt.GT,
                                               k=10,
                                               compute_loadings=False)
        scores.write(path('pruned.pca_scores.ht'), args.overwrite)

        logger.info('Running PC-Relate')
        logger.warn(
            "PC-relate requires SSDs and doesn't work with preemptible workers!"
        )
        scores = hl.read_table(path('pruned.pca_scores.ht'))
        relatedness_ht = hl.pc_relate(qc_mt.GT,
                                      min_individual_maf=0.05,
                                      scores_expr=scores[qc_mt.col_key].scores,
                                      block_size=4096,
                                      min_kinship=args.min_emission_kinship,
                                      statistics='all')
        relatedness_ht.write(path('relatedness.ht'), args.overwrite)

    if args.filter_dups:
        logger.info("Filtering duplicate samples")
        sample_qc_ht = hl.read_table(path('sample_qc.ht'))
        samples_rankings_ht = sample_qc_ht.select(
            rank=-1 * sample_qc_ht.sample_qc.dp_stats.mean)
        dups_ht = filter_duplicate_samples(
            hl.read_table(path('relatedness.ht')), samples_rankings_ht)
        dups_ht.write(path('duplicates.ht'), overwrite=args.overwrite)

    if args.infer_families:
        logger.info("Inferring families")
        duplicates_ht = hl.read_table(path('duplicates.ht'))
        dups_to_remove = duplicates_ht.aggregate(
            hl.agg.explode(lambda x: hl.agg.collect_as_set(x.s),
                           duplicates_ht.filtered))
        ped = infer_families(hl.read_table(path('relatedness.ht')),
                             hl.read_table(path('sex.ht')), dups_to_remove)
        ped.write(path('pedigree.ped'))

    if args.filter_related_samples:
        logger.info("Filtering related samples")
        related_pairs_ht, related_pairs_tie_breaker = rank_related_samples(
            hl.read_table(path('relatedness.ht')), hl.read_table(args.meta),
            hl.read_table(path('sample_qc.ht')),
            hl.import_fam(path('pedigree.ped'), delimiter="\t"))

        related_samples_to_drop_ht = hl.maximal_independent_set(
            related_pairs_ht.i,
            related_pairs_ht.j,
            keep=False,
            tie_breaker=related_pairs_tie_breaker)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by()
        related_samples_to_drop_ht = related_samples_to_drop_ht.select(
            **related_samples_to_drop_ht.node)
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by('s')
        related_samples_to_drop_ht.write(path('related_samples_to_drop.ht'),
                                         overwrite=args.overwrite)
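        # For reference, a minimal sketch of a rank-based tie breaker for
        # hl.maximal_independent_set (an assumption, not the function actually
        # returned by rank_related_samples): it must return a negative number
        # when the first node is preferred, i.e. should be kept.
        def example_rank_tie_breaker(l, r):
            # Lower rank = better sample, so a negative difference keeps `l`.
            return l.rank - r.rank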

    if args.run_pca:
        logger.info("Running population PCA")
        pca_evals, pop_pca_scores_ht, pop_pca_loadings_ht = run_pca_with_relateds(
            hl.read_matrix_table(path('qc.mt')),
            hl.read_table(path('related_samples_to_drop.ht')), args.n_pcs)
        pop_pca_loadings_ht.write(path('pop_pca_loadings.ht'), args.overwrite)
        pop_pca_scores_ht.write(path('pop_pca_scores.ht'), args.overwrite)

    if args.assign_pops:
        logger.info("Assigning global population labels")
        pop_pca_scores_ht = hl.read_table(path("pop_pca_scores.ht"))
        gnomad_meta_ht = get_gnomad_meta('exomes').select("pop")[
            pop_pca_scores_ht.key]
        pop_pca_scores_ht = pop_pca_scores_ht.annotate(known_pop=hl.or_missing(
            gnomad_meta_ht.pop != "oth", gnomad_meta_ht.pop))
        pop_ht, pops_rf_model = assign_population_pcs(
            pop_pca_scores_ht,
            pc_cols=pop_pca_scores_ht.scores[:args.n_pcs],
            known_col='known_pop',
            min_prob=args.min_pop_prob)

        pop_ht.write(path('pop.ht'), args.overwrite)
        with hl.hadoop_open(path('pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(pops_rf_model, out)
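        # Sketch: the pickled random forest can be restored later with the
        # mirror-image call (same hl.hadoop_open helper, read mode):
        # with hl.hadoop_open(path('pop_rf_model.pkl'), 'rb') as f:
        #     pops_rf_model = pickle.load(f)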

    if args.assign_subpops:
        qc_mt = hl.read_matrix_table(path('qc.mt'))
        pop_ht = hl.read_table(path('pop.ht'))
        meta_ht = hl.read_table(args.meta)[qc_mt.col_key]
        qc_mt = qc_mt.annotate_cols(pop=pop_ht[qc_mt.col_key].pop,
                                    is_case=meta_ht.is_case,
                                    country=meta_ht.country)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')), threshold=0.01)
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        qc_mt = hl.filter_intervals(qc_mt,
                                    platform_specific_intervals,
                                    keep=False)

        assign_and_write_subpops(
            qc_mt,
            hl.read_table(path('related_samples_to_drop.ht')),
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_pcs,
            min_pop_prob=args.min_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='country',
            include_in_pop_count=qc_mt.is_case)

    if args.run_kgp_pca:
        logger.info("Joining data with 1000 Genomes")
        qc_mt = hl.read_matrix_table(
            path('qc.mt')).select_rows().select_entries("GT")
        qc_mt = qc_mt.select_cols(known_pop=hl.null(hl.tstr),
                                  known_subpop=hl.null(hl.tstr))
        qc_mt = qc_mt.key_cols_by(_kgp=False, *qc_mt.col_key)

        kgp_mt = hl.read_matrix_table(
            kgp_phase3_genotypes_mt_path()).select_rows()
        kgp_mt = kgp_mt.select_cols(known_pop=kgp_mt.super_pops.get(
            kgp_mt.population, "oth").lower(),
                                    known_subpop=kgp_mt.population.lower())
        kgp_mt = kgp_mt.filter_rows(hl.is_defined(
            qc_mt.rows()[kgp_mt.row_key]))
        kgp_mt = filter_rows_for_qc(kgp_mt)
        kgp_mt = kgp_mt.key_cols_by(_kgp=True, *kgp_mt.col_key)

        union_kgp_qc_mt = qc_mt.union_cols(kgp_mt)
        union_kgp_qc_mt.write(path('union_kgp_qc.mt'),
                              overwrite=args.overwrite)

        logger.info("Computing PCA on data with 1000 Genomes")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)
        pca_evals, union_kgp_pca_scores_ht, union_kgp_pca_loadings_ht = run_pca_with_relateds(
            union_kgp_qc_mt, related_samples_to_drop_ht, args.n_kgp_pcs)
        union_kgp_pca_loadings_ht.write(path('union_kgp_pca_loadings.ht'),
                                        args.overwrite)
        union_kgp_pca_scores_ht.write(path('union_kgp_pca_scores.ht'),
                                      args.overwrite)

    if args.assign_pops_kgp:
        logger.info("Assigning populations based on 1000 Genomes labels")
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        union_kgp_pca_scores_ht = hl.read_table(
            path('union_kgp_pca_scores.ht'))
        union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.annotate(
            known_pop=union_kgp_qc_mt[union_kgp_pca_scores_ht.key].known_pop)
        union_kgp_pop_ht, union_kgp_pop_rf_model = assign_population_pcs(
            union_kgp_pca_scores_ht,
            pc_cols=union_kgp_pca_scores_ht.scores[:args.n_kgp_pcs],
            known_col='known_pop',
            min_prob=args.min_kgp_pop_prob)

        union_kgp_pop_ht.write(path('union_kgp_pop.ht'), args.overwrite)

        with hl.hadoop_open(path('union_kgp_pop_rf_model.pkl'), 'wb') as out:
            pickle.dump(union_kgp_pop_rf_model, out)

    if args.assign_subpops_kgp:
        union_kgp_qc_mt = hl.read_matrix_table(path('union_kgp_qc.mt'))
        meta_ht = hl.read_table(args.meta)
        union_kgp_pop_ht = hl.read_table(path('union_kgp_pop.ht'))
        union_kgp_qc_mt = union_kgp_qc_mt.annotate_cols(
            is_case=meta_ht[union_kgp_qc_mt.col_key].is_case,
            pop=union_kgp_pop_ht[union_kgp_qc_mt.col_key].pop)

        platform_specific_intervals = get_platform_specific_intervals(
            hl.read_table(path('platform_pca_loadings.ht')))
        logger.info(
            f'Excluding {len(platform_specific_intervals)} platform-specific intervals for subpop PCA.'
        )
        union_kgp_qc_mt = hl.filter_intervals(union_kgp_qc_mt,
                                              platform_specific_intervals,
                                              keep=False)

        related_samples_to_drop_ht = hl.read_table(
            path('related_samples_to_drop.ht'))
        related_samples_to_drop_ht = related_samples_to_drop_ht.key_by(
            _kgp=False, *related_samples_to_drop_ht.key)

        assign_and_write_subpops(
            union_kgp_qc_mt,
            related_samples_to_drop_ht,
            min_samples_for_subpop=args.min_samples_for_subpop,
            n_pcs=args.n_kgp_pcs,
            min_pop_prob=args.min_kgp_pop_prob,
            overwrite=args.overwrite,
            pop_ann='pop',
            subpop_ann='known_subpop',
            include_in_pop_count=union_kgp_qc_mt.is_case,
            files_prefix='union_kgp_')

    if args.apply_stratified_filters:
        logger.info("Computing stratified QC")
        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            pop_ht = hl.read_table(path('pop.ht'))
            platform_ht = hl.read_table(path('platform_pca_results.ht'))
            sample_qc_ht = sample_qc_ht.annotate(
                qc_pop=pop_ht[sample_qc_ht.key].pop,
                qc_platform=platform_ht[sample_qc_ht.key].qc_platform)
            stratified_metrics_ht = compute_stratified_metrics_filter(
                sample_qc_ht, args.filtering_qc_metrics.split(","),
                ['qc_pop', 'qc_platform'])
            stratified_metrics_ht.write(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'),
                overwrite=args.overwrite)
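            # Sketch of assumed downstream usage: the gnomAD helper annotates
            # each sample with a `qc_metrics_filters` set, so samples failing
            # at least one stratified cutoff can be selected lazily like this.
            failing_samples_ht = stratified_metrics_ht.filter(
                hl.len(stratified_metrics_ht.qc_metrics_filters) > 0)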

    if args.write_full_meta:
        logger.info("Writing metadata table")

        # List all tables to join with the base meta
        meta_annotation_hts = [
            hl.read_table(path('platform_pca_results.ht')).rename(
                {'scores': 'platform_pc_scores'}),
            hl.read_table(path('sex.ht')),
            flatten_duplicate_samples_ht(hl.read_table(path('duplicates.ht'))),
            hl.read_table(path('related_samples_to_drop.ht')).select(
                related_filtered=True),
            hl.read_table(path('pop_pca_scores.ht')).rename(
                {'scores': 'pop_pc_scores'}),
            hl.read_table(path('pop.ht')).select('pop'),
            hl.read_table(path('nfe.pca_scores.ht')).rename(
                {'scores': 'nfe_pc_scores'}),
            hl.read_table(path('subpops.nfe.ht')).select('subpop')
        ]

        # union_kgp_pops_ht = hl.read_table(path('union_kgp_pops.ht'))
        # union_kgp_pops_ht = union_kgp_pops_ht.filter(~union_kgp_pops_ht._kgp).key_by('s')
        # union_kgp_pops_ht = union_kgp_pops_ht.select(kgp_pop=union_kgp_pops_ht.pop)
        # meta_annotation_hts.append(union_kgp_pops_ht)
        #
        # union_kgp_pca_scores_ht = hl.read_table(path('union_kgp_pca_scores.ht')).rename({'scores': 'kgp_pop_pc_scores'})
        # union_kgp_pca_scores_ht = union_kgp_pca_scores_ht.filter(~union_kgp_pca_scores_ht._kgp).key_by('s')
        # meta_annotation_hts.append(union_kgp_pca_scores_ht)

        gnomad_meta_ht = get_gnomad_meta('exomes')
        gnomad_meta_ht = gnomad_meta_ht.select(
            gnomad_pop=gnomad_meta_ht.pop, gnomad_subpop=gnomad_meta_ht.subpop)
        meta_annotation_hts.append(gnomad_meta_ht)

        for variant_class_prefix in ['', 'bi_allelic_', 'multi_allelic_']:
            sample_qc_ht = hl.read_table(
                path(f'{variant_class_prefix}sample_qc.ht'))
            stratified_metrics_filters_ht = hl.read_table(
                path(f'{variant_class_prefix}stratified_metrics_filters.ht'))
            if variant_class_prefix:
                sample_qc_ht = sample_qc_ht.rename(
                    {'sample_qc': f'{variant_class_prefix}sample_qc'})
                stratified_metrics_filters_ht = stratified_metrics_filters_ht.rename(
                    {
                        f: f'{variant_class_prefix}{f}'
                        for f in list(stratified_metrics_filters_ht.globals) +
                        list(stratified_metrics_filters_ht.row_value)
                    })
            meta_annotation_hts.extend(
                [sample_qc_ht, stratified_metrics_filters_ht])

        meta_ht = hl.read_table(args.meta)
        meta_ht = meta_ht.annotate_globals(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht.index_globals().items()
            })

        meta_ht = meta_ht.annotate(
            **{
                name: expr
                for ann_ht in meta_annotation_hts
                for name, expr in ann_ht[meta_ht.key].items()
            })

        filtering_col_prefix = '' if args.filtering_variant_class == 'all' else args.filtering_variant_class + "_"
        meta_ht = meta_ht.annotate_globals(
            filtering_variant_class=args.filtering_variant_class)
        meta_ht = meta_ht.annotate(sample_filters=add_filters_expr(
            filters={
                "ambiguous sex": hl.is_missing(meta_ht.is_female),
                'call_rate': meta_ht.sample_qc.call_rate < args.min_call_rate,
                'duplicate': hl.is_defined(meta_ht.dup_filtered)
                & meta_ht.dup_filtered,
                'related': meta_ht.related_filtered
            },
            current_filters=meta_ht[
                f'{filtering_col_prefix}pop_platform_filters']))

        meta_ht.write(path('full_meta.ht'), overwrite=args.overwrite)
Example #27
0
mt_snp = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))  #keep SNPs: ref (0) to alternates (1)
#Check SNPs: unique possible reference (ref) and alternate allele calls (alt) from entire dataset (all samples)
unique_allelecalls = mt_snp.aggregate_rows(
    hl.struct(ref=hl.agg.collect_as_set(mt_snp.alleles[0]),
              alt=hl.agg.collect_as_set(mt_snp.alleles[1])))
pprint(unique_allelecalls)

#Check SNPs: show all lengths of the allele arrays (including ref and alternates)
a = mt_snp.aggregate_rows(hl.agg.collect_as_set(hl.len(mt_snp.alleles)))
pprint(a)
mt_AF = mt.filter_rows(mt.variant_qc.AF[1] >= 0.01)

######## 3. QUALITY CONTROL SAMPLES
######## 3.1 Filter samples for outliers more than 6 * SD from the mean (Part 1; cutoffs applied in the sketch below)
# Calculate sample statistics
mt = hl.sample_qc(mt)
# Calculate statistics on sample statistics
stats_singleton = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_singleton))
stats_ti_tv = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_ti_tv))
stats_het_hom_var = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.r_het_hom_var))
stats_het = mt.aggregate_cols(hl.agg.stats(mt.sample_qc.n_het))
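# A minimal sketch of applying the 6 * SD cutoffs from the stats above; the
# choice of metrics and one-sided bounds is illustrative, and the result is
# bound to a new name so the pipeline below is unchanged.
mt_no_outliers = mt.filter_cols(
    (mt.sample_qc.n_singleton < stats_singleton.mean + 6 * stats_singleton.stdev)
    & (mt.sample_qc.r_het_hom_var < stats_het_hom_var.mean + 6 * stats_het_hom_var.stdev)
    & (mt.sample_qc.n_het < stats_het.mean + 6 * stats_het.stdev))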

######## 3.2 Sex check on chromosome X (inbreeding coefficient)
# Determine sex from GT calls in sex chromosomes
t = hl.impute_sex(mt.GT)
# Only keep those where genetic sex matches self-reported Sex
mt = mt.filter_cols(t[mt.s].is_female == mt.is_female)

######## 3.3 Check for genetic relationship / "duplicates"
# Calculate identity-by-descent matrix
mt_relatedness = hl.identity_by_descent(mt)
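# Sketch (threshold assumed): hl.identity_by_descent returns one row per sample
# pair with an `ibd` struct; PI_HAT near 1 flags duplicates or monozygotic
# twins, while values near 0.5 indicate first-degree relatives.
likely_duplicates = mt_relatedness.filter(mt_relatedness.ibd.PI_HAT > 0.9)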
Example #28
0
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(number_of_pcs):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png',
                                    'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(
            mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim + 1}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend('null' for _ in range(
                len(heterozygous_samples) - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend('null' for _ in range(
                len(homozygous_alternate_samples) - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples':
            heterozygous_samples,
            'homozygous_alternate_samples':
            homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #29
0
    AMR_AF=mt_split.info.AMR_AF[mt_split.a_index - 1],
    SAS_AF=mt_split.info.SAS_AF[mt_split.a_index - 1],
    DP=mt_split.info.DP,
    AA=mt_split.info.AA,
    VT=(hl.case().when((mt_split.alleles[0].length() == 1)
                       & (mt_split.alleles[1].length() == 1), 'SNP').when(
                           mt_split.alleles[0].matches('<CN*>')
                           | mt_split.alleles[1].matches('<CN*>'),
                           'SV').default('INDEL')),
    EX_TARGET=mt_split.info.EX_TARGET,
    MULTI_ALLELIC=mt_split.info.MULTI_ALLELIC))

n_rows, n_cols = mt_split.count()
n_partitions = mt_split.n_partitions()

mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)

mt_split = mt_split.annotate_globals(
    metadata=hl.struct(name='1000_Genomes_phase3_autosomes',
                       reference_genome='GRCh37',
                       n_rows=n_rows,
                       n_cols=n_cols,
                       n_partitions=n_partitions))

mt_split.write(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt',
    overwrite=True)

mt = hl.read_matrix_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_autosomes.GRCh37.mt')
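# Sketch: confirm the metadata globals survived the round trip (hl.eval
# materializes a global expression into a Python value).
print(hl.eval(mt.metadata))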
Example #30
0
    #3. Split multi
    print("3. Split multi")
    mt_split = hl.split_multi_hts(mt_result, keep_star=False)

    # 4. annotate SNPs,indels
    print('Annotating rows with snp and indel info')
    mt = mt_split.annotate_rows(Variant_Type=hl.cond(
        (hl.is_snp(mt_split.alleles[0], mt_split.alleles[1])), "SNP",
        hl.cond(
            hl.is_insertion(mt_split.alleles[0], mt_split.alleles[1]), "INDEL",
            hl.cond(hl.is_deletion(mt_split.alleles[0], mt_split.alleles[1]),
                    "INDEL", "Other"))))

    #4. Sample qc and variant qc
    print("4. Sample qc and variant qc ")
    mt_sampleqc = hl.sample_qc(mt, name='sample_QC_Hail')
    mt2 = hl.variant_qc(mt_sampleqc, name='variant_QC_Hail')

    #5.Annotate COMMON AND RARE VARIANTS to apply separate filters
    print("Annotate COMMON AND RARE VARIANTS to apply separate filters")
    #mt_common = mt_filtered.filter_rows(mt_filtered.variant_qc.AF[1] > 0.05)
    mt2 = mt2.annotate_rows(
        maf=hl.cond(mt2.variant_QC_Hail.AF[1] < 0.01, "< 1%",
                    hl.cond(mt2.variant_QC_Hail.AF[1] < 0.05, "1%-5%", ">5%")))

    #6. Common variants  filtering:
    print("6. Common variants  filtering:")
    mt = mt2
    mt_filtered_variants_common = mt.filter_rows(
        (mt.maf == "< 1%") |  #let all rare variants pass
        ((mt.maf != "< 1%") & ((mt.variant_QC_Hail.p_value_hwe > 10**-5) &
Example #31
0
def sample_qc():
    hl.sample_qc(get_mt()).cols()._force_count()

variants_to_filter = variants_to_filter.key_by(
    locus=variants_to_filter.locus, alleles=variants_to_filter.alleles)

sample_annotations = hl.read_table(PHENOTYPES_TABLE)

mt = hl.read_matrix_table(MT)
mt = mt.filter_rows(hl.is_defined(variants_to_filter[mt.row_key]))
mt = mt.annotate_cols(phenotype=sample_annotations[mt.s])

n = mt.count()

pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])

mt = hl.sample_qc(mt, name='qc_padded_ice')

TARGET_INTERVALS = 'gs://raw_data_bipolar_dalio_w1_w2_hail_02/inputs/ice_coding_v1_targets.interval_list'
# Import the interval lists for the LCRs.
target_intervals = hl.import_locus_intervals(TARGET_INTERVALS,
                                             reference_genome='GRCh38')
mt = mt.annotate_rows(
    not_in_target_intervals=~hl.is_defined(target_intervals[mt.locus]))
mt = mt.filter_rows(mt.not_in_target_intervals, keep=False)

n = mt.count()

pprint('n samples:')
print(n[1])
pprint('n variants:')
print(n[0])