Esempio n. 1
0
def maf_filter(mt, maf, filter_ac0_after_pruning=False):
    """
    Annotate a matrix table with variant QC metrics and keep only the rows whose
    ``variant_qc.AF[1]`` is greater than ``maf``.

    :param mt: matrix table to prune (should be LD pruned and have x chrom removed).
    :param maf: frequency threshold; it is also stored in the
        ``maf_threshold_LDpruning`` global of the returned table.
    :param filter_ac0_after_pruning: if True, variant QC is recomputed after the
        frequency filter and rows whose summed ``variant_qc.AC`` equals 0 are removed.
    :return: the filtered matrix table.
    """
    # hl.variant_qc() provides the allele frequencies used by the filter below
    qc_mt = hl.variant_qc(mt)

    logging.info(f'Filtering out variants with minor allele frequency < {maf}')
    # NOTE(review): AF[1] is the frequency of the allele at index 1; it is only the
    # minor allele frequency when that allele is the rarer one - confirm intent.
    passes_frequency = qc_mt.row.variant_qc.AF[1] > maf
    pruned = qc_mt.filter_rows(passes_frequency, keep=True)
    pruned = pruned.annotate_globals(maf_threshold_LDpruning=maf)

    if not filter_ac0_after_pruning:
        logging.info("MAF pruned mt count:" + str(pruned.count()))
        return pruned

    logging.info(
        'Removing variants with alt allele count = 0 (monomorphic variants).'
    )
    # Recompute the QC struct on the already-filtered table before testing AC
    pruned = hl.variant_qc(pruned)
    nothing_counted = hl.sum(pruned.row.variant_qc.AC) == hl.int(0)
    pruned = pruned.filter_rows(nothing_counted, keep=False)
    remaining = pruned.count()
    logging.info(
        f"MT count after removing monomorphic variants and MAF filtering: {remaining}"
    )
    return pruned
Esempio n. 2
0
def variant_and_sample_qc_nested_with_filters_2(mt_path):
    """Alternate variant and sample call-rate filters twice, then count.

    The matrix table read from ``mt_path`` goes through a variant QC / row
    filter followed by a sample QC / column filter, first with a call-rate
    threshold of .8 and then with .98. The result of the final ``count()``
    is discarded.
    """
    dataset = hl.read_matrix_table(mt_path)
    for min_call_rate in (.8, .98):
        dataset = hl.variant_qc(dataset)
        dataset = dataset.filter_rows(
            dataset.variant_qc.call_rate >= min_call_rate)
        dataset = hl.sample_qc(dataset)
        dataset = dataset.filter_cols(
            dataset.sample_qc.call_rate >= min_call_rate)
    dataset.count()
Esempio n. 3
0
def query(output):
    """Query script entry point.

    Reads the matrix table at ``GNOMAD_HGDP_1KG_MT``, keeps variants with a
    call rate of at least 0.99 and at least one non-reference call,
    downsamples to approximately 10k rows, runs an HWE-normalized PCA
    (k=20, with loadings) and writes the eigenvalues, scores, loadings and
    the downsampled matrix table under ``output``.

    :param output: path prefix under which all outputs are written.
    :raises ValueError: if no variant passes the QC filters.
    """

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    eigenvalues_path = f'{output}/eigenvalues_10k.csv'
    scores_path = f'{output}/scores_10k.ht'
    loadings_path = f'{output}/loadings_10k.ht'
    downsampled_mt_path = f'{output}/downsampled_mt.mt'

    # filter out variants with a call rate <0.99 and variants where there
    # is no non-reference allele called.
    mt_qc = hl.variant_qc(mt)
    filt_mt = mt_qc.filter_rows((mt_qc.variant_qc.call_rate >= 0.99)
                                & (mt_qc.variant_qc.n_non_ref >= 1))
    nrows = filt_mt.count_rows()
    if nrows == 0:
        raise ValueError(
            'No variants left after call rate / non-ref filtering')
    # Downsample the dataset to approximately 10k randomly-selected rows.
    # sample_rows() takes a proportion, which must not exceed 1: clamp it so
    # a table with fewer than 10k rows is kept whole instead of failing.
    sample_fraction = min(1.0, 10000 / nrows)
    downsampled_mt = filt_mt.sample_rows(sample_fraction, seed=12345)

    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        downsampled_mt.GT, compute_loadings=True, k=20)
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores, loadings, and downsampled matrix table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
    downsampled_mt.write(downsampled_mt_path, overwrite=True)
Esempio n. 4
0
def main(args):
    """Compute variant QC for a MatrixTable and persist it as a Hail Table.

    Reads ``args.default_ref_genome``, ``args.mt_input_path``,
    ``args.ht_output_path`` and ``args.write_to_file``. When
    ``args.write_to_file`` is truthy the written table is also exported as
    a bgzipped TSV.
    """
    hl.init(default_reference=args.default_ref_genome)

    # Annotate the input MatrixTable with the variant_qc row struct
    annotated = hl.variant_qc(hl.read_matrix_table(args.mt_input_path))

    # Keep only the QC struct on the rows, flatten it into top-level fields
    # and key the resulting table by locus and alleles
    qc_rows = annotated.select_rows('variant_qc').rows()
    qc_table = qc_rows.flatten().key_by('locus', 'alleles')

    ht_path = f'{args.ht_output_path}_variant_qc.ht'
    qc_table.write(output=ht_path)

    if args.write_to_file:
        # The export reads back the table that was just written
        written = hl.read_table(ht_path)
        written.export(f'{ht_path}_variant_qc.tsv.bgz')

    hl.stop()

    print("Finished!")
def calclulate_hail_variant_qc(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Annotate a MatrixTable with Hail's variant QC metrics.

    :param mt: the original matrixtable
    :return: result of ``hl.variant_qc`` with the metrics stored under the
        ``variant_qc`` name
    """
    return hl.variant_qc(mt, name='variant_qc')
Esempio n. 6
0
    def test_sample_and_variant_qc_call_rate(self):
        """Check that QC call rates equal ``n_called`` over the pre-filter dimension."""
        dataset = hl.import_vcf(resource('sample.vcf'))

        # Dimensions are taken before any entries are filtered out
        n_rows, n_cols = dataset.count()
        dataset = dataset.filter_entries(dataset.GQ > 5)
        dataset = hl.sample_qc(dataset)
        dataset = hl.variant_qc(dataset)

        sample_rates_match = hl.approx_equal(
            dataset.sample_qc.call_rate, dataset.sample_qc.n_called / n_rows)
        variant_rates_match = hl.approx_equal(
            dataset.variant_qc.call_rate, dataset.variant_qc.n_called / n_cols)

        assert dataset.aggregate_cols(hl.agg.all(sample_rates_match))
        assert dataset.aggregate_rows(hl.agg.all(variant_rates_match))
Esempio n. 7
0
    def test_variant_qc(self):
        """Verify each ``hl.variant_qc`` statistic on a biallelic and a triallelic variant."""

        def entry(variant, sample, call, gq, dp):
            # One genotype record in the shape passed to Table.parallelize below
            return {'v': variant, 's': sample, 'GT': call, 'GQ': gq, 'DP': dp}

        snp, multi = '1:1:A:T', '1:2:A:T,C'
        data = [
            entry(snp, '1', hl.Call([0, 0]), 10, 0),
            entry(snp, '2', hl.Call([1, 1]), 10, 5),
            entry(snp, '3', hl.Call([0, 1]), 11, 100),
            entry(snp, '4', None, None, 100),
            entry(multi, '1', hl.Call([1, 2]), 10, 5),
            entry(multi, '2', hl.Call([2, 2]), 10, 5),
            entry(multi, '3', hl.Call([0, 1]), 10, 5),
            entry(multi, '4', hl.Call([1, 1]), 10, 5),
        ]

        schema = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        ht = hl.Table.parallelize(data, schema)
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = hl.variant_qc(ht.to_matrix_table(['locus', 'alleles'], ['s']), 'vqc')
        rows = mt.rows().collect()

        # First row: the biallelic site, where one of the four samples is uncalled
        first = rows[0].vqc
        self.assertEqual(first.AF, [0.5, 0.5])
        self.assertEqual(first.AC, [3, 3])
        self.assertEqual(first.AN, 6)
        self.assertEqual(first.homozygote_count, [1, 1])
        self.assertEqual(first.n_called, 3)
        self.assertEqual(first.n_not_called, 1)
        self.assertEqual(first.call_rate, 0.75)
        self.assertEqual(first.n_het, 1)
        self.assertEqual(first.n_non_ref, 2)
        self.assertEqual(first.het_freq_hwe, 0.6)
        self.assertEqual(first.p_value_hwe, 0.7)
        self.assertEqual(first.dp_stats.min, 0)
        self.assertEqual(first.dp_stats.max, 100)
        self.assertEqual(first.dp_stats.mean, 51.25)
        self.assertAlmostEqual(first.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(first.gq_stats.min, 10)
        self.assertEqual(first.gq_stats.max, 11)
        self.assertAlmostEqual(first.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(first.gq_stats.stdev, 0.47140452079103168)

        # Second row: the triallelic site; the HWE fields are expected to be None
        second = rows[1].vqc
        self.assertEqual(second.AF, [0.125, 0.5, 0.375])
        self.assertEqual(second.AC, [1, 4, 3])
        self.assertEqual(second.AN, 8)
        self.assertEqual(second.homozygote_count, [0, 1, 1])
        self.assertEqual(second.n_called, 4)
        self.assertEqual(second.n_not_called, 0)
        self.assertEqual(second.call_rate, 1.0)
        self.assertEqual(second.n_het, 2)
        self.assertEqual(second.n_non_ref, 4)
        self.assertEqual(second.p_value_hwe, None)
        self.assertEqual(second.het_freq_hwe, None)
        self.assertEqual(second.dp_stats.min, 5)
        self.assertEqual(second.dp_stats.max, 5)
        self.assertEqual(second.dp_stats.mean, 5)
        self.assertEqual(second.dp_stats.stdev, 0.0)
        self.assertEqual(second.gq_stats.min, 10)
        self.assertEqual(second.gq_stats.max, 10)
        self.assertEqual(second.gq_stats.mean, 10)
        self.assertEqual(second.gq_stats.stdev, 0)
Esempio n. 8
0
    def test_variant_qc(self):
        """Verify each ``hl.variant_qc`` statistic on a biallelic and a triallelic variant."""

        def genotype(variant, sample, call, gq, dp):
            # One genotype record in the shape passed to Table.parallelize below
            return {'v': variant, 's': sample, 'GT': call, 'GQ': gq, 'DP': dp}

        site_a, site_b = '1:1:A:T', '1:2:A:T,C'
        data = [
            genotype(site_a, '1', hl.Call([0, 0]), 10, 0),
            genotype(site_a, '2', hl.Call([1, 1]), 10, 5),
            genotype(site_a, '3', hl.Call([0, 1]), 11, 100),
            genotype(site_a, '4', None, None, 100),
            genotype(site_b, '1', hl.Call([1, 2]), 10, 5),
            genotype(site_b, '2', hl.Call([2, 2]), 10, 5),
            genotype(site_b, '3', hl.Call([0, 1]), 10, 5),
            genotype(site_b, '4', hl.Call([1, 1]), 10, 5),
        ]

        row_type = hl.dtype('struct{v: str, s: str, GT: call, GQ: int, DP: int}')
        ht = hl.Table.parallelize(data, row_type)
        ht = ht.transmute(**hl.parse_variant(ht.v))
        mt = hl.variant_qc(ht.to_matrix_table(['locus', 'alleles'], ['s']), 'vqc')
        collected = mt.rows().collect()

        # Row 0: the biallelic site, where one of the four samples is uncalled
        qc0 = collected[0].vqc
        self.assertEqual(qc0.AF, [0.5, 0.5])
        self.assertEqual(qc0.AC, [3, 3])
        self.assertEqual(qc0.AN, 6)
        self.assertEqual(qc0.homozygote_count, [1, 1])
        self.assertEqual(qc0.n_called, 3)
        self.assertEqual(qc0.n_not_called, 1)
        self.assertEqual(qc0.call_rate, 0.75)
        self.assertEqual(qc0.n_het, 1)
        self.assertEqual(qc0.n_non_ref, 2)
        self.assertEqual(qc0.het_freq_hwe, 0.6)
        self.assertEqual(qc0.p_value_hwe, 0.7)
        self.assertEqual(qc0.dp_stats.min, 0)
        self.assertEqual(qc0.dp_stats.max, 100)
        self.assertEqual(qc0.dp_stats.mean, 51.25)
        self.assertAlmostEqual(qc0.dp_stats.stdev, 48.782040752719645)
        self.assertEqual(qc0.gq_stats.min, 10)
        self.assertEqual(qc0.gq_stats.max, 11)
        self.assertAlmostEqual(qc0.gq_stats.mean, 10.333333333333334)
        self.assertAlmostEqual(qc0.gq_stats.stdev, 0.47140452079103168)

        # Row 1: the triallelic site; the HWE fields are expected to be None
        qc1 = collected[1].vqc
        self.assertEqual(qc1.AF, [0.125, 0.5, 0.375])
        self.assertEqual(qc1.AC, [1, 4, 3])
        self.assertEqual(qc1.AN, 8)
        self.assertEqual(qc1.homozygote_count, [0, 1, 1])
        self.assertEqual(qc1.n_called, 4)
        self.assertEqual(qc1.n_not_called, 0)
        self.assertEqual(qc1.call_rate, 1.0)
        self.assertEqual(qc1.n_het, 2)
        self.assertEqual(qc1.n_non_ref, 4)
        self.assertEqual(qc1.p_value_hwe, None)
        self.assertEqual(qc1.het_freq_hwe, None)
        self.assertEqual(qc1.dp_stats.min, 5)
        self.assertEqual(qc1.dp_stats.max, 5)
        self.assertEqual(qc1.dp_stats.mean, 5)
        self.assertEqual(qc1.dp_stats.stdev, 0.0)
        self.assertEqual(qc1.gq_stats.min, 10)
        self.assertEqual(qc1.gq_stats.max, 10)
        self.assertEqual(qc1.gq_stats.mean, 10)
        self.assertEqual(qc1.gq_stats.stdev, 0)
Esempio n. 9
0
def compute_qc_metrics(mt: hl.MatrixTable) -> hl.MatrixTable:
    """
    Annotate a MatrixTable with Hail's variant QC and sample QC metrics.

    :param mt: Hail MatrixTable
    :return: Hail MatrixTable with variant and sample qc metrics
    """
    # Variant QC is applied first; sample QC runs on its result
    return hl.sample_qc(hl.variant_qc(mt))
Esempio n. 10
0
def genetics_pipeline():
    """Split multiallelics, apply sample/variant/genotype filters and write the result.

    The table from ``get_mt()`` is split, annotated with variant and sample
    QC, restricted to samples with call rate > 0.95 and rows with
    ``variant_qc.AC[1] > 5``, genotype-filtered on GQ, and written to
    ``/tmp/genetics_pipeline.mt``.
    """
    dataset = hl.split_multi_hts(get_mt())
    dataset = hl.sample_qc(hl.variant_qc(dataset))
    dataset = dataset.filter_cols(dataset.sample_qc.call_rate > 0.95)
    dataset = dataset.filter_rows(dataset.variant_qc.AC[1] > 5)

    # Indels must have GQ > 20; every other variant only needs GQ > 10
    is_indel = hl.is_indel(dataset.alleles[0], dataset.alleles[1])
    gq_passes = hl.case().when(is_indel, dataset.GQ > 20).default(dataset.GQ > 10)
    dataset = dataset.filter_entries(gq_passes)

    dataset.write('/tmp/genetics_pipeline.mt', overwrite=True)
def ld_prune_filter(intersect_out, prune_out, overwrite: bool = False):
    """
    Frequency-filter and LD-prune a stored matrix table, then write the result.

    :param intersect_out: path of the matrix table to read
    :param prune_out: path the pruned matrix table is written to
    :param overwrite: passed positionally to ``write`` as its second argument
    """
    full_mt = hl.read_matrix_table(intersect_out)
    print(full_mt.count())

    full_mt = hl.variant_qc(full_mt)
    # Keep rows whose first-allele frequency lies strictly inside (0.001, 0.999)
    first_allele_af = full_mt.variant_qc.AF[0]
    common_mt = full_mt.filter_rows(
        (first_allele_af > 0.001) & (first_allele_af < 0.999))
    print(common_mt.count())

    # ld_prune returns the variants to keep; rows are matched on the row key
    kept_variants = hl.ld_prune(common_mt.GT, r2=0.8, bp_window_size=500000)
    is_kept = hl.is_defined(kept_variants[common_mt.row_key])
    pruned_mt = common_mt.filter_rows(is_kept)
    pruned_mt.write(prune_out, overwrite)
Esempio n. 12
0
def query():
    """Query script entry point.

    Densifies the ``TOB_WGS`` matrix table, runs variant QC, keeps rows with
    ``AF[0] < 1`` and prints how many of them have ``AF[1] > 0.05``.
    """

    hl.init(default_reference='GRCh38')

    dataset = hl.experimental.densify(hl.read_matrix_table(TOB_WGS))
    dataset = hl.variant_qc(dataset)
    # Drop rows where the first allele's frequency is not below 1
    dataset = dataset.filter_rows(dataset.variant_qc.AF[0] < 1)
    # Count the rows whose second-allele frequency exceeds 0.05
    above_threshold = dataset.variant_qc.AF[1] > 0.05
    snp_maf_05 = dataset.aggregate_rows(hl.agg.count_where(above_threshold))
    print(f'Variant MAF > 0.05 = {snp_maf_05}')
Esempio n. 13
0
def compute_qc(mt: hl.MatrixTable,
               root_col_name='sample_qc',
               root_row_name='variant_qc') -> hl.MatrixTable:
    """
    Annotate a MatrixTable with sample and variant quality control metrics.

    :param mt: Input MatrixTable
    :param root_col_name: name passed to ``hl.sample_qc`` for the sample qc field
    :param root_row_name: name passed to ``hl.variant_qc`` for the variant qc field
    :return: MatrixTable with quality control computed
    """
    # Sample QC is applied first; variant QC runs on its result
    with_sample_qc = hl.sample_qc(mt, name=root_col_name)
    return hl.variant_qc(with_sample_qc, name=root_row_name)
Esempio n. 14
0
def filter_snps(mt, maf):
    """
    Keep variants above a minor allele frequency threshold and outside two
    excluded GRCh38 regions.

    :param mt: MatrixTable to filter
    :param maf: threshold; rows are kept when the smallest value in
        ``variant_qc.AF`` is greater than it
    :return: filtered MatrixTable, annotated with ``variant_qc`` and ``maf``
        row fields
    """
    mt = hl.variant_qc(mt)
    mt = mt.annotate_rows(maf=hl.min(mt.variant_qc.AF))
    # filter_rows returns a new MatrixTable rather than modifying in place,
    # so the result has to be re-bound for the filter to take effect
    mt = mt.filter_rows(mt.maf > maf)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    intervals = ['chr6:25M-35M', 'chr8:7M-13M']
    mt = hl.filter_intervals(mt, [
        hl.parse_locus_interval(x, reference_genome='GRCh38')
        for x in intervals
    ],
                             keep=False)

    return mt
Esempio n. 15
0
def pca_filter_mt(in_mt: hl.MatrixTable,
                  maf: float = 0.05,
                  hwe: float = 1e-3,
                  call_rate: float = 0.98,
                  ld_cor: float = 0.2,
                  ld_window: int = 250000):
    """
    Apply a sequence of variant filters and LD pruning, printing progress.

    :param in_mt: MatrixTable to filter
    :param maf: rows are kept when the smallest ``variant_qc.AF`` value exceeds this
    :param hwe: rows are kept when ``variant_qc.p_value_hwe`` exceeds this
    :param call_rate: rows are kept when ``variant_qc.call_rate`` is at least this
    :param ld_cor: ``r2`` argument passed to ``hl.ld_prune``
    :param ld_window: ``bp_window_size`` argument passed to ``hl.ld_prune``
    :return: the filtered, LD-pruned MatrixTable
    """
    n_before = in_mt.count_rows()
    print("\nInitial number of SNPs before filtering: {}".format(n_before))
    qc_mt = hl.variant_qc(in_mt)

    print(f'\nFiltering out variants with MAF < {maf}')
    kept = qc_mt.annotate_rows(maf=hl.min(qc_mt.variant_qc.AF))
    kept = kept.filter_rows(kept.maf > maf)

    print(f'\nFiltering out variants with HWE < {hwe:1e}')
    kept = kept.filter_rows(kept.variant_qc.p_value_hwe > hwe)

    print(f'\nFiltering out variants with Call Rate < {call_rate}')
    kept = kept.filter_rows(kept.variant_qc.call_rate >= call_rate)

    # no strand ambiguity
    print('\nFiltering out strand ambigous variants')
    is_ambiguous = hl.is_strand_ambiguous(kept.alleles[0], kept.alleles[1])
    kept = kept.filter_rows(~is_ambiguous)

    # MHC chr6:25-35Mb
    # chr8.inversion chr8:7-13Mb
    print(
        '\nFiltering out variants in MHC [chr6:25M-35M] and chromosome 8 inversions [chr8:7M-13M]'
    )
    excluded_regions = [
        hl.parse_locus_interval(region, reference_genome='GRCh38')
        for region in ['chr6:25M-35M', 'chr8:7M-13M']
    ]
    kept = hl.filter_intervals(kept, excluded_regions, keep=False)

    # This step is expensive (on local machine)
    print(
        f'\nLD pruning using correlation threshold of {ld_cor} and window size of {ld_window}'
    )
    independent = hl.ld_prune(kept.GT, r2=ld_cor, bp_window_size=ld_window)
    pruned = kept.filter_rows(hl.is_defined(independent[kept.row_key]))
    n_after = pruned.count_rows()
    print("\nNumber of SNPs after filtering: {}".format(n_after))

    return pruned
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point.

    Joins the columns of the ``GNOMAD_HGDP_1KG_MT`` and densified ``TOB_WGS``
    matrix tables, filters variants on allele count, autosome membership,
    allele frequency, call rate and inbreeding coefficient, downsamples to
    about ``NUM_ROWS_BEFORE_LD_PRUNE`` rows, LD-prunes and writes the result
    under ``output``.

    :param output: path prefix the filtered matrix table is written under.
    :raises ValueError: if no variant passes the filters.
    """

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # choose variants based off of gnomAD v3 parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(hgdp1kg_tobwgs_joined.GT,
                             hgdp1kg_tobwgs_joined.variant_qc.AF[1]))
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25))

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')
    if nrows == 0:
        raise ValueError('No variants left after variant filtering')
    # sample_rows() takes a proportion, which must not exceed 1: clamp it so
    # a table with fewer rows than the target is kept whole instead of failing.
    sample_fraction = min(1.0, NUM_ROWS_BEFORE_LD_PRUNE / nrows)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        sample_fraction, seed=12345)

    pruned_variant_table = hl.ld_prune(hgdp1kg_tobwgs_joined.GT,
                                       r2=0.1,
                                       bp_window_size=500000)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key]))
    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)
Esempio n. 17
0
def ld_prune_filter(mt: hl.MatrixTable, mt_ld: str, overwrite: bool = False):
    """
    Run variant QC, filter on allele frequency and call rate, LD prune, and
    write the resulting matrix table.

    :param mt: Matrix table to run variant QC on and filter variants from
    :param mt_ld: Path the filtered, pruned matrix table is written to
    :param overwrite: passed positionally to ``write`` as its second argument
    :return: None
    """
    mt.describe()
    qc_mt = hl.variant_qc(mt)

    # Alternative cut-offs kept for reference: AF[0] within (0.01, 0.99)
    # and r2=0.2 for the pruning step.
    first_allele_af = qc_mt.variant_qc.AF[0]
    passes_qc = ((first_allele_af > 0.05)
                 & (first_allele_af < 0.95)
                 & (qc_mt.variant_qc.call_rate > 0.999))
    filtered = qc_mt.filter_rows(passes_qc)

    # ld_prune returns the variants to keep; rows are matched on the row key
    independent = hl.ld_prune(filtered.GT, r2=0.1, bp_window_size=500000)
    filtered = filtered.filter_rows(
        hl.is_defined(independent[filtered.row_key]))
    filtered.write(mt_ld, overwrite)
Esempio n. 18
0
def query(output):
    """Query script entry point.

    Reads ``GNOMAD_HGDP_1KG_MT``, keeps entries flagged ``adj``, restricts to
    biallelic rows with ``AF[1] > 0.05``, LD-prunes and writes the result to
    ``<output>/filtered_mt.mt``.
    """

    hl.init(default_reference='GRCh38')

    mt_path = f'{output}/filtered_mt.mt'
    dataset = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # reproduce gnomAD genotype filtering
    dataset = annotate_adj(dataset)
    dataset = dataset.filter_entries(dataset.adj)

    # Filter to common and biallelic variants
    dataset = hl.variant_qc(dataset)
    is_biallelic = hl.len(dataset.alleles) == 2
    is_common = dataset.variant_qc.AF[1] > 0.05
    dataset = dataset.filter_rows(is_biallelic & is_common)

    independent = hl.ld_prune(dataset.GT, r2=0.2, bp_window_size=500000)
    pruned = dataset.filter_rows(hl.is_defined(independent[dataset.row_key]))

    # save filtered mt table
    pruned.write(mt_path, overwrite=True)
def load_files(file_prefix, overwrite, gencove, mt):
    """
    Load genotype data, annotate sample QC and variant QC, and write a matrix table.

    :param file_prefix: path prefix; input is ``<prefix>_grch38.mt`` or
        ``<prefix>.vcf.gz`` and output is ``<prefix>.mt``
    :param overwrite: forwarded to ``write`` as its ``overwrite`` argument
    :param gencove: if truthy, read the existing ``_grch38.mt`` matrix table;
        otherwise import the VCF and split multiallelic sites
    :param mt: not used by this function
    :return: None
    """
    if gencove:
        dataset = hl.read_matrix_table(file_prefix + '_grch38.mt')
    else:
        imported = hl.import_vcf(file_prefix + '.vcf.gz',
                                 force_bgz=True,
                                 reference_genome='GRCh38',
                                 min_partitions=200)
        dataset = hl.split_multi_hts(imported)

    # Two sample IDs are always excluded
    keep_sample = (dataset.s != 'NGE0018') & (dataset.s != 'NGE0130')
    dataset = dataset.filter_cols(keep_sample)

    # Sample QC is applied first; variant QC runs on its result
    annotated = hl.variant_qc(hl.sample_qc(dataset))
    annotated.write(file_prefix + '.mt', overwrite=overwrite)
Esempio n. 20
0
def compute_kinship_ht(mt, genome_version="GRCh38"):
    """
    Build a table of related sample pairs from identity-by-descent estimates.

    :param mt: MatrixTable; its ``info.AF`` row field is passed to
        ``hl.identity_by_descent`` as ``maf``
    :param genome_version: forwarded to the ``ld_prune`` helper
    :return: Table keyed by ``i``, ``j`` with ``ibd0``, ``ibd1``, ``ibd2``,
        ``pi_hat`` and ``relation`` fields, restricted to pairs passing the
        thresholds below
    """
    # Restrict to biallelic autosomal SNPs
    mt = filter_to_autosomes(filter_to_biallelics(mt))
    mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))

    mt = hl.variant_qc(mt)
    mt = mt.filter_rows(mt.variant_qc.call_rate > 0.99)
    # An additional info.AF > 0.001 filter was tried and left 100% of variants

    mt = ld_prune(mt, genome_version=genome_version)

    pairs_ht = hl.identity_by_descent(mt, maf=mt.info.AF, min=0.10, max=1.0)
    # Lift the nested ibd statistics to top-level fields, then drop the rest
    pairs_ht = pairs_ht.annotate(ibd0=pairs_ht.ibd.Z0,
                                 ibd1=pairs_ht.ibd.Z1,
                                 ibd2=pairs_ht.ibd.Z2,
                                 pi_hat=pairs_ht.ibd.PI_HAT)
    pairs_ht = pairs_ht.drop("ibs0", "ibs1", "ibs2", "ibd")

    # filter to anything above the relationship of a grandparent
    first_degree_pi_hat = .40
    grandparent_pi_hat = .20
    grandparent_ibd1 = 0.25
    grandparent_ibd2 = 0.15

    pairs_ht = pairs_ht.key_by("i", "j")
    is_first_degree = pairs_ht.pi_hat > first_degree_pi_hat
    is_grandparent_like = ((pairs_ht.pi_hat > grandparent_pi_hat)
                           & (pairs_ht.ibd1 > grandparent_ibd1)
                           & (pairs_ht.ibd2 < grandparent_ibd2))
    pairs_ht = pairs_ht.filter(is_first_degree | is_grandparent_like)

    # ``relation`` holds the sorted pair of sample ids
    pairs_ht = pairs_ht.annotate(
        relation=hl.sorted([pairs_ht.i, pairs_ht.j]))

    return pairs_ht
Esempio n. 21
0
def run_gwas(vcf_file, phenotypes_file, output_file):
    """
    Run a QC + PCA + linear regression GWAS and export the results.

    :param vcf_file: VCF to import (first written to ``tmp.mt``)
    :param phenotypes_file: phenotype table keyed by its ``Sample`` column
    :param output_file: prefix for the ``.assoc`` export and the PLINK export
    """
    phenotypes = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    # Round-trip through a native matrix table before doing any work
    hl.import_vcf(vcf_file).write('tmp.mt')
    dataset = hl.read_matrix_table('tmp.mt')

    dataset = dataset.annotate_cols(pheno=phenotypes[dataset.s])

    # Sample filter: mean depth and call rate
    dataset = hl.sample_qc(dataset)
    depth_ok = dataset.sample_qc.dp_stats.mean >= 4
    sample_call_rate_ok = dataset.sample_qc.call_rate >= 0.97
    dataset = dataset.filter_cols(depth_ok & sample_call_rate_ok)

    # Genotype filter: allele balance must agree with the called genotype
    allele_balance = dataset.AD[1] / hl.sum(dataset.AD)
    hom_ref_ok = dataset.GT.is_hom_ref() & (allele_balance <= 0.1)
    het_ok = (dataset.GT.is_het()
              & (allele_balance >= 0.25)
              & (allele_balance <= 0.75))
    hom_var_ok = dataset.GT.is_hom_var() & (allele_balance >= 0.9)
    dataset = dataset.filter_entries(hom_ref_ok | het_ok | hom_var_ok)

    # Variant filter on AF[1]
    dataset = hl.variant_qc(dataset)
    dataset = dataset.filter_rows(dataset.variant_qc.AF[1] > 0.01)

    # Only the per-sample scores from the PCA are used
    _, pcs, _ = hl.hwe_normalized_pca(dataset.GT)
    dataset = dataset.annotate_cols(scores=pcs[dataset.s].scores)

    covariates = [
        1.0,
        dataset.pheno.isFemale,
        dataset.scores[0],
        dataset.scores[1],
        dataset.scores[2],
    ]
    results = hl.linear_regression_rows(y=dataset.pheno.CaffeineConsumption,
                                        x=dataset.GT.n_alt_alleles(),
                                        covariates=covariates)

    # Reduce the results to a table of P keyed by the variant string
    results = results.select(SNP=hl.variant_str(results.locus, results.alleles),
                             P=results.p_value)
    results = results.key_by(results.SNP)
    results = results.select(results.P)
    results.export(f'{output_file}.assoc', header=True)

    hl.export_plink(dataset, output_file, fam_id=dataset.s, ind_id=dataset.s)
Esempio n. 22
0
# Split multiallelic rows of `mt` (defined before this point in the script)
mt_split = hl.split_multi(mt)
# Keep only GT, downcoded with respect to the split allele index
mt_split = mt_split.select_entries(
    GT=hl.downcode(mt_split.GT, mt_split.a_index))
# Rebuild `info` with the AC element for the split allele and a variant type:
# 'SNP' when both alleles have length 1, 'SV' when either allele matches
# '<CN*>', otherwise 'INDEL'
mt_split = mt_split.annotate_rows(info=hl.struct(
    AC=mt_split.info.AC[mt_split.a_index - 1],
    VT=(hl.case().when((mt_split.alleles[0].length() == 1) & (
        mt_split.alleles[1].length() == 1), 'SNP').when(
            mt_split.alleles[0].matches('<CN*>')
            | mt_split.alleles[1].matches('<CN*>'), 'SV').default('INDEL'))))

# Dimensions and partition count are recorded in the globals below
n_rows, n_cols = mt_split.count()
n_partitions = mt_split.n_partitions()

# Annotate sample-level and variant-level QC metrics
mt_split = hl.sample_qc(mt_split)
mt_split = hl.variant_qc(mt_split)

# Store dataset metadata as a global struct
mt_split = mt_split.annotate_globals(
    metadata=hl.struct(name='1000_Genomes_phase3_chrMT',
                       reference_genome='GRCh37',
                       n_rows=n_rows,
                       n_cols=n_cols,
                       n_partitions=n_partitions))

mt_split.write(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_chrMT.GRCh37.mt',
    overwrite=True)

# Read the written table back and print its schema
mt = hl.read_matrix_table(
    'gs://hail-datasets-hail-data/1000_Genomes_phase3_chrMT.GRCh37.mt')
mt.describe()
Esempio n. 23
0
def relatedness_check(in_mt: hl.MatrixTable = None,
                      method: str = 'pc_relate',
                      outdir: str = None,
                      kin_estimate: float = 0.98):
    """
    Remove one sample from every pair whose relatedness exceeds ``kin_estimate``.

    For each flagged pair the sample with the lower (or equal) sample QC call
    rate is removed. When samples are removed, their ids are also written to
    ``outdir + 'relatedness_removed_samples.tsv'``.

    :param in_mt: MatrixTable to check; its column key field ``s`` holds the
        sample id
    :param method: ``'pc_relate'`` or ``'ibd'``; any other value selects KING
    :param outdir: prefix the output file name is appended to (no path
        separator is inserted)
    :param kin_estimate: relatedness threshold; compared against ``kin``
        (PC-Relate), ``ibd.PI_HAT`` (IBD) or ``phi`` (KING)
    :return: ``in_mt`` annotated with variant and sample QC, with the
        selected samples removed
    :raises Exception: if KING is selected and ``kin_estimate`` is above 0.5
    """
    # Kept from the original implementation: ``samples_to_remove`` is bound
    # as a module-level name by this function.
    global mt, samples_to_remove

    in_mt = hl.variant_qc(in_mt)
    in_mt = hl.sample_qc(in_mt)

    # _localize=False means don't put this in Python, keep it as a Hail expr
    call_rate_dict = in_mt.aggregate_cols(hl.dict(
        hl.agg.collect((in_mt.s, in_mt.sample_qc.call_rate))),
                                          _localize=False)

    if method == 'pc_relate':
        print("\nUsing PC-Relate for relatedness checks")
        relatedness_ht = hl.pc_relate(in_mt.GT,
                                      0.01,
                                      k=10,
                                      min_kinship=0.1,
                                      statistics='kin')
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.kin > kin_estimate)

        # get call rates for both samples so we remove the one with lower call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i.s],
            cr_s2=call_rate_dict[samples_to_remove_ht.j.s])

        # pc_relate reports i and j as column-key structs, so take the sample
        # id from their ``s`` field; the id is later compared with in_mt['s']
        # and written to the output file as text.
        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i.s, samples_to_remove.j.s))

    elif method == 'ibd':
        print("\nUsing PLINK-style identity by descent for relatedness checks")
        in_mt = in_mt.annotate_rows(maf=hl.min(in_mt.variant_qc.AF))
        relatedness_ht = hl.identity_by_descent(
            in_mt, maf=in_mt['maf']
        )  # this returns a Hail Table with the sample pairs
        samples_to_remove_ht = relatedness_ht.filter(
            relatedness_ht.ibd.PI_HAT > kin_estimate)

        # get call rates for both samples so we remove the one with lower call rate between the two
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.i],
            cr_s2=call_rate_dict[samples_to_remove_ht.j])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.i, samples_to_remove.j))

    else:
        print("\nUsing KING for relatedness checks")
        if kin_estimate > 0.5:
            raise Exception(
                "\nThe maximum kinship coefficient is for KING 0.5")
        relatedness_mt = hl.king(in_mt.GT)
        # Skip the self-comparisons and keep pairs at or above the threshold
        filtered_relatedness_mt = relatedness_mt.filter_entries(
            (relatedness_mt.s_1 != relatedness_mt.s) &
            (relatedness_mt.phi >= kin_estimate),
            keep=True)
        samples_to_remove_ht = filtered_relatedness_mt.entries()
        samples_to_remove = samples_to_remove_ht.annotate(
            cr_s1=call_rate_dict[samples_to_remove_ht.s_1],
            cr_s2=call_rate_dict[samples_to_remove_ht.s])

        samples_list = samples_to_remove.annotate(sample_to_remove=hl.cond(
            samples_to_remove.cr_s1 <= samples_to_remove.cr_s2,
            samples_to_remove.s_1, samples_to_remove.s))

    samples = samples_list.sample_to_remove.collect()

    if len(samples) > 0:
        in_mt = in_mt.filter_cols(hl.literal(samples).contains(in_mt['s']),
                                  keep=False)
        print("\nNumber of samples that fail relatedness checks: {}".format(
            len(samples)))
        with open(outdir + 'relatedness_removed_samples.tsv', 'w') as f:
            for sample in samples:
                f.write(sample + "\n")

    else:
        print("\nNo samples failed the relatedness check")

    return in_mt
Esempio n. 24
0
 def get_data(a2_reference):
     """Import the PLINK fileset at ``bfile`` and return its variant-QC rows keyed by ``rsid``."""
     plink_mt = hl.import_plink(bfile + '.bed', bfile + '.bim',
                                bfile + '.fam', a2_reference=a2_reference)
     qc_rows = hl.variant_qc(plink_mt).rows()
     return qc_rows.key_by('rsid')
Esempio n. 25
0
##
# Main script
#
# Load the pedigree from the file given by args.fam
logger.info(f"Reading pedigree file {args.fam}")
pedigree = hl.Pedigree.read(args.fam)

# Import the VCF from args.vcf and split multiallelic sites
logger.info(f"Importing vcf file {args.vcf}")
data = hl.import_vcf(args.vcf,
                     call_fields=['GT'],
                     skip_invalid_loci=True,
                     force_bgz=True)
data = hl.split_multi_hts(data)
# After splitting, pick the info.AC / info.AF element for the split allele
data = data.annotate_rows(AC=data.info.AC[data.a_index - 1],
                          iAF=data.info.AF[data.a_index - 1])
data = hl.variant_qc(data)

logger.info("Applying de novo filter...")
# The last element of variant_qc.AF is used as the population frequency prior
de_novo_scores = hl.de_novo(data,
                            pedigree,
                            pop_frequency_prior=data.variant_qc.AF[-1])
# Reshape the de novo results into a matrix keyed by (locus, alleles) x id
# so p_de_novo can be looked up per entry
de_novo_mt = de_novo_scores.to_matrix_table(row_key=['locus', 'alleles'],
                                            col_key=['id'])
de_novo_data = data.annotate_entries(p_de_novo=de_novo_mt[(data.locus,
                                                           data.alleles),
                                                          data.s].p_de_novo)

logger.info("Annotating trio data...")
# Build the trio matrix from the pedigree, restricted to complete trios
trio_mt = hl.trio_matrix(de_novo_data, pedigree, complete_trios=True)
de_novo_data = de_novo_data.annotate_entries(
    mother=trio_mt[(de_novo_data.locus, de_novo_data.alleles),
Esempio n. 26
0
                       filter='\d/\d/\d',
                       skip_invalid_loci=True,
                       force_bgz=True,
                       reference_genome='GRCh38',
                       contig_recoding=recoding_dict)
    ### filter to pass variants and split_multi
    mt2 = mt.filter_rows(mt.filters.size() > 0, keep=False)
    mt2 = hl.split_multi_hts(mt2)

    #variant read counts of 3
    #at least one read in both forward and reverse orientations
    #remove  monomorphic variants
    mt3 = mt2.filter_entries(
        ((mt2.AD[1] < 2) | (mt2.F1R2[1] == 0) | (mt2.F2R1[1] == 0)),
        keep=False)
    mt3 = hl.variant_qc(mt3)
    mt3 = mt3.filter_rows(
        (mt3.variant_qc.AF[1] > 0) & (mt3.variant_qc.AF[1] < 1), keep=True)

    mt4 = mt3.annotate_rows(v = hl.variant_str(mt3.locus, mt3.alleles),\
         NumAltAlleles = hl.agg.max(mt3.GT.n_alt_alleles()), \
         VAF =hl.agg.explode(lambda x: hl.agg.mean(x), mt3.AF),\
         TLOD =mt3.info.TLOD[0], \
         GERMQ = mt3.info.GERMQ, \
         STR=mt3.info.STR,\
         AD_alt=hl.agg.mean(mt3.AD[1]),\
         AD_ref=hl.agg.mean(mt3.AD[0]))

    mt4 = mt4.annotate_entries(
        Binomial_Prob=hl.binom_test(mt4.AD[1], mt4.DP, 0.5, 'greater'))
    mt4 = mt4.key_rows_by("v")
Esempio n. 27
0
def main():
    """Subset one bgen chromosome, MAF-filter it, lift it over and export PLINK files.

    The chromosome label is read from ``sys.argv[1]`` and the number of local
    Spark cores from ``sys.argv[2]``. The bgen file is indexed if no ``.idx2``
    index exists, restricted to the samples in the keep list, filtered on
    minor allele frequency, lifted over from GRCh37 to GRCh38 (with the
    ``chr`` prefix removed from contig names) and written with
    ``hl.export_plink``.

    :return: always ``0``.
    """

    # # Args (local)
    # chrom = 11
    # chain_file = '/Users/em21/Projects/ot_genetics/genetics-sumstats_data/extras/prepare_uk_biobank_gwas_catalog/sitelist/input_data/grch37_to_grch38.over.chain.gz'
    # in_bgen = 'example_data/ukb_imp_chr{chrom}_v3.example.bgen'
    # in_sample = 'output/ukb_10k_downsampled.sample'
    # to_keep_list = 'output/ukb_10k_downsampled.sample_list.tsv'
    # out_plink = 'output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    # cores = 1 # Use "*" for all
    # maf_threshold = 0.001

    # Args (server)
    chrom = sys.argv[1]
    chain_file = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/grch37_to_grch38.over.chain.gz'
    in_bgen = '/nfs/users/nfs_e/em21/otcoregen/uk_biobank_data/data/genetics/imputation/ukb_imp_chr{chrom}_v3.bgen'
    in_sample = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample'
    to_keep_list = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/input_data/ukb_10k_downsampled.sample_list.tsv'
    out_plink = '/nfs/users/nfs_e/em21/otcoregen/em21/uk_biobank_analysis/create_10k_subsample/output/ukb_v3_downsampled10k_plink/ukb_v3_chr{chrom}.downsampled10k'
    cores = sys.argv[2]  # Use "*" for all
    maf_threshold = 0.001

    # Cap the number of cores the local Spark master may use.
    hl.init(master=f"local[{cores}]")

    # Register the GRCh37 -> GRCh38 chain file so hl.liftover can be used.
    rg37 = hl.get_reference('GRCh37')
    rg38 = hl.get_reference('GRCh38')
    rg37.add_liftover(chain_file, rg38)

    # Build a copy of GRCh38 whose contig names have no 'chr' prefix.
    stripped_contigs = [name.replace('chr', '') for name in rg38.contigs]
    stripped_lengths = {
        name.replace('chr', ''): length
        for name, length in rg38.lengths.items()
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', stripped_contigs,
                                     stripped_lengths)

    print(f'Processing chromosome {chrom}')

    bgen_path = in_bgen.format(chrom=chrom)

    # Create the bgen index only when it is not already present.
    if not hl.hadoop_exists(bgen_path + '.idx2'):
        # Map zero-padded contig labels "01".."09" to "1".."9".
        padded_to_plain = {f"0{digit}": str(digit) for digit in range(1, 10)}
        hl.index_bgen(bgen_path,
                      contig_recoding=padded_to_plain,
                      reference_genome='GRCh37')

    # Read genotype calls from the bgen file.
    mt = hl.import_bgen(bgen_path,
                        entry_fields=['GT'],
                        sample_file=in_sample)

    # Header-less, single-column list of sample IDs to retain.
    keep_table = hl.import_table(to_keep_list,
                                 no_header=True,
                                 impute=False,
                                 types={'f0': hl.tstr})
    keep_table = keep_table.key_by('f0')

    # Keep only the columns whose sample ID appears in the keep list.
    mt = mt.filter_cols(hl.is_defined(keep_table[mt.s]))

    # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # MAF is the smallest entry of the per-allele frequency array.
    mt = hl.variant_qc(mt)
    qc_with_maf = mt.variant_qc.annotate(MAF=hl.min(mt.variant_qc.AF))
    mt = mt.annotate_rows(variant_qc=qc_with_maf)
    mt = mt.filter_rows(mt.variant_qc.MAF >= maf_threshold)

    # Lift each locus over to GRCh38.
    mt = mt.annotate_rows(locus_GRCh38=hl.liftover(mt.locus, 'GRCh38'))

    # Drop the 'chr' prefix from the lifted contig (causes problems with GCTA).
    stripped_contig = mt.locus_GRCh38.contig.replace('chr', '')
    mt = mt.annotate_rows(contig_GRCh38=stripped_contig)

    # Replace the GRCh37 locus by its GRCh38 counterpart; the row key has to
    # be released first and the locus rebuilt against rg38_custom.
    mt = mt.key_rows_by()
    lifted_locus = hl.locus(mt.contig_GRCh38,
                            mt.locus_GRCh38.position,
                            reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=lifted_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Discard rows whose locus is missing after liftover.
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    # Export in PLINK format.
    hl.export_plink(dataset=mt, output=out_plink.format(chrom=chrom))

    return 0
Esempio n. 28
0
# Delete output directories that may be left over from an earlier run.
files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
for f in files:
    if not os.path.isdir(f):
        continue
    shutil.rmtree(f)

# Import the sample VCF and down-sample its rows with sample_rows(0.03).
ds = hl.import_vcf('data/sample.vcf.bgz')
ds = ds.sample_rows(0.03)

# Attach constant and random row annotations.
ds = ds.annotate_rows(
    use_as_marker=hl.rand_bool(0.5),
    panel_maf=0.1,
    anno1=5,
    anno2=0,
    consequence="LOF",
    gene="A",
    score=5.0,
    a_index=1,
)

# Variant QC first, then sample QC on the result.
ds = hl.variant_qc(ds)
ds = hl.sample_qc(ds)

# Attach constant and random column annotations.
ds = ds.annotate_cols(
    is_case=True,
    pheno=hl.struct(
        is_case=hl.rand_bool(0.5),
        is_female=hl.rand_bool(0.5),
        age=hl.rand_norm(65, 10),
        height=hl.rand_norm(70, 10),
        blood_pressure=hl.rand_norm(120, 20),
        cohort_name="cohort1",
    ),
    cov=hl.struct(PC1=hl.rand_norm(0, 1)),
    cov1=hl.rand_norm(0, 1),
    cov2=hl.rand_norm(0, 1),
    cohort="SIGMA",
)

# Attach global annotations.
ds = ds.annotate_globals(
    global_field_1=5,
    global_field_2=10,
    pli={'SCN1A': 0.999, 'SONIC': 0.014},
    populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'],
)
Esempio n. 29
0
def variant_qc():
    """Run ``hl.variant_qc`` on the table from ``get_mt()`` and force a count of its rows."""
    qc_mt = hl.variant_qc(get_mt())
    qc_mt.rows()._force_count()
Esempio n. 30
0
# Logistic-regression (Wald) GWAS of the `All_Pneumonia` phenotype on imputed
# bgen genotypes, with the results stored as a Hail table and as a TSV.
hl.init(default_reference='GRCh37')

## Variant level annotations (VEP annotations; annotated separately)
mt5 = hl.read_table('gs://ukbb_v2/projects/mzekavat/ukbb_v3.AllAutosomalANDchrX.annotations.ht')
## UKBB imputed bgens:
ds = hl.import_bgen('gs://fc-7d5088b4-7673-45b5-95c2-17ae00a04183/imputed/ukb_imp_chr{1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22}_v3.bgen',entry_fields = ['GT'],sample_file='gs://ukbb_v2/data/ukb7089_imp_chr3_v3_s487395.sample')
## Phenotype file
phenos = hl.import_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/ukbb_PhenoFile.ALL_500k_incidPrevCases.plusRespPhenos.plusBPMeds.plusPFTs.plusCHIP.QCed.txt.gz',force_bgz=True,key = 'id',types={'id':hl.tstr},impute=True)

# Join the variant annotations onto the rows and the phenotypes onto the columns.
ds = ds.annotate_rows(**mt5.index(ds.row_key))
ds = ds.annotate_cols(pheno = phenos[ds.col_key])
# Indicator covariate: 1 when genotyping_array is "UKBB", otherwise 0.
ds = ds.annotate_cols(array = hl.if_else((ds.pheno.genotyping_array == "UKBB"), 1, 0))
# Keep only samples whose age is defined.
ds = ds.filter_cols(hl.is_defined(ds.pheno.age), keep=True)

### variant qc
mt = hl.variant_qc(ds,name='variant_qc')
# Keep variants with 0.001 < AF[1] < 0.999, info > 0.4 and HWE p-value >= 1e-10.
mt = mt.filter_rows( ((mt.variant_qc.AF[1] > 0.001) & (mt.variant_qc.AF[1] < 0.999) & (mt.info>0.4) & (mt.variant_qc.p_value_hwe >= 0.0000000001)),keep = True )
final= mt.annotate_rows(AF = mt.variant_qc.AF[1],AC = mt.variant_qc.AC[1],AN = mt.variant_qc.AN)
#final_annot = final.annotate_rows(HWE = final.variant_qc.p_value_hwe, callRate = final.variant_qc.call_rate)
#final_annot = final_annot.drop('variant_qc').rows()
### gwas logistic regression wald
gwas = hl.logistic_regression_rows(
    test='wald',
    y=final.pheno.All_Pneumonia,
    x=final.GT.n_alt_alleles(),
    # The leading 1 is the intercept term.
    covariates=[1, final.pheno.age,final.pheno.age2, final.pheno.Sex_numeric, final.pheno.ever_smoked, final.pheno.PC1,final.pheno.PC2,final.pheno.PC3,final.pheno.PC4,final.pheno.PC5,final.pheno.PC6,final.pheno.PC7,final.pheno.PC8,final.pheno.PC9,final.pheno.PC10,final.array],
    pass_through=['rsid','Gene','Consequence','clin_sig', 'metasvm','LOF_LOFTEE','PolyPhen','SIFT','hgvsp','AF', 'AC', 'AN','info'])

### Writing out the annotated GWAS results:
# `gwas` is lazy, so every action on it re-runs the whole regression. Store the
# table once, read it back, and export the TSV from the stored copy so the
# regression is evaluated a single time.
gwas.write('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas = hl.read_table('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.ht')
gwas.flatten().export('gs://ukbb_v2/projects/mzekavat/Pneumonia_GWAS/logreg_wald_All_Pneumonia.tsv.bgz')
Esempio n. 31
0
# Final QC annotation pass: restrict the matrix table to the retained samples
# and variants, attach phenotype / imputed-sex / variant annotations, and
# compute variant- and sample-level QC metrics.
sample_annotations = hl.read_table(PHENOTYPES_TABLE)
impute_sex_annotations = hl.read_table(IMPUTESEX_TABLE)
annotation_annotations = hl.read_table(ANNOTATION_TABLE)

mt = hl.read_matrix_table(MT)
mt = mt.drop('a_index', 'qual', 'info', 'filters', 'was_split')

# NOTE(review): ht_final_samples, ht_final_variants and
# ht_final_pruned_variants are not defined in this snippet; presumably they
# are tables loaded earlier in the full script.
mt = mt.filter_cols(hl.is_defined(ht_final_samples[mt.col_key]))
mt = mt.filter_rows(hl.is_defined(ht_final_variants[mt.row_key]))

mt = mt.annotate_cols(phenotype=sample_annotations[mt.col_key])
mt = mt.annotate_cols(imputesex=impute_sex_annotations[mt.col_key])
mt = mt.annotate_rows(annotation=annotation_annotations[mt.row_key])

mt = hl.variant_qc(mt, name='qc')

# Recompute the HWE p-value outside the autosomes using female samples only.
# The field holds a p-value, so both branches must supply one: the original
# assigned `het_freq_hwe` (the expected heterozygote frequency) here, which
# silently replaced the p-value computed by hl.variant_qc.
mt = mt.annotate_rows(qc=mt.qc.annotate(p_value_hwe=hl.case().when(
    mt.locus.in_autosome(), mt.qc.p_value_hwe).default(
        hl.agg.filter(mt.imputesex.impute_sex.is_female,
                      hl.agg.hardy_weinberg_test(mt.GT).p_value))))

# Reduce the per-allele AC / AF arrays to the entry for this row's allele;
# a_index is 1-based, hence the "- 1".
mt = mt.annotate_rows(annotation=mt.annotation.annotate(
    info=mt.annotation.info.annotate(
        AC=mt.annotation.info.AC[mt.annotation.a_index - 1],
        AF=mt.annotation.info.AF[mt.annotation.a_index - 1],
    )))

mt = hl.sample_qc(mt)

# Subset of rows present in the pruned-variant table.
mt_pca = mt.filter_rows(hl.is_defined(ht_final_pruned_variants[mt.row_key]))
Esempio n. 32
0
def variant_and_sample_qc():
    """Apply variant QC then sample QC to the table from ``get_mt()`` and force a row count."""
    with_variant_qc = hl.variant_qc(get_mt())
    with_both_qc = hl.sample_qc(with_variant_qc)
    with_both_qc._force_count_rows()
Esempio n. 33
0
def init(doctest_namespace):
    """Prepare the doctest namespace, yield once, then restore the working directory.

    Changes into ``docs/``, removes stale output directories, loads the
    example datasets and stores them (together with a set of literal
    expressions) in ``doctest_namespace``. Presumably registered as a pytest
    fixture; the decorator is not visible here.

    :param doctest_namespace: mapping that receives the objects the doctests
        refer to by name.
    """
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    # Restore the original directory even when setup or the consumer raises;
    # otherwise the rest of the process would keep running inside docs/.
    try:
        doctest_namespace['hl'] = hl
        doctest_namespace['agg'] = agg

        if not os.path.isdir("output/"):
            try:
                os.mkdir("output/")
            except OSError:
                # Another process may have created the directory in between.
                pass

        files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
        for f in files:
            if os.path.isdir(f):
                shutil.rmtree(f)

        ds = hl.read_matrix_table('data/example.vds')
        doctest_namespace['ds'] = ds
        doctest_namespace['dataset'] = ds
        doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
        doctest_namespace['dataset_to_union_1'] = ds
        doctest_namespace['dataset_to_union_2'] = ds

        v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
        doctest_namespace['v_metadata'] = v_metadata

        s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
        doctest_namespace['s_metadata'] = s_metadata

        # Table
        table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
        table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
        doctest_namespace['table1'] = table1
        doctest_namespace['other_table'] = table1

        table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
        doctest_namespace['table2'] = table2

        table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                                 types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                        'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
        doctest_namespace['table4'] = table4

        people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                       types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                       key='Name')
        doctest_namespace['people_table'] = people_table

        # TDT
        doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

        ds2 = hl.variant_qc(ds)
        doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

        # Expressions
        doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
        doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
        doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
        doctest_namespace['t'] = hl.literal(True)
        doctest_namespace['f'] = hl.literal(False)
        doctest_namespace['na'] = hl.null(hl.tbool)
        doctest_namespace['call'] = hl.call(0, 1, phased=False)
        doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
        doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
        doctest_namespace['interval'] = hl.interval(3, 11)
        doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
        doctest_namespace['locus'] = hl.locus('1', 1034245)
        doctest_namespace['x'] = hl.literal(3)
        doctest_namespace['y'] = hl.literal(4.5)
        doctest_namespace['s1'] = hl.literal({1, 2, 3})
        doctest_namespace['s2'] = hl.literal({1, 3, 5})
        doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
        doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
        doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
        doctest_namespace['s'] = hl.literal('The quick brown fox')
        doctest_namespace['interval2'] = hl.Interval(3, 6)

        # Overview
        doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
        doctest_namespace['mt'] = ds

        gnomad_data = ds.rows()
        doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

        # BGEN
        bgen = hl.import_bgen('data/example.8bits.bgen',
                              entry_fields=['GT', 'GP', 'dosage'])
        doctest_namespace['variants_table'] = bgen.rows()

        print("finished setting up doctest...")
        yield
    finally:
        os.chdir(olddir)
import gcsfs
fs = gcsfs.GCSFileSystem(project='your-project')
# NOTE(review): `client` is not defined in this snippet; presumably it is
# created elsewhere before this line runs.
bucket = client.get_bucket('your-bucket')

import hail as hl
import hail.expr.aggregators as agg
hl.init()

# Load the matrix table.
mt = hl.read_matrix_table(
    "gs://1k_genome/1000-genomes/VDS-of-all/ALL.chr.integrated_phase1_v3.20101123.snps_indels_svs.genotypes.mt"
)
# Recorded result of print(mt.count()): (39706715, 1092)

# Compute variant QC metrics, then keep rows whose AF[1] exceeds 0.01.
mt = hl.variant_qc(mt)
mt = mt.filter_rows(mt.variant_qc.AF[1] > 0.01)
# Recorded result of print(mt.count()): (13404583, 1092)

# Keep only rows whose first two alleles form a SNP.
mt = mt.filter_rows(hl.is_snp(mt.alleles[0], mt.alleles[1]))
# Recorded result of print(mt.count()): (12194564, 1092)

# Sample annotation table: comma-separated, empty fields are missing,
# keyed by the 'Sample' column.
table = hl.import_table(
    'gs://ines-work/KG-annotation-with-sexencoder.csv',
    delimiter=',',
    missing='',
    quote='"',
    types={'Gender_Classification': hl.tfloat64},
)
table = table.key_by('Sample')
Esempio n. 35
0
def generate_datasets(doctest_namespace):
    """Fill ``doctest_namespace`` with the objects the doctests refer to by name.

    Builds an annotated sample matrix table, several small tables, a set of
    literal expressions, a bgen row table, a burden dataset and the LD score
    regression summary statistics, and stores each under a fixed key.

    :param doctest_namespace: mapping that receives the named objects.
    """
    for key, module in (('hl', hl), ('np', np)):
        doctest_namespace[key] = module

    # Annotated sample dataset, checkpointed to output/example.mt.
    sample_mt = hl.import_vcf('data/sample.vcf.bgz')
    sample_mt = sample_mt.sample_rows(0.03)
    sample_mt = sample_mt.annotate_rows(
        use_as_marker=hl.rand_bool(0.5),
        panel_maf=0.1,
        anno1=5,
        anno2=0,
        consequence="LOF",
        gene="A",
        score=5.0,
    )
    sample_mt = sample_mt.annotate_rows(a_index=1)
    sample_mt = hl.variant_qc(sample_mt)
    sample_mt = hl.sample_qc(sample_mt)
    sample_mt = sample_mt.annotate_cols(
        is_case=True,
        pheno=hl.struct(
            is_case=hl.rand_bool(0.5),
            is_female=hl.rand_bool(0.5),
            age=hl.rand_norm(65, 10),
            height=hl.rand_norm(70, 10),
            blood_pressure=hl.rand_norm(120, 20),
            cohort_name="cohort1",
        ),
        cov=hl.struct(PC1=hl.rand_norm(0, 1)),
        cov1=hl.rand_norm(0, 1),
        cov2=hl.rand_norm(0, 1),
        cohort="SIGMA",
    )
    sample_mt = sample_mt.annotate_globals(
        global_field_1=5,
        global_field_2=10,
        pli={'SCN1A': 0.999, 'SONIC': 0.014},
        populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'],
    )
    sample_mt = sample_mt.annotate_rows(gene=['TTN'])
    sample_mt = sample_mt.annotate_cols(cohorts=['1kg'], pop='EAS')
    sample_mt = sample_mt.checkpoint('output/example.mt', overwrite=True)

    for key in ('ds', 'dataset'):
        doctest_namespace[key] = sample_mt
    doctest_namespace['dataset2'] = sample_mt.annotate_globals(global_field=5)
    for key in ('dataset_to_union_1', 'dataset_to_union_2'):
        doctest_namespace[key] = sample_mt

    # Row- and column-level metadata tables derived from the sample dataset.
    row_table = sample_mt.rows().annotate_globals(global_field=5)
    v_metadata = row_table.annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = sample_mt.cols().annotate(pop='AMR', is_case=False, sex='F')
    for key in ('s_metadata', 'cols_to_keep', 'cols_to_remove'):
        doctest_namespace[key] = s_metadata
    for key in ('rows_to_keep', 'rows_to_remove'):
        doctest_namespace[key] = v_metadata

    tiny_mt = hl.balding_nichols_model(3, 4, 4)
    doctest_namespace['small_mt'] = tiny_mt.checkpoint('output/small.mt',
                                                       overwrite=True)

    # Table
    first_table = hl.import_table('data/kt_example1.tsv',
                                  impute=True,
                                  key='ID')
    first_table = first_table.annotate_globals(global_field_1=5,
                                               global_field_2=10)
    for key in ('table1', 'other_table'):
        doctest_namespace[key] = first_table

    doctest_namespace['table2'] = hl.import_table('data/kt_example2.tsv',
                                                  impute=True,
                                                  key='ID')

    struct_column_types = {
        'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
        'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
        'E': hl.tstruct(A=hl.tint32, B=hl.tint32),
    }
    doctest_namespace['table4'] = hl.import_table('data/kt_example4.tsv',
                                                  impute=True,
                                                  types=struct_column_types)

    people_column_types = {
        'Age': hl.tint32,
        'Children': hl.tarray(hl.tstr),
    }
    doctest_namespace['people_table'] = hl.import_table(
        'data/explode_example.tsv',
        delimiter='\\s+',
        types=people_column_types,
        key='Name')

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    qc_mt = hl.variant_qc(sample_mt)
    doctest_namespace['ds2'] = qc_mt.select_rows(AF=qc_mt.variant_qc.AF)

    # Expressions
    expression_fixtures = (
        ('names', hl.literal(['Alice', 'Bob', 'Charlie'])),
        ('a1', hl.literal([0, 1, 2, 3, 4, 5])),
        ('a2', hl.literal([1, -1, 1, -1, 1, -1])),
        ('t', hl.literal(True)),
        ('f', hl.literal(False)),
        ('na', hl.null(hl.tbool)),
        ('call', hl.call(0, 1, phased=False)),
        ('a', hl.literal([1, 2, 3, 4, 5])),
        ('d', hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})),
        ('interval', hl.interval(3, 11)),
        ('locus_interval', hl.parse_locus_interval("1:53242-90543")),
        ('locus', hl.locus('1', 1034245)),
        ('x', hl.literal(3)),
        ('y', hl.literal(4.5)),
        ('s1', hl.literal({1, 2, 3})),
        ('s2', hl.literal({1, 3, 5})),
        ('s3', hl.literal({'Alice', 'Bob', 'Charlie'})),
        ('struct', hl.struct(a=5, b='Foo')),
        ('tup', hl.literal(("a", 1, [1, 2, 3]))),
        ('s', hl.literal('The quick brown fox')),
        ('interval2', hl.Interval(3, 6)),
        ('nd', hl._nd.array([[1, 2], [3, 4]])),
    )
    for key, expression in expression_fixtures:
        doctest_namespace[key] = expression

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv",
                                              impute=True)
    doctest_namespace['mt'] = sample_mt

    sample_rows = sample_mt.rows()
    doctest_namespace['gnomad_data'] = sample_rows.select(sample_rows.info.AF)

    # BGEN
    bgen_mt = hl.import_bgen('data/example.8bits.bgen',
                             entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen_mt.rows()

    # Burden dataset: per-sample burden table, position-based row weight,
    # variant QC and gene intervals, checkpointed to output/example_burden.vds.
    burden_mt = hl.import_vcf('data/example_burden.vcf')
    burden_table = hl.import_table('data/example_burden.tsv',
                                   key='Sample',
                                   impute=True)
    burden_mt = burden_mt.annotate_cols(burden=burden_table[burden_mt.s])
    burden_mt = burden_mt.annotate_rows(
        weight=hl.float64(burden_mt.locus.position))
    burden_mt = hl.variant_qc(burden_mt)
    gene_intervals = hl.import_locus_intervals('data/gene.interval_list')
    burden_mt = burden_mt.annotate_rows(gene=gene_intervals[burden_mt.locus])
    burden_mt = burden_mt.checkpoint('output/example_burden.vds',
                                     overwrite=True)
    doctest_namespace['burden_ds'] = burden_mt

    # LD score regression summary statistics for a single phenotype.
    one_pheno_types = {
        'locus': hl.tlocus('GRCh37'),
        'alleles': hl.tarray(hl.tstr),
        'chi_squared': hl.tfloat64,
        'n': hl.tint32,
        'ld_score': hl.tfloat64,
        'phenotype': hl.tstr,
        'chi_squared_50_irnt': hl.tfloat64,
        'n_50_irnt': hl.tint32,
        'chi_squared_20160': hl.tfloat64,
        'n_20160': hl.tint32,
    }
    doctest_namespace['ld_score_one_pheno_sumstats'] = hl.import_table(
        'data/ld_score_regression.one_pheno.sumstats.tsv',
        types=one_pheno_types,
        key=['locus', 'alleles'])

    # LD score regression summary statistics for all phenotypes: imported as
    # strings, re-keyed, then each "chi_squared,n" entry is split and cast.
    all_pheno_row_fields = {
        'locus': hl.tstr,
        'alleles': hl.tstr,
        'ld_score': hl.tfloat64,
    }
    sumstats_mt = hl.import_matrix_table(
        'data/ld_score_regression.all_phenos.sumstats.tsv',
        row_fields=all_pheno_row_fields,
        entry_type=hl.tstr)
    sumstats_mt = sumstats_mt.key_cols_by(phenotype=sumstats_mt.col_id)
    sumstats_mt = sumstats_mt.key_rows_by(
        locus=hl.parse_locus(sumstats_mt.locus),
        alleles=sumstats_mt.alleles.split(','))
    sumstats_mt = sumstats_mt.drop('row_id', 'col_id')
    sumstats_mt = sumstats_mt.annotate_entries(x=sumstats_mt.x.split(","))
    sumstats_mt = sumstats_mt.transmute_entries(
        chi_squared=hl.float64(sumstats_mt.x[0]),
        n=hl.int32(sumstats_mt.x[1]))
    sumstats_mt = sumstats_mt.annotate_rows(
        ld_score=hl.float64(sumstats_mt.ld_score))
    doctest_namespace['ld_score_all_phenos_sumstats'] = sumstats_mt

    print("finished setting up doctest...")