Exemple #1
0
 def test_import_plink_empty_bim(self):
     """Importing a PLINK fileset exported from a row-less dataset must fail.

     The test dataset has all of its rows dropped, is exported to PLINK,
     and the re-import is expected to raise ``FatalError``.
     """
     plink_prefix = '/tmp/test_empty_bim'
     empty_mt = get_dataset().drop_rows()
     hl.export_plink(empty_mt, plink_prefix, ind_id=empty_mt.s)

     expected_error = ".bim file does not contain any variants"
     bed, bim, fam = (plink_prefix + ext for ext in ('.bed', '.bim', '.fam'))
     with self.assertRaisesRegex(FatalError, expected_error):
         hl.import_plink(bed, bim, fam)
Exemple #2
0
    def test_import_plink_skip_invalid_loci(self):
        """Exercise the ``skip_invalid_loci`` option of ``hl.import_plink``.

        With ``reference_genome='GRCh37'`` and ``skip_invalid_loci=True`` the
        fixture must import with exactly 3 rows; importing the same files
        with default options must raise ``FatalError`` matching
        ``'Invalid locus'``.
        """
        mt = hl.import_plink(resource('skip_invalid_loci.bed'),
                             resource('skip_invalid_loci.bim'),
                             resource('skip_invalid_loci.fam'),
                             reference_genome='GRCh37',
                             skip_invalid_loci=True)
        # assertEqual reports the actual row count on failure, whereas
        # assertTrue(a == b) only reports "False is not true".
        self.assertEqual(mt._force_count_rows(), 3)

        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            hl.import_plink(resource('skip_invalid_loci.bed'),
                            resource('skip_invalid_loci.bim'),
                            resource('skip_invalid_loci.fam'))
Exemple #3
0
    def test_import_plink_skip_invalid_loci(self):
        """Exercise the ``skip_invalid_loci`` option of ``hl.import_plink``.

        With ``reference_genome='GRCh37'`` and ``skip_invalid_loci=True`` the
        fixture must import with exactly 3 rows; importing the same files
        with default options must raise ``FatalError`` matching
        ``'Invalid locus'``.
        """
        mt = hl.import_plink(resource('skip_invalid_loci.bed'),
                             resource('skip_invalid_loci.bim'),
                             resource('skip_invalid_loci.fam'),
                             reference_genome='GRCh37',
                             skip_invalid_loci=True)
        # assertEqual reports the actual row count on failure, whereas
        # assertTrue(a == b) only reports "False is not true".
        self.assertEqual(mt._force_count_rows(), 3)

        with self.assertRaisesRegex(FatalError, 'Invalid locus'):
            hl.import_plink(resource('skip_invalid_loci.bed'),
                            resource('skip_invalid_loci.bim'),
                            resource('skip_invalid_loci.fam'))
Exemple #4
0
def test_plink(spark):
    """Check that a Hail PLINK import converted to a DataFrame matches Glow's reader.

    The same bed/bim/fam fileset is loaded through ``hl.import_plink`` (then
    ``functions.from_matrix_table``) and through Spark's ``plink`` format.
    Schemas are compared ignoring ``phased``, and the rows must be identical
    in both directions once the Glow genotypes are rewritten to Hail's
    conventions.
    """
    plink_prefix = 'test-data/plink/five-samples-five-variants/bed-bim-fam/test'
    bed_path = plink_prefix + '.bed'

    # Do not recode contigs (eg. 23 -> X)
    hail_mt = hl.import_plink(bed=bed_path,
                              bim=plink_prefix + '.bim',
                              fam=plink_prefix + '.fam',
                              reference_genome=None,
                              contig_recoding={})
    hail_df = functions.from_matrix_table(hail_mt)

    plink_reader = spark.read.format('plink').option('mergeFidIid', 'false')
    glow_df = plink_reader.load(bed_path)

    _compare_struct_types(hail_df.schema,
                          glow_df.schema,
                          ignore_fields=['phased'])

    # Hail leaves a missing genotype unset, while the Glow PLINK reader emits
    # calls of (-1, -1). Hail sets phased=False only when the genotype is
    # present, while Glow's value is always false. The expression below
    # rewrites the Glow genotypes so the two frames can be compared directly.
    genotype_fixup = fx.expr(
        "transform(genotypes, gt -> named_struct('sampleId', gt.sampleId, 'calls', ifnull(gt.calls, array(-1,-1)), 'phased', if(gt.calls = array(-1, -1), null, false)))"
    )
    matching_glow_df = glow_df.withColumn('genotypes', genotype_fixup)
    matching_hail_df = hail_df.select(*glow_df.schema.names)

    rows_only_in_hail = matching_hail_df.subtract(matching_glow_df).count()
    assert rows_only_in_hail == 0
    rows_only_in_glow = matching_glow_df.subtract(matching_hail_df).count()
    assert rows_only_in_glow == 0
    def test_linear_mixed_regression_full_rank(self):
        """Fit a mixed model on all chromosome 1 variants and test chromosome 3 variants.

        ``h_sq`` is compared with the FastLMM value to 6 places; betas and
        p-values for the chromosome 3 variants at position < 2005 are
        compared with stored reference values.
        """
        expected_h2 = 0.142761
        expected_h2_places = 6
        expected_betas = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]
        expected_pvals = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        covariates = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        # Model is built from every chromosome 1 variant.
        chr1 = mt.filter_rows(mt.locus.contig == '1')
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        model.fit()
        self.assertAlmostEqual(model.h_sq, expected_h2, places=expected_h2_places)

        # Association is tested on standardized chromosome 3 genotypes.
        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(stats=hl.agg.stats(chr3.GT.n_alt_alleles()))
        standardized = (chr3.GT.n_alt_alleles() - chr3.stats.mean) / chr3.stats.stdev
        ht = hl.linear_mixed_regression_rows(standardized, model)
        assert np.allclose(ht.beta.collect(), expected_betas)
        assert np.allclose(ht.p_value.collect(), expected_pvals)
    def test_linear_mixed_regression_low_rank(self):
        """Fit a low-rank mixed model on chromosome 1 (position < 200) and test chromosome 3 variants.

        The fitted model must report ``low_rank``; ``h_sq``, betas and
        p-values are compared with stored Hail reference values.
        """
        expected_h2 = 0.10001626
        expected_betas = [0.0073201542, 0.039969148, -0.036727875, 0.29852363, -0.049212500]
        expected_pvals = [0.90685162, 0.54839177, 0.55001054, 9.85247263e-07, 0.42796507]

        covariates = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        # Model is built from the chromosome 1 variants below position 200.
        on_chr1 = (mt.locus.contig == '1') & (mt.locus.position < 200)
        chr1 = mt.filter_rows(on_chr1)
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        model.fit()
        self.assertTrue(model.low_rank)
        self.assertAlmostEqual(model.h_sq, expected_h2)

        # Association is tested on standardized chromosome 3 genotypes.
        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(stats=hl.agg.stats(chr3.GT.n_alt_alleles()))
        standardized = (chr3.GT.n_alt_alleles() - chr3.stats.mean) / chr3.stats.stdev
        ht = hl.linear_mixed_regression_rows(standardized, model)
        assert np.allclose(ht.beta.collect(), expected_betas)
        assert np.allclose(ht.p_value.collect(), expected_pvals)
Exemple #7
0
def test_king_filtered_entries_no_error():
    """Run ``hl.king`` on genotypes with randomly filtered entries.

    The test has no assertions; it passes as long as forcing the result
    does not raise.
    """
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    fileset = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    mt = hl.import_plink(**fileset)

    # Keep each entry according to hl.rand_bool(0.5).
    keep_entry = hl.rand_bool(0.5)
    mt = mt.filter_entries(keep_entry)

    kinship = hl.king(mt.GT)
    kinship._force_count_rows()
Exemple #8
0
def test_king_large():
    """Compare ``hl.king`` on the fastlmmTest fileset with the stored ``.kin0.bgz`` reference."""
    prefix = resource('fastlmmTest')
    fileset = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    mt = hl.import_plink(reference_genome=None, **fileset)

    expected_kin0 = resource('fastlmmTest.kin0.bgz')
    assert_c_king_same_as_hail_king(expected_kin0, hl.king(mt.GT))
Exemple #9
0
def read_plink(dirname: str, basename: str) -> hl.MatrixTable:
    """Import the PLINK fileset ``dirname + basename + {.bed,.bim,.fam}``.

    ``dirname`` is concatenated with ``basename`` as-is, so it must already
    end with a path separator. The import uses ``block_size=16``.
    """
    prefix = dirname + basename
    return hl.import_plink(bed=prefix + '.bed',
                           bim=prefix + '.bim',
                           fam=prefix + '.fam',
                           block_size=16)
Exemple #10
0
 def test_import_plink_no_reference_specified(self):
     """With ``reference_genome=None`` the locus must be a plain struct.

     The expected type is ``struct{contig: str, position: int32}``.
     """
     bfile = resource('fastlmmTest')
     plink = hl.import_plink(bfile + '.bed',
                             bfile + '.bim',
                             bfile + '.fam',
                             reference_genome=None)
     expected_locus_type = hl.tstruct(contig=hl.tstr, position=hl.tint32)
     # assertEqual shows both types on failure, whereas assertTrue(a == b)
     # only reports "False is not true".
     self.assertEqual(plink.locus.dtype, expected_locus_type)
Exemple #11
0
def test_king_small():
    """Compare ``hl.king`` on the small Balding-Nichols fileset with the stored ``.kin0`` reference."""
    prefix = resource('balding-nichols-1024-variants-4-samples-3-populations')
    fileset = {ext: f'{prefix}.{ext}' for ext in ('bed', 'bim', 'fam')}
    mt = hl.import_plink(**fileset)

    kinship = hl.king(mt.GT)
    expected_kin0 = resource(
        'balding-nichols-1024-variants-4-samples-3-populations.kin0')
    assert_c_king_same_as_hail_king(expected_kin0, kinship)
Exemple #12
0
def import_from_plink(ref_panel):
    """Write the transposed alt-allele-count matrix of a PLINK reference panel.

    Reads ``{wd}/{ref_panel}.bed/.bim/.fam`` (``wd`` is a module-level
    name), builds a BlockMatrix of ``GT.n_alt_alleles()``, transposes it and
    writes it to ``{wd}/{ref_panel}.X.bm``, overwriting any existing output.
    """
    panel_prefix = f'{wd}/{ref_panel}'
    mt = hl.import_plink(bed=f'{panel_prefix}.bed',
                         bim=f'{panel_prefix}.bim',
                         fam=f'{panel_prefix}.fam')

    alt_counts = hl.linalg.BlockMatrix.from_entry_expr(mt.GT.n_alt_alleles())
    alt_counts.T.write(f'{panel_prefix}.X.bm', overwrite=True)
Exemple #13
0
def load_data_plink(dirname, basename):
    """
    Import a plink dataset and pass it through ``filter_snps(mt, 0.1)``.

    :param dirname: plink file directory name (concatenated with basename as-is)
    :param basename: plink base filename
    :return: the MatrixTable returned by ``filter_snps``
    """
    prefix = dirname + basename
    imported = hl.import_plink(bed=prefix + '.bed',
                               bim=prefix + '.bim',
                               fam=prefix + '.fam',
                               min_partitions=100)
    return filter_snps(imported, 0.1)
def get_ukbb_plink_data(data_source) -> "hl.MatrixTable":
    """
    Import the UK Biobank PLINK data for the given data source.

    :param data_source: ``"UKBB_regeneron"`` or ``"UKBB_gatk"``
    :return: MatrixTable imported with ``reference_genome="GRCh38"`` and
        ``skip_invalid_loci=True`` (the previous ``-> str`` annotation was
        wrong: the imported table itself is returned)
    :raises DataException: if ``data_source`` is any other value
    """
    if data_source == "UKBB_regeneron":
        bed = "gs://fc-fdd512d3-61cc-4e5e-8701-c33342a9feb4/wave01/plink/ukb_evc_chr1_v1.bed"
        bim = "gs://fc-fdd512d3-61cc-4e5e-8701-c33342a9feb4/wave01/plink/ukb_spb_exm_chrall_v1.bim"
        fam_name = "ukb27892_evc_chr1_v1_s49959.fam"
    elif data_source == "UKBB_gatk":
        bed = "gs://fc-72d33328-e60d-4e5a-96e2-03fe2a0c8ae8/wave01/plink/ukb_efe_chr1_v1.bed"
        bim = "gs://fc-72d33328-e60d-4e5a-96e2-03fe2a0c8ae8/wave01/plink/ukb_fe_exm_chrall_v1.bim"
        fam_name = "ukb27892_efe_chr1_v1_s49959.fam"
    else:
        raise DataException("This data_source doesn't have plink data")

    # NOTE(review): a chr1 .bed is paired with a "chrall" .bim in both
    # branches -- confirm that these files really belong together.
    return hl.import_plink(
        bed=bed,
        bim=bim,
        fam=f'{data_prefix(data_source)}/{fam_name}',
        reference_genome="GRCh38",
        skip_invalid_loci=True)
Exemple #15
0
def main():
    """Lift a GRCh37 PLINK fileset over to GRCh38 and export it as PLINK.

    Loci are lifted with the chain file from the parsed arguments, re-keyed
    on a custom GRCh38 reference whose contig names have ``chr`` removed,
    rows without a lifted locus are dropped, and the result is written to
    ``args.out_plink``. Returns 0.
    """
    args = parse_args()

    # Register the chain file on GRCh37 so loci can be lifted to GRCh38.
    grch37 = hl.get_reference('GRCh37')
    grch38 = hl.get_reference('GRCh38')
    grch37.add_liftover(args.chainfile, grch38)

    # Custom copy of GRCh38 whose contig names have 'chr' removed.
    bare_contigs = [name.replace('chr', '') for name in grch38.contigs]
    bare_lengths = {
        name.replace('chr', ''): length
        for name, length in grch38.lengths.items()
    }
    rg38_custom = hl.ReferenceGenome('rg38_custom', bare_contigs, bare_lengths)

    # Load the input PLINK fileset.
    mt = hl.import_plink(bed=args.in_plink + '.bed',
                         bim=args.in_plink + '.bim',
                         fam=args.in_plink + '.fam',
                         reference_genome='GRCh37',
                         min_partitions=args.min_partitions)

    # # Re-call to remove phasing (required for plink output)
    # mt = mt.annotate_entries(GT=hl.call(mt.GT[0], mt.GT[1], phased=False))

    # Lift each locus, then strip 'chr' from the lifted contig name
    # (the prefix causes problems with GCTA).
    lifted = hl.liftover(mt.locus, 'GRCh38')
    mt = mt.annotate_rows(locus_GRCh38=lifted)
    stripped_contig = mt.locus_GRCh38.contig.replace('chr', '')
    mt = mt.annotate_rows(contig_GRCh38=stripped_contig)

    # Replace the GRCh37 row key with the lifted locus; the locus has to be
    # built against rg38_custom because its contig names no longer carry 'chr'.
    mt = mt.key_rows_by()
    new_locus = hl.locus(mt.contig_GRCh38,
                         mt.locus_GRCh38.position,
                         reference_genome=rg38_custom)
    mt = mt.annotate_rows(locus=new_locus)
    mt = mt.key_rows_by(mt.locus, mt.alleles)

    # Drop rows whose locus is missing after liftover.
    mt = mt.filter_rows(hl.is_defined(mt.locus))

    hl.export_plink(dataset=mt, output=args.out_plink)

    return 0
Exemple #16
0
def load_ref(dirname, basename):
    """
    Import a reference plink dataset, describe it, print its dimensions and
    write it to ``dirname + basename + '.mt'``.

    The overwrite flag is read from the module-level ``args.overwrite``.

    :param dirname: plink file directory name (concatenated with basename as-is)
    :param basename: plink base filename
    :return: None
    """
    prefix = dirname + basename
    ref = hl.import_plink(bed=prefix + '.bed',
                          bim=prefix + '.bim',
                          fam=prefix + '.fam',
                          min_partitions=100)
    ref.describe()

    dimensions = ref.count()  # (639590, 3547)
    print('sites in ref data: ' + str(dimensions))
    ref.write(prefix + '.mt', args.overwrite)
Exemple #17
0
    def test_export_import_plink_same(self):
        """Round-trip a dataset through PLINK export and import and require equality.

        Row and column fields are first reduced to what the PLINK format
        carries (rsid, cm_position, fam/pat/mat ids, sex, case status), with
        ``cm_position`` fixed at 15.0 so its round trip can be checked.
        """
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([mt.locus.contig, hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr), pat_id=hl.null(hl.tstr), mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool), is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                                      a2_reference=True, reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        # Check the re-imported table: asserting on `mt` only re-checked the
        # constant assigned above and could never fail.
        self.assertTrue(mt_imported.aggregate_rows(hl.agg.all(mt_imported.cm_position == 15.0)))
Exemple #18
0
    def test_import_plink_contig_recoding_w_reference(self):
        """Export a GRCh38 VCF to PLINK and re-import it recoded onto GRCh37.

        The VCF is imported with contig ``22`` recoded to ``chr22``; the
        PLINK import recodes ``chr22`` back to ``22`` against GRCh37. All
        loci must be on contig ``22``, the row count must be preserved, and
        the locus type must be ``locus<GRCh37>``.
        """
        bfile = '/tmp/sample_plink'

        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, bfile)

        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        # Was assertTrue(dtype, expected): there the second argument is only
        # the failure message, so the type was never actually compared.
        self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
Exemple #19
0
    def test_import_plink(self):
        """Export a GRCh38 VCF to PLINK and re-import it recoded onto GRCh37.

        The VCF is imported with contig ``22`` recoded to ``chr22``; the
        PLINK import recodes ``chr22`` back to ``22`` against GRCh37. All
        loci must be on contig ``22``, the row count must be preserved, and
        the locus type must be ``locus<GRCh37>``.
        """
        bfile = '/tmp/sample_plink'

        vcf = hl.split_multi_hts(
            hl.import_vcf(resource('sample2.vcf'),
                          reference_genome=hl.get_reference('GRCh38'),
                          contig_recoding={"22": "chr22"}))

        hl.export_plink(vcf, bfile)

        plink = hl.import_plink(
            bfile + '.bed', bfile + '.bim', bfile + '.fam',
            a2_reference=True,
            contig_recoding={'chr22': '22'},
            reference_genome='GRCh37').rows()
        self.assertTrue(plink.all(plink.locus.contig == "22"))
        self.assertEqual(vcf.count_rows(), plink.count())
        # Was assertTrue(dtype, expected): there the second argument is only
        # the failure message, so the type was never actually compared.
        self.assertEqual(plink.locus.dtype, hl.tlocus('GRCh37'))
Exemple #20
0
    def test_linear_mixed_regression_low_rank(self):
        """Fit a low-rank mixed model on chromosome 1 (position < 200) and test chromosome 3 variants.

        The fitted model must report ``low_rank``; ``h_sq``, betas and
        p-values are compared with stored Hail reference values.
        """
        expected_h2 = 0.10001626
        expected_betas = [
            0.0073201542, 0.039969148, -0.036727875, 0.29852363, -0.049212500
        ]
        expected_pvals = [
            0.90685162, 0.54839177, 0.55001054, 9.85247263e-07, 0.42796507
        ]

        covariates = hl.import_table(resource('fastlmmCov.txt'),
                                     no_header=True,
                                     impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'),
                                     no_header=True,
                                     impute=True,
                                     delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        # Model is built from the chromosome 1 variants below position 200.
        on_chr1 = (mt.locus.contig == '1') & (mt.locus.position < 200)
        chr1 = mt.filter_rows(on_chr1)
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        model.fit()
        self.assertTrue(model.low_rank)
        self.assertAlmostEqual(model.h_sq, expected_h2)

        # Association is tested on standardized chromosome 3 genotypes.
        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(
            stats=hl.agg.stats(chr3.GT.n_alt_alleles()))
        standardized = ((chr3.GT.n_alt_alleles() - chr3.stats.mean) /
                        chr3.stats.stdev)
        ht = hl.linear_mixed_regression_rows(standardized, model)
        assert np.allclose(ht.beta.collect(), expected_betas)
        assert np.allclose(ht.p_value.collect(), expected_pvals)
Exemple #21
0
    def test_linear_mixed_regression_full_rank(self):
        """Fit a mixed model on all chromosome 1 variants and test chromosome 3 variants.

        ``h_sq`` is compared with the FastLMM value to 6 places; betas and
        p-values for the chromosome 3 variants at position < 2005 are
        compared with stored reference values.
        """
        expected_h2 = 0.142761
        expected_h2_places = 6
        expected_betas = [
            0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170
        ]
        expected_pvals = [
            0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204
        ]

        covariates = hl.import_table(resource('fastlmmCov.txt'),
                                     no_header=True,
                                     impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'),
                                     no_header=True,
                                     impute=True,
                                     delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        # Model is built from every chromosome 1 variant.
        chr1 = mt.filter_rows(mt.locus.contig == '1')
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        model.fit()
        self.assertAlmostEqual(model.h_sq,
                               expected_h2,
                               places=expected_h2_places)

        # Association is tested on standardized chromosome 3 genotypes.
        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(
            stats=hl.agg.stats(chr3.GT.n_alt_alleles()))
        standardized = ((chr3.GT.n_alt_alleles() - chr3.stats.mean) /
                        chr3.stats.stdev)
        ht = hl.linear_mixed_regression_rows(standardized, model)
        assert np.allclose(ht.beta.collect(), expected_betas)
        assert np.allclose(ht.p_value.collect(), expected_pvals)
    def test_linear_mixed_regression_pass_through(self):
        """Check that ``pass_through`` row fields reach the regression result table.

        A random nested field ``foo.bar`` is added to the chromosome 3 rows
        and passed through together with ``stats`` and ``cm_position``; the
        ``bar`` value in the result must equal the source value on every row.
        """
        covariates = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        on_chr1 = (mt.locus.contig == '1') & (mt.locus.position < 200)
        chr1 = mt.filter_rows(on_chr1)
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        # Fit with log_gamma fixed at 0.
        model.fit(log_gamma=0)

        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(stats=hl.agg.stats(chr3.GT.n_alt_alleles()),
                                  foo=hl.struct(bar=hl.rand_norm(0, 1)))
        standardized = (chr3.GT.n_alt_alleles() - chr3.stats.mean) / chr3.stats.stdev
        ht = hl.linear_mixed_regression_rows(
            standardized,
            model,
            pass_through=['stats', chr3.foo.bar, chr3.cm_position])

        bars_match = hl.agg.all(chr3.foo.bar == ht[chr3.row_key].bar)
        assert chr3.aggregate_rows(bars_match)
Exemple #23
0
    def test_linear_mixed_regression_pass_through(self):
        """Check that ``pass_through`` row fields reach the regression result table.

        A random nested field ``foo.bar`` is added to the chromosome 3 rows
        and passed through together with ``stats`` and ``cm_position``; the
        ``bar`` value in the result must equal the source value on every row.
        """
        covariates = hl.import_table(resource('fastlmmCov.txt'),
                                     no_header=True,
                                     impute=True)
        covariates = covariates.key_by('f1')
        phenotypes = hl.import_table(resource('fastlmmPheno.txt'),
                                     no_header=True,
                                     impute=True,
                                     delimiter=' ')
        phenotypes = phenotypes.key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=covariates[mt.col_key].f2)
        mt = mt.annotate_cols(y=phenotypes[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        on_chr1 = (mt.locus.contig == '1') & (mt.locus.position < 200)
        chr1 = mt.filter_rows(on_chr1)
        model, _ = hl.linear_mixed_model(y=chr1.y,
                                         x=[1, chr1.x],
                                         z_t=chr1.GT.n_alt_alleles(),
                                         p_path=p_path)
        # Fit with log_gamma fixed at 0.
        model.fit(log_gamma=0)

        on_chr3 = (mt.locus.contig == '3') & (mt.locus.position < 2005)
        chr3 = mt.filter_rows(on_chr3)
        chr3 = chr3.annotate_rows(
            stats=hl.agg.stats(chr3.GT.n_alt_alleles()),
            foo=hl.struct(bar=hl.rand_norm(0, 1)))
        standardized = ((chr3.GT.n_alt_alleles() - chr3.stats.mean) /
                        chr3.stats.stdev)
        ht = hl.linear_mixed_regression_rows(
            standardized,
            model,
            pass_through=['stats', chr3.foo.bar, chr3.cm_position])

        bars_match = hl.agg.all(chr3.foo.bar == ht[chr3.row_key].bar)
        assert chr3.aggregate_rows(bars_match)
Exemple #24
0
def get_ref_X(ref_panel, overwrite=False):
    r'''
    Return the standardized genotype BlockMatrix of an LD reference panel.

    The matrix is rebuilt from ``{bucket}/{ref_panel}.bed/.bim/.fam`` when
    ``overwrite`` is true or when no ``_SUCCESS`` marker exists under
    ``{bucket}/{ref_panel}.X.bm``; otherwise the stored matrix is reused.
    Each variant is standardized with its own mean and stdev, and the
    matrix is transposed before being written. The result is always read
    back from disk.
    '''
    X_bm_path = f'{bucket}/{ref_panel}.X.bm'
    panel_prefix = f'{bucket}/{ref_panel}'

    if overwrite or not hl.hadoop_is_file(f'{X_bm_path}/_SUCCESS'):
        mt = hl.import_plink(bed=f'{panel_prefix}.bed',
                             bim=f'{panel_prefix}.bim',
                             fam=f'{panel_prefix}.fam')

        # Per-variant mean and stdev of the alt allele count.
        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        centered = mt.GT.n_alt_alleles() - mt.stats.mean
        mt = mt.annotate_entries(X=centered / mt.stats.stdev)

        standardized = BlockMatrix.from_entry_expr(mt.X)
        standardized.T.write(X_bm_path, overwrite=True)

    return BlockMatrix.read(X_bm_path)
Exemple #25
0
    def test_export_import_plink_same(self):
        """Round-trip a dataset through PLINK export and import and require equality.

        Row and column fields are first reduced to what the PLINK format
        carries (rsid, cm_position, fam/pat/mat ids, sex, case status), with
        ``cm_position`` fixed at 15.0 so its round trip can be checked.
        """
        mt = get_dataset()
        mt = mt.select_rows(rsid=hl.delimit([
            mt.locus.contig,
            hl.str(mt.locus.position), mt.alleles[0], mt.alleles[1]
        ], ':'),
                            cm_position=15.0)
        mt = mt.select_cols(fam_id=hl.null(hl.tstr),
                            pat_id=hl.null(hl.tstr),
                            mat_id=hl.null(hl.tstr),
                            is_female=hl.null(hl.tbool),
                            is_case=hl.null(hl.tbool))
        mt = mt.select_entries('GT')

        bfile = '/tmp/test_import_export_plink'
        hl.export_plink(mt, bfile, ind_id=mt.s, cm_position=mt.cm_position)

        mt_imported = hl.import_plink(bfile + '.bed',
                                      bfile + '.bim',
                                      bfile + '.fam',
                                      a2_reference=True,
                                      reference_genome='GRCh37')
        self.assertTrue(mt._same(mt_imported))
        # Check the re-imported table: asserting on `mt` only re-checked the
        # constant assigned above and could never fail.
        self.assertTrue(
            mt_imported.aggregate_rows(
                hl.agg.all(mt_imported.cm_position == 15.0)))
    def test_linear_mixed_model_fastlmm(self):
        """Check ``LinearMixedModel`` against stored FaST-LMM reference values.

        For chromosome 1 genotypes, a full-rank and a low-rank model are
        fitted and their ``h_sq`` and ``h_sq_standard_error`` compared with
        reference values; betas and p-values for the chromosome 3 variants
        at position < 2005 are checked through both
        ``fit_alternatives_numpy`` and ``fit_alternatives``. The ``h_sq``
        checks are then repeated with chromosome 3 genotypes.
        """
        # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosome 1,3 and samples 0-124,375-499 (2000 variants and 250 samples)
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        # Design matrix: a column of ones next to the covariate column.
        x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
        y = np.array(mt.key_cols_by()['y'].collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # out x from each SNP first and does an F(1, dof)-test on (beta / se)^2
        # (t-test), whereas Hail does likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        # Drop the first and last entries of the normalized likelihood before
        # locating its maximum and checking that it sums to 1.
        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
        pa = p @ a

        # Refit with gamma fixed at the value implied by the FastLMM h2.
        model.fit(log_gamma=np.log(gamma_fastlmm))

        res = model.fit_alternatives_numpy(pa, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # Same check through the BlockMatrix-on-disk code path.
        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        # Keep only the eigenpairs whose eigenvalue exceeds 1e-10 (eigh
        # returns eigenvalues in ascending order, so they are the last ones).
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
# Alternative configuration (disabled): genetically unrelated phenotypes
# with h2 = 0.1, 0.3

# h2 = [0.1, 0.1, 0.1, 0.1, 0.3, 0.3, 0.3, 0.3]
# rg = [0]*28
# out = 'gs://.../sim_350k_uncorrelated.mt'

# Active configuration: genetically correlated phenotypes sharing h2 = 0.3,
# with different rgs plus one uncorrelated phenotype.
# NOTE(review): rg has 8 + 28 = 36 entries, which matches the number of
# unordered pairs of 9 phenotypes -- confirm the ordering the consumer expects.

h2 = [0.3] * 9
rg = [-0.5, -0.3, -0.1, 0, 0.1, 0.3, 0.5, 0] + [0] * 28
out = 'gs://.../sim_350k_correlated.mt'

# # Simulations
# Import UKBB hm3 genotype data: chromosome 1 first, then chromosomes 2-22
# are appended row-wise.
mt = hl.import_plink(bed='gs://.../ukb_imp_chr1_v3.bed',
                     bim='gs://.../ukb_imp_chr1_v3.bim',
                     fam='gs://.../ukb_imp_chr1_v3.fam',
                     reference_genome='GRCh37')

for chrom in range(2, 23):
    mtT = hl.import_plink(bed=f'gs://.../ukb_imp_chr{chrom}_v3.bed',
                          bim=f'gs://.../ukb_imp_chr{chrom}_v3.bim',
                          fam=f'gs://.../ukb_imp_chr{chrom}_v3.fam',
                          reference_genome='GRCh37')
    mt = mt.union_rows(mtT)

# Keep only unrelated (~361k samples): the sample table is keyed by the
# string form of its `s` column so it can be joined against the columns.
tb2 = hl.import_table('gs://.../unrelated_samples.txt', impute=True)
tb2 = tb2.annotate(s_str=hl.str(tb2.s))
tb2 = tb2.key_by('s_str')
mt = mt.semi_join_cols(tb2)
mt = mt.add_col_index()
# Re-key the columns by their index.
mt = mt.annotate_cols(s_index=mt.col_idx)
mt = mt.key_cols_by('s_index')
Exemple #28
0
    def test_ld_score(self):
        """Check univariate and annotated LD scores on the ``ldsc`` doctest data.

        Scores are spot-checked at 20:82079 and 22:16894090, and their means
        are compared with fixed reference values to three decimal places.
        """

        def first_at(table, contig, position, field):
            # First collected value of `field` among rows at contig:position.
            at_site = ((table.locus.contig == contig) &
                       (table.locus.position == position))
            return hl.agg.filter(at_site, hl.agg.collect(field))[0]

        column_types = {
            'BP': hl.tint,
            'CM': hl.tfloat,
            'binary': hl.tint,
            'continuous': hl.tfloat
        }
        ht = hl.import_table(doctest_resource('ldsc.annot'),
                             types=column_types)
        ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP)).key_by('locus')

        mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                             bim=doctest_resource('ldsc.bim'),
                             fam=doctest_resource('ldsc.fam'))
        mt = mt.annotate_rows(binary=ht[mt.locus].binary,
                              continuous=ht[mt.locus].continuous)

        # Both score tables share everything except the annotation list.
        shared_args = dict(entry_expr=mt.GT.n_alt_alleles(),
                           locus_expr=mt.locus,
                           radius=1.0,
                           coord_expr=mt.cm_position)
        ht_univariate = hl.experimental.ld_score(**shared_args)
        ht_annotated = hl.experimental.ld_score(
            annotation_exprs=[mt.binary, mt.continuous], **shared_args)

        uni = ht_univariate.univariate
        univariate = ht_univariate.aggregate(
            hl.struct(chr20=first_at(ht_univariate, '20', 82079, uni),
                      chr22=first_at(ht_univariate, '22', 16894090, uni),
                      mean=hl.agg.mean(uni)))

        for observed, expected in ((univariate.chr20, 1.601),
                                   (univariate.chr22, 1.140),
                                   (univariate.mean, 3.507)):
            self.assertAlmostEqual(observed, expected, places=3)

        binary = ht_annotated.binary
        continuous = ht_annotated.continuous
        annotated = ht_annotated.aggregate(
            hl.struct(
                chr20=hl.struct(
                    binary=first_at(ht_annotated, '20', 82079, binary),
                    continuous=first_at(ht_annotated, '20', 82079,
                                        continuous)),
                chr22=hl.struct(
                    binary=first_at(ht_annotated, '22', 16894090, binary),
                    continuous=first_at(ht_annotated, '22', 16894090,
                                        continuous)),
                mean_stats=hl.struct(binary=hl.agg.mean(binary),
                                     continuous=hl.agg.mean(continuous))))

        for observed, expected in ((annotated.chr20.binary, 1.152),
                                   (annotated.chr20.continuous, 73.014),
                                   (annotated.chr22.binary, 1.107),
                                   (annotated.chr22.continuous, 102.174),
                                   (annotated.mean_stats.binary, 0.965),
                                   (annotated.mean_stats.continuous, 176.528)):
            self.assertAlmostEqual(observed, expected, places=3)
Exemple #29
0
 def test_import_plink_empty_fam(self):
     """Importing a fileset exported with every column filtered out must fail."""
     prefix = '/tmp/test_empty_fam'
     no_samples = get_dataset().filter_cols(False)
     hl.export_plink(no_samples, prefix, ind_id=no_samples.s)
     bed, bim, fam = (prefix + ext for ext in ('.bed', '.bim', '.fam'))
     with self.assertRaisesRegex(FatalError, "Empty .fam file"):
         hl.import_plink(bed, bim, fam)
Exemple #30
0
 def test_import_plink_empty_bim(self):
     """Importing a fileset exported with every row filtered out must fail."""
     prefix = '/tmp/test_empty_bim'
     no_variants = get_dataset().filter_rows(False)
     hl.export_plink(no_variants, prefix, ind_id=no_variants.s)
     fileset = [prefix + suffix for suffix in ('.bed', '.bim', '.fam')]
     expected_error = ".bim file does not contain any variants"
     with self.assertRaisesRegex(FatalError, expected_error):
         hl.import_plink(*fileset)
Exemple #31
0
 def get_data(a2_reference):
     """Import the ``bfile`` fileset and return its variant-QC rows keyed by rsid."""
     paths = [bfile + ext for ext in ('.bed', '.bim', '.fam')]
     imported = hl.import_plink(*paths, a2_reference=a2_reference)
     qc_rows = hl.variant_qc(imported).rows()
     return qc_rows.key_by('rsid')
Exemple #32
0
 def get_data(a2_reference):
     """Return variant-QC rows, keyed by rsid, for the fileset at ``bfile``."""
     bed_path = bfile + '.bed'
     bim_path = bfile + '.bim'
     fam_path = bfile + '.fam'
     with_qc = hl.variant_qc(
         hl.import_plink(bed_path, bim_path, fam_path,
                         a2_reference=a2_reference))
     rows = with_qc.rows()
     return rows.key_by('rsid')
Exemple #33
0
 def test_import_plink_empty_fam(self):
     """Importing a fileset exported after ``drop_cols()`` must fail."""
     target = '/tmp/test_empty_fam'
     without_cols = get_dataset().drop_cols()
     hl.export_plink(without_cols, target, ind_id=without_cols.s)
     bed_path = target + '.bed'
     bim_path = target + '.bim'
     fam_path = target + '.fam'
     with self.assertRaisesRegex(FatalError, "Empty .fam file"):
         hl.import_plink(bed_path, bim_path, fam_path)
Exemple #34
0
 def test_import_plink_no_reference_specified(self):
     """With ``reference_genome=None`` the locus is a (contig, position) struct."""
     bfile = resource('fastlmmTest')
     plink = hl.import_plink(bfile + '.bed', bfile + '.bim', bfile + '.fam',
                             reference_genome=None)
     # assertEqual reports both dtypes when they differ, whereas
     # assertTrue(a == b) would only say "False is not true".
     self.assertEqual(
         plink.locus.dtype,
         hl.tstruct(contig=hl.tstr, position=hl.tint32))
Exemple #35
0
def read_plink(dirname: str, basename: str, reference: str = 'GRCh38') -> hl.MatrixTable:
    """Import the PLINK fileset ``dirname + basename + {.bed, .bim, .fam}``.

    Args:
        dirname: Path prefix. It is concatenated with ``basename`` verbatim,
            so it must already end with a path separator if one is needed.
        basename: File name shared by the three PLINK files, without extension.
        reference: Name of the reference genome used for the import.

    Returns:
        The imported MatrixTable.
    """
    # idempotent=True turns this into a no-op when Hail is already running, so
    # the helper can be called repeatedly or after the caller's own hl.init().
    hl.init(default_reference=reference, idempotent=True)
    prefix = dirname + basename
    # Pass the reference explicitly: if init was a no-op, the session default
    # may not be the reference requested here.
    return hl.import_plink(bed=prefix + '.bed',
                           bim=prefix + '.bim',
                           fam=prefix + '.fam',
                           reference_genome=reference)
Exemple #36
0
    def test_export_plink_exprs(self):
        """Exercise ``hl.export_plink`` with default and custom expressions.

        Each scenario exports the dataset to a fresh temp prefix, then reads
        the resulting ``.fam``/``.bim`` back as text tables without type
        imputation (or re-imports the whole fileset) and checks the columns.
        """
        ds = get_dataset()
        # Headerless imports name their columns f0..f5; map them to FAM fields.
        fam_mapping = {
            'f0': 'fam_id',
            'f1': 'ind_id',
            'f2': 'pat_id',
            'f3': 'mat_id',
            'f4': 'is_female',
            'f5': 'pheno'
        }
        # Same idea for the six BIM columns.
        bim_mapping = {
            'f0': 'contig',
            'f1': 'varid',
            'f2': 'cm_position',
            'f3': 'position',
            'f4': 'a1',
            'f5': 'a2'
        }

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0")
                     & (fam1.mat_id == "0") & (fam1.is_female == "0")
                     & (fam1.pheno == "NA")))
        # Default varid is expected to be contig:position:a2:a1.
        self.assertTrue(
            bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" +
                      bim1.a2 + ":" + bim1.a1) & (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds,
                        out2,
                        ind_id=ds.s,
                        fam_id=ds.s,
                        pat_id="nope",
                        mat_id="nada",
                        is_female=True,
                        pheno=False)
        fam2 = (hl.import_table(out2 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        # is_female=True is expected to be written as "2", pheno=False as "1".
        self.assertTrue(
            fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope")
                     & (fam2.mat_id == "nada") & (fam2.is_female == "2")
                     & (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam',
                                no_header=True,
                                impute=False,
                                missing="").rename(fam_mapping))

        self.assertTrue(
            fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0")
                     & (fam3.mat_id == "0") & (fam3.is_female == "0")
                     & (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True,
                                impute=False).rename(bim_mapping))

        self.assertTrue(
            bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        # Export an all-hom-ref call field, then re-import and count any
        # entry that is not hom-ref.
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim',
                                         out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(
            hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError,
                                    "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
    output=
    'gs://unicorn-resources/Ashkenazi_Jewish_Samples/Ashkenazi_Jewish_Samples',
    varid=mt_AJ.rsid,
    cm_position=mt_AJ.cm_position)

######################################################################################################
# This part of the code is to convert 1KG PLINK BFILE to hail matrix table                           #
# The source of the 1KG PLINK bfiles:                                                                #
#    `/psych/genetics_data/ripke/references_outdated/hapmap_ref/impute2_ref/1KG_Aug12/               #
#      ALL_1000G_phase1integrated_v3_impute_macGT1/4pops/qc/pop_4pop_mix_SEQ`                        #
# It is a cleaned PLINK BFILE generated by Stephan.                                                  #
######################################################################################################
# Load the 1KG PLINK fileset from the bucket, passing min_partitions=500.
mt_1kg = hl.import_plink(
    bed=
    'gs://unicorn-resources/1000_genomes/mix/mix_ready4QC_AF_HRC/mix.all.final.bed',
    bim=
    'gs://unicorn-resources/1000_genomes/mix/mix_ready4QC_AF_HRC/mix.all.final.bim',
    fam=
    'gs://unicorn-resources/1000_genomes/mix/mix_ready4QC_AF_HRC/mix.all.final.fam',
    min_partitions=500)
# The super-population label is the fourth "_"-separated token of fam_id.
mt_1kg = mt_1kg.annotate_cols(super_population=mt_1kg.fam_id.split("_")[3])
# Persist as a Hail MatrixTable, replacing any existing output at that path.
mt_1kg.write("gs://unicorn-resources/1000_genomes/pop_4pop_mix_SEQ.mt",
             overwrite=True)

######################################################################################################
# This part of the code is used to generate a reference for EUR samples                              #
# The source of the 1KG BFILES can be found at                                                       #
#    `/psych/genetics_data/ripke/references_outdated/hapmap_ref/impute2_ref/1KG_Aug12/               #
#       ALL_1000G_phase1integrated_v3_impute_macGT1/4pops/qc/pop_euro_eur_SEQ`                       #
# The file `ALL_1000G_phase1integrated_feb2012.sample.nohead.fam` in the same directory contains     #
#    the population label                                                                            #
######################################################################################################
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

# ## key step
# ### 1. extract pca info, transform it to dataframe
# ### 2. build linear regression model, predict y and get y residuals
# ### 3. store y residuals in hail MatrixTable
# ### 4. run gwas and compare time

# Genotypes: load the PLINK fileset (BED, BIM, FAM) as a MatrixTable.
vds = hl.import_plink(bed='gs://ukb_testdata/maf_0.01_10.bed',
                      bim='gs://ukb_testdata/maf_0.01_10.bim',
                      fam='gs://ukb_testdata/maf_0.01_10.fam')

# Phenotypes: headerless tab-delimited text file read as a Table.
# Column f0 is forced to string; the remaining column types are imputed.
table = hl.import_table(
    'gs://ukb_testdata/sleep_duration.tsv',
    delimiter='\t',
    no_header=True,
    missing='NA',
    impute=True,
    types={'f0': hl.tstr},
)
table = table.key_by('f0')
# table.show()
# Attach every field of the matching phenotype row to each sample column.
vds = vds.annotate_cols(**table[vds.s])
# print(vds.col.dtype.pretty())
Exemple #39
0
    def test_export_plink_exprs(self):
        """Round-trip ``hl.export_plink`` output for several argument sets.

        Covers default arguments, custom FAM fields, a quantitative
        phenotype, custom BIM fields, a custom call expression, and the
        errors raised for white space in ID expressions.
        """
        ds = get_dataset()
        fam_mapping = {
            'f0': 'fam_id',
            'f1': 'ind_id',
            'f2': 'pat_id',
            'f3': 'mat_id',
            'f4': 'is_female',
            'f5': 'pheno',
        }
        bim_mapping = {
            'f0': 'contig',
            'f1': 'varid',
            'f2': 'cm_position',
            'f3': 'position',
            'f4': 'a1',
            'f5': 'a2',
        }

        def load_fam(prefix):
            # Read <prefix>.fam as text columns and give them FAM field names.
            raw = hl.import_table(prefix + '.fam', no_header=True, impute=False, missing="")
            return raw.rename(fam_mapping)

        def load_bim(prefix):
            # Read <prefix>.bim as text columns and give them BIM field names.
            raw = hl.import_table(prefix + '.bim', no_header=True, impute=False)
            return raw.rename(bim_mapping)

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = load_fam(out1)
        bim1 = load_bim(out1)

        fam1_defaults = ((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                         (fam1.mat_id == "0") & (fam1.is_female == "0") &
                         (fam1.pheno == "NA"))
        self.assertTrue(fam1.all(fam1_defaults))
        default_varid = bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1
        bim1_defaults = (bim1.varid == default_varid) & (bim1.cm_position == "0.0")
        self.assertTrue(bim1.all(bim1_defaults))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds,
                        out2,
                        ind_id=ds.s,
                        fam_id=ds.s,
                        pat_id="nope",
                        mat_id="nada",
                        is_female=True,
                        pheno=False)
        fam2 = load_fam(out2)

        fam2_custom = ((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                       (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                       (fam2.pheno == "1"))
        self.assertTrue(fam2.all(fam2_custom))

        # Test quantitative phenotype
        out3 = new_temp_file()
        quantitative = hl.float64(hl.len(ds.s))
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=quantitative)
        fam3 = load_fam(out3)

        fam3_quant = ((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                      (fam3.mat_id == "0") & (fam3.is_female == "0") &
                      (fam3.pheno != "0") & (fam3.pheno != "NA"))
        self.assertTrue(fam3.all(fam3_quant))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = load_bim(out4)

        bim4_custom = (bim4.varid == "hello") & (bim4.cm_position == "100.0")
        self.assertTrue(bim4.all(bim4_custom))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
        not_hom_ref = ~ds_all_hom_ref.GT.is_hom_ref()
        nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(not_hom_ref))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        fam_space_error = "has spaces in the following values:"
        with self.assertRaisesRegex(TypeError, fam_space_error):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        varid_space_error = "no white space allowed:"
        with self.assertRaisesRegex(FatalError, varid_space_error):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
    def test_linear_mixed_model_fastlmm(self):
        """Compare ``LinearMixedModel`` fits with hard-coded FastLMM reference values.

        Chromosome 1 is checked for h2, its standard error, betas and p-values,
        using both the full-rank and the low-rank model construction.
        Chromosome 3 is checked for h2 and its standard error only.
        """
        # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosome 1,3 and samples 0-124,375-499 (2000 variants and 250 samples)
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        # Covariate matrix: a column of ones plus the covariate from x_table.
        x = np.array([np.ones(n), mt.x.collect()]).T
        y = np.array(mt.y.collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # out x from each SNP first and does an F(1, dof)-test on (beta / se)^2
        # (t-test), whereas Hail does likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        # Eigendecompose the n x n matrix built from the standardized genotypes.
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        # Drop the first and last entries before locating the maximum and
        # checking that the remaining values sum to 1.
        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
        pa = p @ a

        # Refit with gamma pinned to the FastLMM value before testing variants.
        model.fit(log_gamma=np.log(gamma_fastlmm))

        res = model.fit_alternatives_numpy(pa).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # Same check through the path-based API, reading pa.T from a written BlockMatrix.
        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
        # Keep only eigenvectors whose eigenvalue exceeds 1e-10.
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        # The low-rank model is also given the untransformed y and x.
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        l = g_std.T @ g_std
        sl, v = np.linalg.eigh(l)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
Exemple #41
0
def main(args):
    """Run whichever PCA data-preparation stages are enabled on ``args``.

    Each stage is gated by its own flag:

    * ``load_ref``: import the reference PLINK fileset and write it as a
      MatrixTable.
    * ``load_ukbb``: restrict UKBB to reference sites and the reference to
      UKBB sites, writing both results.
    * ``global_pca``: call ``run_pca`` on the intersected reference, then
      ``project_individuals``.
    * ``continental_pca``, ``ukbb_pop_pca``, ``ukbb_pop_noref``: placeholders
      that currently do nothing.

    ``args.overwrite`` is passed as the second positional argument to every
    ``write`` call; ``args.pop`` is spliced into several bucket paths.
    """
    if args.load_ref:
        ref = hl.import_plink(bed='gs://ukb-diverse-pops/pca/data/' +
                              args.pop + 'HGDP_1kG_maf005_geno05.bed',
                              bim='gs://ukb-diverse-pops/pca/data/' +
                              args.pop + 'HGDP_1kG_maf005_geno05.bim',
                              fam='gs://ukb-diverse-pops/pca/data/' +
                              args.pop + 'HGDP_1kG_maf005_geno05.fam',
                              min_partitions=100)
        ref.describe()

        print('sites in ref data: ' + str(ref.count()))  # (639590, 3547)
        # NOTE(review): this output path omits args.pop, while load_ukbb below
        # reads the .mt from a path that includes it. Confirm the two agree
        # when args.pop is non-empty.
        ref.write('gs://ukb-diverse-pops/pca/data/HGDP_1kG_maf005_geno05.mt',
                  args.overwrite)

    if args.load_ukbb:
        ref = hl.read_matrix_table('gs://ukb-diverse-pops/pca/data/' +
                                   args.pop + 'HGDP_1kG_maf005_geno05.mt')
        samples = hl.read_table(
            'gs://armartin/pigmentation/pigmentation_phenos_covs_pops.ht')
        ukbb = hl.read_matrix_table(
            'gs://phenotype_31063/hail/genotype/ukb31063.genotype.mt')
        # Attach the per-sample fields from `samples` to the UKBB columns.
        ukbb = ukbb.annotate_cols(**samples[ukbb.s])

        # filter ukbb to sites in ref & array data
        ukbb_in_ref = ukbb.filter_rows(hl.is_defined(ref.rows()[ukbb.row_key]))
        print('sites, inds in ref and UKBB data: ' +
              str(ukbb_in_ref.count()))  # (64233, 488377)

        ukbb_in_ref.write(
            'gs://ukb-diverse-pops/pca/data/' + args.pop + 'ukbb_globalref.mt',
            args.overwrite)

        # filter ref to ukbb sites
        ref_in_ukbb = ref.filter_rows(hl.is_defined(ukbb.rows()[ref.row_key]))
        # Same message text as the UKBB count above; this one reports the
        # reference restricted to UKBB sites.
        print('sites, inds in ref and UKBB data: ' +
              str(ref_in_ukbb.count()))  # (64233, 3547)
        ref_in_ukbb.write(
            'gs://ukb-diverse-pops/pca/data/globalref_ukbb_intersect.mt',
            args.overwrite)

        # filter ukbb to unrel individuals
        # ukbb_in_ref_unrel = ukbb_in_ref.filter_cols(ukbb_in_ref.covariates['used_in_pca_calculation'])
    if args.global_pca:
        """
        Compute PCA in global reference panel, project UKBB individuals into PCA space
        """
        ref_in_ukbb = hl.read_matrix_table(
            'gs://ukb-diverse-pops/pca/data/globalref_ukbb_intersect.mt')
        print('Computing reference PCs')
        run_pca(ref_in_ukbb, 'gs://ukb-diverse-pops/pca/data/globalref_ukbb_')

        # project ukbb
        project_individuals('gs://ukb-diverse-pops/pca/data/globalref_ukbb_',
                            'gs://ukb-diverse-pops/pca/data/ukbb_globalref')

    # The remaining stages only carry a description of the intended steps.
    if args.continental_pca:
        """
        Compute PCA within reference panel super pops, project UKBB individuals into PCA space
        1. Filter UKBB to individuals in continental population
        2. Run PCA on continental ref
        3. Project UKBB inds
        """
        pass

    if args.ukbb_pop_pca:
        """
        Compute PCA in each UKBB population (unrelateds), project reference individuals and relateds into PCA space
        1. Filter UKBB to individuals in continental population
        2. Run PC-relate on these individuals
        3. Filter UKBB population to unrelated individuals
        4. Run PCA on UKBB population 
        5. Project reference panel 
        """
        pass

    if args.ukbb_pop_noref:
        """
        Compute PCA in UKBB population (unrelateds), without reference individuals
            Denser SNP set for more precise PC calculation
            These will be used as covariates
        """
        pass
Exemple #42
0
    def test_ld_score(self):
        """Check univariate and annotated LD scores on the ``ldsc`` doctest data.

        Scores at 20:82079 and 22:16894090, and the mean scores, are compared
        with fixed reference values to three decimal places.
        """

        # Annotation table keyed by locus, supplying `binary` and `continuous`.
        ht = hl.import_table(doctest_resource('ldsc.annot'),
                             types={'BP': hl.tint,
                                    'CM': hl.tfloat,
                                    'binary': hl.tint,
                                    'continuous': hl.tfloat})
        ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
        ht = ht.key_by('locus')

        mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                             bim=doctest_resource('ldsc.bim'),
                             fam=doctest_resource('ldsc.fam'))
        mt = mt.annotate_rows(binary=ht[mt.locus].binary,
                              continuous=ht[mt.locus].continuous)

        # No annotation expressions: the result exposes a `univariate` field.
        ht_univariate = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position)

        # Same call with two annotations: the result exposes `binary` and `continuous`.
        ht_annotated = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position,
            annotation_exprs=[mt.binary,
                              mt.continuous])

        # One aggregation pass: the score at each probe locus plus the mean.
        univariate = ht_univariate.aggregate(hl.struct(
            chr20=hl.agg.filter(
                (ht_univariate.locus.contig == '20') &
                (ht_univariate.locus.position == 82079),
                hl.agg.collect(ht_univariate.univariate))[0],
            chr22 =hl.agg.filter(
                (ht_univariate.locus.contig == '22') &
                (ht_univariate.locus.position == 16894090),
                hl.agg.collect(ht_univariate.univariate))[0],
            mean=hl.agg.mean(ht_univariate.univariate)))

        self.assertAlmostEqual(univariate.chr20, 1.601, places=3)
        self.assertAlmostEqual(univariate.chr22, 1.140, places=3)
        self.assertAlmostEqual(univariate.mean, 3.507, places=3)

        # Same probes for the annotated scores, grouped per locus.
        annotated = ht_annotated.aggregate(
            hl.struct(
                chr20=hl.struct(binary=hl.agg.filter(
                    (ht_annotated.locus.contig == '20') &
                    (ht_annotated.locus.position == 82079),
                    hl.agg.collect(ht_annotated.binary))[0],
                                continuous=hl.agg.filter(
                                    (ht_annotated.locus.contig == '20') &
                                    (ht_annotated.locus.position == 82079),
                                    hl.agg.collect(ht_annotated.continuous))[0]),
                chr22=hl.struct(
                    binary=hl.agg.filter(
                        (ht_annotated.locus.contig == '22') &
                        (ht_annotated.locus.position == 16894090),
                        hl.agg.collect(ht_annotated.binary))[0],
                    continuous=hl.agg.filter(
                        (ht_annotated.locus.contig == '22') &
                        (ht_annotated.locus.position == 16894090),
                        hl.agg.collect(ht_annotated.continuous))[0]),
                mean_stats=hl.struct(binary=hl.agg.mean(ht_annotated.binary),
                                     continuous=hl.agg.mean(ht_annotated.continuous))))

        self.assertAlmostEqual(annotated.chr20.binary, 1.152, places=3)
        self.assertAlmostEqual(annotated.chr20.continuous, 73.014, places=3)
        self.assertAlmostEqual(annotated.chr22.binary, 1.107, places=3)
        self.assertAlmostEqual(annotated.chr22.continuous, 102.174, places=3)
        self.assertAlmostEqual(annotated.mean_stats.binary, 0.965, places=3)
        self.assertAlmostEqual(annotated.mean_stats.continuous, 176.528, places=3)
Exemple #43
0
    def test_ld_score(self):
        """Check LD scores computed from standardized genotypes on the ``ldsc`` data.

        A single ``ld_score`` call produces ``univariate``, ``binary`` and
        ``continuous`` scores; values at 20:82079 and 22:16894090 and the
        means are compared with fixed reference values to three decimals.
        """

        ht = hl.import_table(doctest_resource('ldsc.annot'),
                             types={
                                 'BP': hl.tint,
                                 'CM': hl.tfloat,
                                 'univariate': hl.tfloat,
                                 'binary': hl.tfloat,
                                 'continuous': hl.tfloat
                             })
        ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
        ht = ht.key_by('locus')

        mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                             bim=doctest_resource('ldsc.bim'),
                             fam=doctest_resource('ldsc.fam'))
        # Per-variant statistics of the alt-allele count, used to standardize below.
        mt = mt.annotate_rows(stats=hl.agg.stats(mt.GT.n_alt_alleles()))
        # `univariate` is the constant annotation 1; the other two come from the table.
        mt = mt.annotate_rows(univariate=1,
                              binary=ht[mt.locus].binary,
                              continuous=ht[mt.locus].continuous)

        # Standardize each entry; missing results are replaced with 0.0.
        mt = mt.annotate_entries(
            GT_std=hl.or_else((mt.GT.n_alt_alleles() - mt.stats.mean) /
                              mt.stats.stdev, 0.0))

        ht_scores = hl.experimental.ld_score(
            entry_expr=mt.GT_std,
            annotation_exprs=[mt.univariate, mt.binary, mt.continuous],
            position_expr=mt.cm_position,
            window_size=1)

        # Scores at the probe locus 20:82079.
        chr20_firsts = ht_scores.aggregate(
            hl.struct(univariate=hl.agg.collect(
                hl.agg.filter((ht_scores.locus.contig == '20') &
                              (ht_scores.locus.position == 82079),
                              ht_scores.univariate))[0],
                      binary=hl.agg.collect(
                          hl.agg.filter((ht_scores.locus.contig == '20') &
                                        (ht_scores.locus.position == 82079),
                                        ht_scores.binary))[0],
                      continuous=hl.agg.collect(
                          hl.agg.filter((ht_scores.locus.contig == '20') &
                                        (ht_scores.locus.position == 82079),
                                        ht_scores.continuous))[0]))

        self.assertAlmostEqual(chr20_firsts.univariate, 1.601, places=3)
        self.assertAlmostEqual(chr20_firsts.binary, 1.152, places=3)
        self.assertAlmostEqual(chr20_firsts.continuous, 73.014, places=3)

        # Scores at the probe locus 22:16894090.
        chr22_firsts = ht_scores.aggregate(
            hl.struct(univariate=hl.agg.collect(
                hl.agg.filter((ht_scores.locus.contig == '22') &
                              (ht_scores.locus.position == 16894090),
                              ht_scores.univariate))[0],
                      binary=hl.agg.collect(
                          hl.agg.filter((ht_scores.locus.contig == '22') &
                                        (ht_scores.locus.position == 16894090),
                                        ht_scores.binary))[0],
                      continuous=hl.agg.collect(
                          hl.agg.filter((ht_scores.locus.contig == '22') &
                                        (ht_scores.locus.position == 16894090),
                                        ht_scores.continuous))[0]))

        self.assertAlmostEqual(chr22_firsts.univariate, 1.140, places=3)
        self.assertAlmostEqual(chr22_firsts.binary, 1.107, places=3)
        self.assertAlmostEqual(chr22_firsts.continuous, 102.174, places=3)

        # Mean of each score over all rows.
        means = ht_scores.aggregate(
            hl.struct(univariate=hl.agg.mean(ht_scores.univariate),
                      binary=hl.agg.mean(ht_scores.binary),
                      continuous=hl.agg.mean(ht_scores.continuous)))

        self.assertAlmostEqual(means.univariate, 3.507, places=3)
        self.assertAlmostEqual(means.binary, 0.965, places=3)
        self.assertAlmostEqual(means.continuous, 176.528, places=3)