Example #1
    def test_linear_mixed_regression_full_rank(self):
        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        h2_fastlmm = 0.142761
        h2_places = 6
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

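        # The variance components are fit on a kinship built from chromosome 1;
        # association is then tested on standardized chromosome 3 genotypes below.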
        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x], z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
        model.fit()
        self.assertAlmostEqual(model.h_sq, h2_fastlmm, places=h2_places)

        mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
        mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
        ht = hl.linear_mixed_regression_rows((mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
                                             model)
        assert np.allclose(ht.beta.collect(), beta_fastlmm)
        assert np.allclose(ht.p_value.collect(), pval_hail)
Example #2
    def test_import_keyby_count_ldsc_lowered_shuffle(self):
        # integration test pulled out of test_ld_score_regression to isolate issues with lowered shuffles
        # and RDD serialization, 2021-07-06
        # if this comment no longer reflects the backend system, that's a really good thing
        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP',
            types={
                'L2': hl.tfloat,
                'BP': hl.tint
            })

        ht_20160 = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP',
            types={
                'N': hl.tint,
                'Z': hl.tfloat
            })

        j1 = ht_scores[ht_20160['SNP']]
        ht_20160 = ht_20160.annotate(ld_score=j1['L2'],
                                     locus=hl.locus(j1['CHR'], j1['BP']),
                                     alleles=hl.array(
                                         [ht_20160['A2'], ht_20160['A1']]))

        ht_20160 = ht_20160.key_by(ht_20160['locus'], ht_20160['alleles'])
        assert ht_20160._force_count() == 151
Example #3
    def test_linear_mixed_regression_low_rank(self):
        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        h2_hail = 0.10001626
        beta_hail = [0.0073201542, 0.039969148, -0.036727875, 0.29852363, -0.049212500]
        pval_hail = [0.90685162, 0.54839177, 0.55001054, 9.85247263e-07, 0.42796507]

        mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
        model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x], z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
        model.fit()
        self.assertTrue(model.low_rank)
        self.assertAlmostEqual(model.h_sq, h2_hail)

        mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
        mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()))
        ht = hl.linear_mixed_regression_rows((mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
                                             model)
        assert np.allclose(ht.beta.collect(), beta_hail)
        assert np.allclose(ht.p_value.collect(), pval_hail)
Example #4
def table_import_ints(tsv):
    hl.import_table(tsv,
                    types={
                        'idx': 'int',
                        **{f'i{i}': 'int'
                           for i in range(5)},
                        **{f'array{i}': 'array<int>'
                           for i in range(2)}
                    })._force_count()
Example #5
def get_hq_samples():
    ht = hl.import_table(f'{bucket}/misc/ukb31063_samples_qc_FULL.txt',
                         no_header=True)
    drop_samples = hl.import_table(
        f'{bucket}/misc/ukb31063.withdrawn_samples_20190321.txt',
        no_header=True,
        key='f0')
    ht = ht.key_by(s=ht.f0).drop('f0')
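    # Anti-join: keep only samples with no entry in the withdrawal table.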
    return ht.filter(hl.is_missing(drop_samples[ht.s]))
Example #6
def table_import_ints():
    hl.import_table(resource('many_ints_table.tsv.bgz'),
                    types={
                        'idx': 'int',
                        **{f'i{i}': 'int'
                           for i in range(5)},
                        **{f'array{i}': 'array<int>'
                           for i in range(2)}
                    })._force_count()
Example #7
def gwas_on_gcta_splits(min_id, max_id, parsplit, paridx):
    rep_ids = range(
        min_id + paridx - 1, max_id + 1, parsplit
    )  # replicate "IDs", which were used as seeds to generate the random split
    ids = hl.import_table('gs://nbaya/split/gcta/gcta_20k.grm.id',
                          no_header=True)  # GRM IDs
    ids = ids.rename({'f0': 'FID', 'f1': 'IID'})
    ids = set(ids.IID.collect())
    mt0 = hl.read_matrix_table('gs://nbaya/ldscsim/hm3.50_sim_h2_0.08.mt/')
    mt1 = mt0.filter_cols(hl.literal(ids).contains(mt0.s))
    for i in rep_ids:
        try:
            subprocess.check_output([
                'gsutil', 'ls',
                f'gs://nbaya/split/gcta/20k_sumstats.y1.s{i}.tsv.bgz'
            ])
            y1_complete = True
        except subprocess.CalledProcessError:
            y1_complete = False
        try:
            subprocess.check_output([
                'gsutil', 'ls',
                f'gs://nbaya/split/gcta/20k_sumstats.y2.s{i}.tsv.bgz'
            ])
            y2_complete = True
        except subprocess.CalledProcessError:
            y2_complete = False
        if not (y1_complete and y2_complete):
            phen = hl.import_table(f'gs://nbaya/split/gcta/gcta_20k.s{i}.phen',
                                   types={
                                       'y1': hl.tfloat64,
                                       'y2': hl.tfloat64
                                   },
                                   key='IID')
            mt = mt1.annotate_cols(y1=phen[mt1.s].y1, y2=phen[mt1.s].y2)
            if not y1_complete:
                print(
                    f'\r##########\nRunning GWAS for y1 replicate {i} \n##########'
                )
                gwas(mt=mt,
                     x=mt.dosage,
                     y=mt.y1,
                     is_std_cov_list=True,
                     path_to_save=
                     f'gs://nbaya/split/gcta/20k_sumstats.y1.s{i}.tsv.bgz')
            if not y2_complete:
                print(
                    f'\r##########\nRunning GWAS for y2 replicate {i} \n##########'
                )
                gwas(mt=mt,
                     x=mt.dosage,
                     y=mt.y2,
                     is_std_cov_list=True,
                     path_to_save=
                     f'gs://nbaya/split/gcta/20k_sumstats.y2.s{i}.tsv.bgz')
        else:
            print(
                f'\r##########\n GWAS already complete for y1 and y2 for replicate {i} \n##########'
            )
Example #8
def download_data():
    global _data_dir, _mt
    _data_dir = os.environ.get('HAIL_BENCHMARK_DIR',
                               '/tmp/hail_benchmark_data')
    print(f'using benchmark data directory {_data_dir}')
    os.makedirs(_data_dir, exist_ok=True)

    files = map(lambda f: os.path.join(_data_dir, f), [
        'profile.vcf.bgz', 'profile.mt', 'table_10M_par_1000.ht',
        'table_10M_par_100.ht', 'table_10M_par_10.ht',
        'gnomad_dp_simulation.mt', 'many_strings_table.ht'
    ])
    if not all(os.path.exists(file) for file in files):
        hl.init()  # use all cores

        vcf = os.path.join(_data_dir, 'profile.vcf.bgz')
        print('files not found - downloading...', end='', flush=True)
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/profile.vcf.bgz',
            vcf)
        print('done', flush=True)
        print('importing...', end='', flush=True)
        hl.import_vcf(vcf, min_partitions=16).write(os.path.join(
            _data_dir, 'profile.mt'),
                                                    overwrite=True)

        ht = hl.utils.range_table(
            10_000_000,
            1000).annotate(**{f'f_{i}': hl.rand_unif(0, 1)
                              for i in range(5)})
        ht = ht.checkpoint(os.path.join(_data_dir, 'table_10M_par_1000.ht'),
                           overwrite=True)
        ht = ht.naive_coalesce(100).checkpoint(os.path.join(
            _data_dir, 'table_10M_par_100.ht'),
                                               overwrite=True)
        ht.naive_coalesce(10).write(os.path.join(_data_dir,
                                                 'table_10M_par_10.ht'),
                                    overwrite=True)

        mt = hl.utils.range_matrix_table(n_rows=250_000,
                                         n_cols=1_000,
                                         n_partitions=32)
        mt = mt.annotate_entries(x=hl.int(hl.rand_unif(0, 4.5)**3))
        mt.write(os.path.join(_data_dir, 'gnomad_dp_simulation.mt'),
                 overwrite=True)

        print('downloading many strings table...')
        mst_tsv = os.path.join(_data_dir, 'many_strings_table.tsv.bgz')
        mst_ht = os.path.join(_data_dir, 'many_strings_table.ht')
        urlretrieve(
            'https://storage.googleapis.com/hail-common/benchmark/many_strings_table.tsv.bgz',
            mst_tsv)
        print('importing...')
        hl.import_table(mst_tsv).write(mst_ht, overwrite=True)
        hl.stop()
    else:
        print('all files found.', flush=True)
Example #9
def load_prescription_data(prescription_data_tsv_path: str, prescription_mapping_tsv_path: str):
    ht = hl.import_table(prescription_data_tsv_path, types={'eid': hl.tint, 'data_provider': hl.tint}, key='eid')
    mapping_ht = hl.import_table(prescription_mapping_tsv_path, impute=True, key='Original_Prescription')
    ht = ht.annotate(issue_date=hl.cond(hl.len(ht.issue_date) == 0, hl.null(hl.tint64),
                                        hl.experimental.strptime(ht.issue_date + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')),
                     **mapping_ht[ht.drug_name])
    ht = ht.filter(ht.Generic_Name != '').key_by('eid', 'Generic_Name', 'Drug_Category_and_Indication').collect_by_key()
    ht = ht.annotate(values=hl.sorted(ht.values, key=lambda x: x.issue_date))
    return ht.to_matrix_table(row_key=['eid'], col_key=['Generic_Name'], col_fields=['Drug_Category_and_Indication'])
Example #10
    def _create(self, resource_dir):
        download(resource_dir, 'many_ints_table.tsv.bgz')
        logging.info('importing many_ints_table.tsv.bgz...')
        hl.import_table(os.path.join(resource_dir, 'many_ints_table.tsv.bgz'),
                        types={'idx': 'int',
                               **{f'i{i}': 'int' for i in range(5)},
                               **{f'array{i}': 'array<int>' for i in range(2)}}) \
            .write(os.path.join(resource_dir, 'many_ints_table.ht'), overwrite=True)
        logging.info('done importing many_ints_table.tsv.bgz.')
Example #11
def prepare_gtex_expression_data(transcript_tpms_path, sample_annotations_path,
                                 tmp_path):
    # Recompress tpms file with block gzip so that import_matrix_table will read the file
    ds = hl.import_table(transcript_tpms_path, force=True)
    tmp_transcript_tpms_path = tmp_path + "/" + transcript_tpms_path.split(
        "/")[-1].replace(".gz", ".bgz")
    ds.export(tmp_transcript_tpms_path)

    # Import data
    ds = hl.import_matrix_table(
        tmp_transcript_tpms_path,
        row_fields={
            "transcript_id": hl.tstr,
            "gene_id": hl.tstr
        },
        entry_type=hl.tfloat,
    )
    ds = ds.rename({"col_id": "sample_id"})
    ds = ds.repartition(1000, shuffle=True)

    samples = hl.import_table(sample_annotations_path, key="SAMPID")

    # Separate version numbers from transcript and gene IDs
    ds = ds.annotate_rows(
        transcript_id=ds.transcript_id.split(r"\.")[0],
        transcript_version=hl.int(ds.transcript_id.split(r"\.")[1]),
        gene_id=ds.gene_id.split(r"\.")[0],
        gene_version=hl.int(ds.gene_id.split(r"\.")[1]),
    )

    # Annotate columns with the tissue the sample came from
    ds = ds.annotate_cols(tissue=samples[ds.sample_id].SMTSD)

    # Collect expression into median across all samples in each tissue
    ds = ds.group_cols_by(ds.tissue).aggregate(**{
        "": hl.agg.approx_median(ds.x)
    }).make_table()

    # Format tissue names
    other_fields = {
        "transcript_id", "transcript_version", "gene_id", "gene_version"
    }
    tissues = [f for f in ds.row_value.dtype.fields if f not in other_fields]
    ds = ds.transmute(tissues=hl.struct(
        **{format_tissue_name(tissue): ds[tissue]
           for tissue in tissues}))

    ds = ds.key_by("transcript_id").drop("row_id")

    return ds
Example #12
def remap_samples(
    original_mt_path: str,
    input_mt: hl.MatrixTable,
    pedigree: hl.Table,
    inferred_sex: str,
) -> Tuple[hl.MatrixTable, hl.Table]:
    """
    Rename `s` col in the MatrixTable and inferred sex ht.

    :param original_mt_path: Path to original MatrixTable location
    :param input_mt: MatrixTable 
    :param pedigree: Pedigree file from seqr loaded as a Hail Table
    :param inferred_sex: Path to text file of inferred sexes
    :return: mt and sex ht with sample names remapped
    """
    base_path = "/".join(
        dirname(original_mt_path).split("/")[:-1]) + "/base/projects"
    project_list = list(set(pedigree.Project_GUID.collect()))

    # Get the list of hts containing sample remapping information for each project
    remap_hts = []
    sex_ht = hl.import_table(inferred_sex)

    for i in project_list:
        remap = f"{base_path}/{i}/{i}_remap.tsv"
        if hl.hadoop_is_file(remap):
            remap_ht = hl.import_table(remap)
            remap_ht = remap_ht.key_by("s", "seqr_id")
            remap_hts.append(remap_ht)

    logger.info("Found %d projects that need to be remapped.", len(remap_hts))

    if len(remap_hts) > 0:
        ht = remap_hts[0]
        for next_ht in remap_hts[1:]:
            ht = ht.join(next_ht, how="outer")

        # If a sample has a non-missing value for seqr_id, rename it to the sample name for the mt and sex ht
        ht = ht.key_by("s")
        input_mt = input_mt.annotate_cols(seqr_id=ht[input_mt.s].seqr_id)
        input_mt = input_mt.key_cols_by(s=hl.if_else(
            hl.is_missing(input_mt.seqr_id), input_mt.s, input_mt.seqr_id))

        sex_ht = sex_ht.annotate(seqr_id=ht[sex_ht.s].seqr_id).key_by("s")
        sex_ht = sex_ht.key_by(s=hl.if_else(hl.is_missing(sex_ht.seqr_id),
                                            sex_ht.s, sex_ht.seqr_id))
    else:
        sex_ht = sex_ht.key_by("s")

    return input_mt, sex_ht
Example #13
def get_all_sample_metadata(
    mt: hl.MatrixTable, build: int, data_type: str, data_source: str, version: int
) -> hl.Table:
    """
    Annotate MatrixTable with all current metadata: sample sequencing metrics, sample ID mapping,
    and callrate for bi-allelic, high-callrate common SNPs.
    :param MatrixTable mt: VCF converted to a MatrixTable
    :param int build: build for write, 37 or 38
    :param str data_type: WGS or WES for write path and flagging metrics
    :param str data_source: internal or external for write path
    :param int version: Int for write path
    :return: Table with seq metrics and mapping
    :rtype: Table
    """
    logger.info("Importing and annotating with sequencing metrics...")
    meta_ht = hl.import_table(
        seq_metrics_path(build, data_type, data_source, version), impute=True
    ).key_by("SAMPLE")

    logger.info("Importing and annotating seqr ID names...")
    remap_ht = hl.import_table(
        remap_path(build, data_type, data_source, version), impute=True
    ).key_by("s")
    meta_ht = meta_ht.annotate(**remap_ht[meta_ht.key])
    meta_ht = meta_ht.annotate(
        seqr_id=hl.if_else(
            hl.is_missing(meta_ht.seqr_id), meta_ht.SAMPLE, meta_ht.seqr_id
        )
    )

    logger.info(
        "Filtering to bi-allelic, high-callrate, common SNPs to calculate callrate..."
    )
    mt = filter_rows_for_qc(
        mt,
        min_af=0.001,
        min_callrate=0.99,
        bi_allelic_only=True,
        snv_only=True,
        apply_hard_filters=False,
        min_inbreeding_coeff_threshold=None,
        min_hardy_weinberg_threshold=None,
    )
    callrate_ht = mt.select_cols(
        filtered_callrate=hl.agg.fraction(hl.is_defined(mt.GT))
    ).cols()
    meta_ht = meta_ht.annotate(**callrate_ht[meta_ht.key])
    return meta_ht
Example #14
def get_gnomad_data(data_type: str, adj: bool = False, split: bool = True, raw: bool = False,
                    non_refs_only: bool = False, hail_version: str = CURRENT_HAIL_VERSION,
                    meta_version: str = None, meta_root: Optional[str] = 'meta', full_meta: bool = False,
                    fam_version: str = CURRENT_FAM, fam_root: str = None, duplicate_mapping_root: str = None,
                    release_samples: bool = False, release_annotations: str = None) -> hl.MatrixTable:
    """
    Wrapper function to get gnomAD data as a MatrixTable. By default, returns split hardcalls (with adj annotated but not filtered).

    :param str data_type: One of `exomes` or `genomes`
    :param bool adj: Whether the returned data should be filtered to adj genotypes
    :param bool split: Whether the dataset should be split (only applies to raw=False)
    :param bool raw: Whether to return the raw (10T+) data (not recommended: unsplit, and no special consideration on sex chromosomes)
    :param bool non_refs_only: Whether to return the non-ref-genotype only MT (warning: no special consideration on sex chromosomes)
    :param str hail_version: One of the HAIL_VERSIONs
    :param str meta_version: Version of metadata (None for current)
    :param str meta_root: Where to put metadata. Set to None if no metadata is desired.
    :param bool full_meta: Whether to add all metadata (warning: large)
    :param str fam_version: Version of the fam file (default to current)
    :param str fam_root: Where to put the pedigree information. Set to None if no pedigree information is desired.
    :param str duplicate_mapping_root: Where to put the duplicate genome/exome samples ID mapping (default is None -- do not annotate)
    :param bool release_samples: When set, filters the data to release samples only
    :param str release_annotations: One of the RELEASES to add variant annotations (into va), or None for no data
    :return: gnomAD hardcalls dataset with chosen annotations
    :rtype: MatrixTable
    """
    from gnomad_hail.utils import filter_to_adj

    if raw and split:
        raise DataException('No split raw data. Use of hardcalls is recommended.')

    if non_refs_only:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, split=split, non_refs_only=non_refs_only, hail_version=hail_version))
    else:
        mt = hl.read_matrix_table(get_gnomad_data_path(data_type, hardcalls=not raw, split=split, hail_version=hail_version))

    if adj:
        mt = filter_to_adj(mt)

    if meta_root:
        meta_ht = get_gnomad_meta(data_type, meta_version, full_meta=full_meta)
        mt = mt.annotate_cols(**{meta_root: meta_ht[mt.s]})

    if duplicate_mapping_root:
        dup_ht = hl.import_table(genomes_exomes_duplicate_ids_tsv_path, impute=True,
                                 key='exome_id' if data_type == "exomes" else 'genome_id')
        mt = mt.annotate_cols(**{duplicate_mapping_root: dup_ht[mt.s]})

    if fam_root:
        fam_ht = hl.import_fam(fam_path(data_type, fam_version))
        mt = mt.annotate_cols(**{fam_root: fam_ht[mt.s]})

    if release_samples:
        mt = mt.filter_cols(mt.meta.release)

    if release_annotations:
        sites_ht = get_gnomad_public_data(data_type, split)
        mt = mt.select_rows(**sites_ht[mt.row_key])
        mt = mt.select_globals(**sites_ht.index_globals())

    return mt
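A hedged usage sketch for the wrapper above; the data type and flags shown are illustrative assumptions, not a prescribed call:

    mt = get_gnomad_data('exomes', adj=True, meta_root='meta', release_samples=True)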
Example #15
def specific_clumps(filename):
    clump = hl.import_table(filename, delimiter=r'\s+', min_partitions=10, types={'P': hl.tfloat})
    clump_dict = clump.aggregate(hl.dict(hl.agg.collect(
        (hl.locus(hl.str(clump.CHR), hl.int(clump.BP)), True)
    )), _localize=False)
    return clump_dict
Example #16
def main():
    vshl.init()
    print("Version info:")
    for (k, v) in vshl.version_info().items():
        print("%s=%s" % (k, v))

    data = hl.import_vcf(os.path.join(PROJECT_DIR, 'data/chr22_1000_GRCh38.vcf'),
                         reference_genome='GRCh38')
    labels = hl.import_table(os.path.join(PROJECT_DIR, 'data/chr22-labels-hail.csv'), delimiter=',',
                             types={'x22_16050408': 'float64'}).key_by('sample')

    mt = data.annotate_cols(hipster=labels[data.s])
    print(mt.count())

    rf_model = vshl.random_forest_model(y=mt.hipster.x22_16050408,
                                        x=mt.GT.n_alt_alleles(), seed=13, mtry_fraction=0.05,
                                        min_node_size=5, max_depth=10)
    rf_model.fit_trees(100, 50)

    print("OOB error: %s" % rf_model.oob_error())
    impTable = rf_model.variable_importance()
    impTable.show(3)

    rf_model.to_json(os.path.join(PROJECT_DIR, "target/chr22_1000_GRCh38-model.json"), True)

    rf_model.release()
Example #17
def prepare_gene_results(gene_results_url, genes_url=None):
    ds = hl.import_table(
        gene_results_url,
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.rename({"description": "gene_description"})

    if genes_url:
        genes = hl.read_table(genes_url)
        genes = genes.key_by("gene_id")
        ds = ds.annotate(chrom=genes[ds.gene_id].chrom,
                         pos=genes[ds.gene_id].start)

    return ds
Example #18
def import_SJ_out_tab(path):
    """
    column 1: chromosome
    column 2: first base of the intron (1-based)
    column 3: last base of the intron (1-based)
    column 4: strand (0: undefined, 1: +, 2: -)
    column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5:AT/AC, 6: GT/AT
    column 6: 0: unannotated, 1: annotated (only if splice junctions database is used)
    column 7: number of uniquely mapping reads crossing the junction
    column 8: number of multi-mapping reads crossing the junction
    column 9: maximum spliced alignment overhang
    """

    ht = hl.import_table(
        path,
        no_header=True,
        impute=True,
        force=True,
    ).rename({
        "f0": "chrom",
        "f1": "start_1based",
        "f2": "end_1based",
        "f3": "strand",
        "f4": "intron_motif",
        "f5": "known_splice_junction",
        "f6": "unique_reads",
        "f7": "multi_mapped_reads",
        "f8": "maximum_overhang",
    })

    return ht
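A minimal usage sketch for the importer above (the file path and the per-chromosome summary are illustrative assumptions):

    ht = import_SJ_out_tab('sample.SJ.out.tab')  # hypothetical path
    # Sum uniquely-mapped junction reads per chromosome.
    per_chrom = ht.group_by(ht.chrom).aggregate(unique_reads=hl.agg.sum(ht.unique_reads))
    per_chrom.show()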
Example #19
def compute_prs_mt(genotype_mt_path, prs_mt_path):
    scratch_dir = 'gs://ukbb-diverse-temp-30day/nb-scratch'

    clumped = hl.read_table(
        'gs://ukb-diverse-pops/ld_prune/results_high_quality/not_AMR/phecode-250.2-both_sexes/clump_results.ht/'
    )
    sumstats = hl.import_table(
        'gs://ukb-diverse-pops/sumstats_flat_files/phecode-250.2-both_sexes.tsv.bgz',
        impute=True)
    sumstats = sumstats.annotate(locus=hl.locus(sumstats.chr, sumstats.pos),
                                 alleles=hl.array([sumstats.ref,
                                                   sumstats.alt]))
    sumstats = sumstats.key_by('locus', 'alleles')
    sumstats.describe()
    #    mt = hl.read_matrix_table(genotype_mt_path) # read genotype mt subset

    # get full genotype mt
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    mt = get_filtered_mt_with_x()
    mt = mt.filter_rows(hl.is_defined(meta_mt.rows()[mt.row_key]))
    mt = mt.select_entries('dosage')
    mt = mt.select_rows()
    mt = mt.select_cols()

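    # Beta is the meta-analysis effect size for clump-index variants and 0 for all
    # other variants, so the dosage-weighted sum below is the polygenic score.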
    mt = mt.annotate_rows(beta=hl.if_else(hl.is_defined(clumped[mt.row_key]),
                                          sumstats[mt.row_key].beta_meta, 0))
    mt = mt.annotate_cols(score=hl.agg.sum(mt.beta * mt.dosage))
    mt_cols = mt.cols()
    mt_cols = mt_cols.repartition(1000)
    mt_cols.write(f'{scratch_dir}/prs_all_samples.ht')
Example #20
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
            .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                       alleles=[truth.REF, truth.ALT])
            .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
            .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.v)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #21
    def remap_sample_ids(mt, remap_path):
        """
        Remap the MatrixTable's sample ID, 's', field to the sample ID used within seqr, 'seqr_id'
        If the sample 's' does not have a 'seqr_id' in the remap file, 's' becomes 'seqr_id'
        :param mt: MatrixTable from VCF
        :param remap_path: Path to a file with two columns 's' and 'seqr_id'
        :return: MatrixTable remapped and keyed to use seqr_id
        """
        remap_ht = hl.import_table(remap_path, key='s')
        missing_samples = remap_ht.anti_join(mt.cols()).collect()
        remap_count = remap_ht.count()

        if len(missing_samples) != 0:
            raise MatrixTableSampleSetError(
                f'Only {remap_ht.semi_join(mt.cols()).count()} out of {remap_count} '
                'remap IDs matched IDs in the variant callset.\n'
                f'IDs that aren\'t in the callset: {missing_samples}\n'
                f'All callset sample IDs:{mt.s.collect()}', missing_samples)

        mt = mt.annotate_cols(**remap_ht[mt.s])
        remap_expr = hl.cond(hl.is_missing(mt.seqr_id), mt.s, mt.seqr_id)
        mt = mt.annotate_cols(seqr_id=remap_expr, vcf_id=mt.s)
        mt = mt.key_cols_by(s=mt.seqr_id)
        logger.info(f'Remapped {remap_count} sample ids...')
        return mt
Example #22
    def test_de_novo(self):
        mt = hl.import_vcf(resource('denovo.vcf'))
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
        ped = hl.Pedigree.read(resource('denovo.fam'))
        r = hl.de_novo(mt, ped, mt.info.ESP)
        r = r.select(
            prior=r.prior,
            kid_id=r.proband.s,
            dad_id=r.father.s,
            mom_id=r.mother.s,
            p_de_novo=r.p_de_novo,
            confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            confidence=truth['Validation_Likelihood'].split('_')[0]).key_by('locus', 'alleles', 'kid_id', 'dad_id',
                                                                            'mom_id')

        j = r.join(truth, how='outer')
        self.assertTrue(j.all((j.confidence == j.confidence_1) & (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
Example #23
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.v)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #24
def ref_filtering(ref_mt,
                  pass_mt,
                  unrel,
                  outliers,
                  pass_unrel_mt,
                  overwrite: bool = False):
    mt = hl.read_matrix_table(ref_mt)
    all_sample_filters = set(mt['sample_filters'])
    bad_sample_filters = {
        re.sub('fail_', '', x)
        for x in all_sample_filters if x.startswith('fail_')
    }
    mt_filt = mt.filter_cols(mt['sample_filters']['qc_metrics_filters'].
                             difference(bad_sample_filters).length() == 0)
    mt_filt = mt_filt.checkpoint(pass_mt,
                                 overwrite=False,
                                 _read_if_exists=True)

    mt_unrel = hl.read_matrix_table(unrel)

    mt_filt = mt_filt.filter_rows(
        mt_filt.filters.length() == 0)  # gnomAD QC pass variants
    mt_filt = mt_filt.filter_cols(hl.is_defined(
        mt_unrel.cols()[mt_filt.s]))  # only unrelated

    # remove outliers
    pca_outliers = hl.import_table(outliers).key_by('s')
    mt_filt = mt_filt.filter_cols(hl.is_missing(pca_outliers[mt_filt.s]))

    mt_filt.write(pass_unrel_mt, overwrite)
Example #26
def get_transcript_lof_metrics_ht() -> hl.Table:
    return hl.import_table(
        f"{nfs_dir}/resources/gnomad/gnomad.v2.1.1.lof_metrics.by_transcript.txt.bgz",
        min_partitions=100,
        impute=True,
        key='transcript'
    )
Example #27
def compute_ldscore(ht, bm_ld, n, radius, out_name, overwrite):
    r2 = bm_ld**2
    r2_adj = ((n - 1.0) / (n - 2.0)) * r2 - (1.0 / (n - 2.0))
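    # Equivalent to r2_adj = r2 - (1 - r2) / (n - 2), the usual LD score bias adjustment.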

    # This is required, as the squaring/multiplication densifies, so this re-sparsifies.
    starts_and_stops = hl.linalg.utils.locus_windows(ht.locus,
                                                     radius,
                                                     _localize=False)
    r2_adj = r2_adj._sparsify_row_intervals_expr(starts_and_stops,
                                                 blocks_only=False)
    r2_adj = r2_adj.sparsify_triangle()
    r2_adj = checkpoint_tmp(r2_adj)

    # Note that the original ld matrix is triangular
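    # Row sums plus column sums of the triangle cover the full symmetric matrix
    # but count the diagonal twice, hence the subtraction of r2_diag below.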
    l2row = checkpoint_tmp(r2_adj.sum(axis=0)).T
    l2col = checkpoint_tmp(r2_adj.sum(axis=1))
    r2_diag = checkpoint_tmp(r2_adj.diagonal()).T
    l2 = l2row + l2col - r2_diag
    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_gs_temp_path()

    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index().rename({'f0': 'ld_score'})
    ht_scores = ht_scores.key_by('idx')
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx]).drop('idx')
    ht = ht.checkpoint(out_name, overwrite)
    return ht
Example #28
def prepare_gene_results():
    ds = hl.import_table(
        pipeline_config.get("ASC", "gene_results_path"),
        missing="",
        types={
            "gene_name": hl.tstr,
            "gene_id": hl.tstr,
            "description": hl.tstr,
            "analysis_group": hl.tstr,
            "xcase_dn_ptv": hl.tint,
            "xcont_dn_ptv": hl.tint,
            "xcase_dn_misa": hl.tint,
            "xcont_dn_misa": hl.tint,
            "xcase_dn_misb": hl.tint,
            "xcont_dn_misb": hl.tint,
            "xcase_dbs_ptv": hl.tint,
            "xcont_dbs_ptv": hl.tint,
            "xcase_swe_ptv": hl.tint,
            "xcont_swe_ptv": hl.tint,
            "xcase_tut": hl.tint,
            "xcont_tut": hl.tint,
            "qval": hl.tfloat,
        },
    )

    ds = ds.drop("gene_name", "description")

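    # Collapse to one row per gene: a dict mapping each analysis group to that group's results.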
    ds = ds.group_by("gene_id").aggregate(
        group_results=hl.agg.collect(ds.row_value))
    ds = ds.annotate(group_results=hl.dict(
        ds.group_results.map(lambda group_result:
                             (group_result.analysis_group,
                              group_result.drop("analysis_group")))))

    return ds
Example #29
def get_gnomad_genomes_coverage_ht() -> hl.Table:
    return hl.import_table(
        f"{nfs_dir}/resources/gnomad/gnomad.genomes.r3.0.1.coverage.summary.tsv.bgz",
        min_partitions=1000,
        impute=True,
        key='locus'
    )
Example #30
def preprocess2(variant_set):
    print('\n##################')
    print(
        'Starting Pre-processing 2: Filtering variants table (variant_set: ' +
        variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')

    variants = hl.read_table(
        'gs://nbaya/split/' + variant_set +
        '_variants.ht')  # for hm3: import table of hapmap3_variants.tsv

    mt = hl.read_matrix_table(
        'gs://phenotype_31063/hail/imputed/ukb31063.dosage.autosomes.mt')
    mt = mt.filter_rows(hl.is_defined(
        variants[mt.locus, mt.alleles]))  #filter to variant_set variants

    covs = hl.import_table(
        'gs://phenotype_31063/ukb31063.gwas_covariates.both_sexes.tsv',
        key='s',
        impute=True,
        types={'s': hl.tstr})

    mt = mt.annotate_cols(**covs[mt.s])
    mt = mt.filter_cols(hl.is_defined(mt.PC1), keep=True)

    mt.write('gs://nbaya/split/ukb31063.' + variant_set +
             '_variants.gwas_samples_prerepart.mt')

    print('\n##################')
    print(
        'Finished Pre-processing 2: Filtering variants table (variant_set: ' +
        variant_set + ')')
    print('Time: {:%H:%M:%S (%Y-%b-%d)}'.format(datetime.datetime.now()))
    print('\n##################')
Example #31
def main(args):
    add_args = {}
    if args.n_threads is not None:
        add_args['master'] = f'local[{args.n_threads}]'
    hl.init(default_reference='GRCh38', log='/load_finngen.log', **add_args)

    if args.load_single:
        ht = hl.import_table(args.input_file,
                             impute=True,
                             force_bgz=True,
                             min_partitions=100).rename({'#chrom': 'chrom'})
        ht = ht.transmute(locus=hl.locus('chr' + ht.chrom, ht.pos),
                          alleles=[ht.ref, ht.alt]).key_by('locus', 'alleles')
        ht = ht.transmute(Pvalue=ht.pval).annotate_globals(
            **json.loads(args.additional_dict))
        ht = ht.annotate(**get_vep_formatted_data(args.vep_path)[ht.key])
        ht = ht.checkpoint(args.output_ht,
                           overwrite=args.overwrite,
                           _read_if_exists=not args.overwrite)
        ht = ht.select_globals().annotate(**json.loads(args.additional_dict))
        mt = ht.to_matrix_table(
            ['locus', 'alleles'], ['phenocode'],
            ['rsids', 'nearest_genes', 'gene', 'annotation'],
            ['category', 'name', 'n_cases', 'n_controls'])
        mt.checkpoint(args.output_mt,
                      overwrite=args.overwrite,
                      _read_if_exists=not args.overwrite)

    if args.combine_all:
        # all_hts = list(filter(lambda y: y.endswith('.ht'), map(lambda x: x['path'], hl.hadoop_ls(args.input_directory))))
        # print(f'Got {len(all_hts)} HTs...')
        # mt = mwzj_hts_by_tree(all_hts, temp_bucket + '/finngen', ['phenocode'], debug=True)
        # mt.checkpoint(temp_mt_path, overwrite=args.overwrite, _read_if_exists=not args.overwrite)
        mt = hl.read_matrix_table(temp_mt_path)
        mt.naive_coalesce(5000).write(args.output_mt, args.overwrite)
Example #32
    def test_reference_genome_sequence(self):
        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file(
            "test_rg",
            resource("fake_reference.fasta"),
            resource("fake_reference.fasta.fai"),
            mt_contigs=["b", "c"],
            x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(
            hl.eval(
                t.all(
                    hl.get_sequence(t.contig, t.pos, reference_genome=gr4) ==
                    t.base)))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(
            hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))

        gr4.remove_sequence()
        assert not gr4.has_sequence()

        gr4.add_sequence(resource("fake_reference.fasta"),
                         resource("fake_reference.fasta.fai"))
        assert gr4.has_sequence()
Example #33
    def subset_samples_and_variants(mt, subset_path):
        """
        Subset the MatrixTable to the provided list of samples and to variants present in those samples
        :param mt: MatrixTable from VCF
        :param subset_path: Path to a file with a single column 's'
        :return: MatrixTable subsetted to list of samples
        """
        subset_ht = hl.import_table(subset_path, key='s')
        subset_count = subset_ht.count()
        anti_join_ht = subset_ht.anti_join(mt.cols())
        anti_join_ht_count = anti_join_ht.count()

        if anti_join_ht_count != 0:
            missing_samples = anti_join_ht.s.collect()
            raise MatrixTableSampleSetError(
                f'Only {subset_count-anti_join_ht_count} out of {subset_count} '
                'subsetting-table IDs matched IDs in the variant callset.\n'
                f'IDs that aren\'t in the callset: {missing_samples}\n'
                f'All callset sample IDs:{mt.s.collect()}', missing_samples)

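        # Subset columns to the requested samples, then drop variants with no non-ref calls remaining.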
        full_count = mt.count_cols()
        mt = mt.semi_join_cols(subset_ht)
        mt = mt.filter_rows((hl.agg.count_where(mt.GT.is_non_ref())) > 0)

        logger.info(f'Finished subsetting samples. Kept {subset_count} '
                    f'out of {full_count} samples in vds')
        return mt
Example #34
def run_gwas(vcf_file, phenotypes_file, output_file):
    table = hl.import_table(phenotypes_file, impute=True).key_by('Sample')

    hl.import_vcf(vcf_file).write('tmp.mt')
    mt = hl.read_matrix_table('tmp.mt')

    mt = mt.annotate_cols(pheno=table[mt.s])

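    # Estimate ancestry PCs on a 1% downsample of variants; scores are still computed for every sample.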
    downsampled = mt.sample_rows(0.01, seed=11223344)
    eigenvalues, pcs, _ = hl.hwe_normalized_pca(downsampled.GT)

    mt = mt.annotate_cols(scores=pcs[mt.s].scores)

    gwas = hl.linear_regression_rows(
        y=mt.pheno.CaffeineConsumption,
        x=mt.GT.n_alt_alleles(),
        covariates=[1.0, mt.scores[0], mt.scores[1], mt.scores[2]])

    gwas = gwas.select(SNP=hl.variant_str(gwas.locus, gwas.alleles),
                       P=gwas.p_value)
    gwas = gwas.key_by(gwas.SNP)
    gwas = gwas.select(gwas.P)
    gwas.export(f'{output_file}.assoc', header=True)

    hl.export_plink(mt, output_file, fam_id=mt.s, ind_id=mt.s)
Example #35
    def test_import_table_force_bgz(self):
        f = new_temp_file(suffix=".bgz")
        t = hl.utils.range_table(10, 5)
        t.export(f)

        f2 = new_temp_file(suffix=".gz")
        run_command(["cp", uri_path(f), uri_path(f2)])
        t2 = hl.import_table(f2, force_bgz=True, impute=True).key_by('idx')
        self.assertTrue(t._same(t2))
Example #36
    def test_linear_mixed_regression_pass_through(self):
        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()
        p_path = utils.new_temp_file()

        mt_chr1 = mt.filter_rows((mt.locus.contig == '1') & (mt.locus.position < 200))
        model, _ = hl.linear_mixed_model(y=mt_chr1.y, x=[1, mt_chr1.x], z_t=mt_chr1.GT.n_alt_alleles(), p_path=p_path)
        model.fit(log_gamma=0)

        mt_chr3 = mt.filter_rows((mt.locus.contig == '3') & (mt.locus.position < 2005))
        mt_chr3 = mt_chr3.annotate_rows(stats=hl.agg.stats(mt_chr3.GT.n_alt_alleles()), foo=hl.struct(bar=hl.rand_norm(0, 1)))
        ht = hl.linear_mixed_regression_rows((mt_chr3.GT.n_alt_alleles() - mt_chr3.stats.mean) / mt_chr3.stats.stdev,
                                             model, pass_through=['stats', mt_chr3.foo.bar, mt_chr3.cm_position])

        assert mt_chr3.aggregate_rows(hl.agg.all(mt_chr3.foo.bar == ht[mt_chr3.row_key].bar))
Example #37
    def test_reference_genome_sequence(self):
        gr3 = ReferenceGenome.read(resource("fake_ref_genome.json"))
        self.assertEqual(gr3.name, "my_reference_genome")
        self.assertFalse(gr3.has_sequence())

        gr4 = ReferenceGenome.from_fasta_file("test_rg", resource("fake_reference.fasta"),
                                              resource("fake_reference.fasta.fai"),
                                              mt_contigs=["b", "c"], x_contigs=["a"])
        self.assertTrue(gr4.has_sequence())
        self.assertTrue(gr4.x_contigs == ["a"])

        t = hl.import_table(resource("fake_reference.tsv"), impute=True)
        self.assertTrue(hl.eval(t.all(hl.get_sequence(t.contig, t.pos, reference_genome=gr4) == t.base)))

        l = hl.locus("a", 7, gr4)
        self.assertTrue(hl.eval(l.sequence_context(before=3, after=3) == "TTTCGAA"))
Example #38
def init(doctest_namespace):
    # This gets run once per process -- must avoid race conditions
    print("setting up doctest...")

    olddir = os.getcwd()
    os.chdir("docs/")

    doctest_namespace['hl'] = hl
    doctest_namespace['agg'] = agg

    if not os.path.isdir("output/"):
        try:
            os.mkdir("output/")
        except OSError:
            pass

    files = ["sample.vds", "sample.qc.vds", "sample.filtered.vds"]
    for f in files:
        if os.path.isdir(f):
            shutil.rmtree(f)

    ds = hl.read_matrix_table('data/example.vds')
    doctest_namespace['ds'] = ds
    doctest_namespace['dataset'] = ds
    doctest_namespace['dataset2'] = ds.annotate_globals(global_field=5)
    doctest_namespace['dataset_to_union_1'] = ds
    doctest_namespace['dataset_to_union_2'] = ds

    v_metadata = ds.rows().annotate_globals(global_field=5).annotate(consequence='SYN')
    doctest_namespace['v_metadata'] = v_metadata

    s_metadata = ds.cols().annotate(pop='AMR', is_case=False, sex='F')
    doctest_namespace['s_metadata'] = s_metadata

    # Table
    table1 = hl.import_table('data/kt_example1.tsv', impute=True, key='ID')
    table1 = table1.annotate_globals(global_field_1=5, global_field_2=10)
    doctest_namespace['table1'] = table1
    doctest_namespace['other_table'] = table1

    table2 = hl.import_table('data/kt_example2.tsv', impute=True, key='ID')
    doctest_namespace['table2'] = table2

    table4 = hl.import_table('data/kt_example4.tsv', impute=True,
                             types={'B': hl.tstruct(B0=hl.tbool, B1=hl.tstr),
                                    'D': hl.tstruct(cat=hl.tint32, dog=hl.tint32),
                                    'E': hl.tstruct(A=hl.tint32, B=hl.tint32)})
    doctest_namespace['table4'] = table4

    people_table = hl.import_table('data/explode_example.tsv', delimiter='\\s+',
                                   types={'Age': hl.tint32, 'Children': hl.tarray(hl.tstr)},
                                   key='Name')
    doctest_namespace['people_table'] = people_table

    # TDT
    doctest_namespace['tdt_dataset'] = hl.import_vcf('data/tdt_tiny.vcf')

    ds2 = hl.variant_qc(ds)
    doctest_namespace['ds2'] = ds2.select_rows(AF=ds2.variant_qc.AF)

    # Expressions
    doctest_namespace['names'] = hl.literal(['Alice', 'Bob', 'Charlie'])
    doctest_namespace['a1'] = hl.literal([0, 1, 2, 3, 4, 5])
    doctest_namespace['a2'] = hl.literal([1, -1, 1, -1, 1, -1])
    doctest_namespace['t'] = hl.literal(True)
    doctest_namespace['f'] = hl.literal(False)
    doctest_namespace['na'] = hl.null(hl.tbool)
    doctest_namespace['call'] = hl.call(0, 1, phased=False)
    doctest_namespace['a'] = hl.literal([1, 2, 3, 4, 5])
    doctest_namespace['d'] = hl.literal({'Alice': 43, 'Bob': 33, 'Charles': 44})
    doctest_namespace['interval'] = hl.interval(3, 11)
    doctest_namespace['locus_interval'] = hl.parse_locus_interval("1:53242-90543")
    doctest_namespace['locus'] = hl.locus('1', 1034245)
    doctest_namespace['x'] = hl.literal(3)
    doctest_namespace['y'] = hl.literal(4.5)
    doctest_namespace['s1'] = hl.literal({1, 2, 3})
    doctest_namespace['s2'] = hl.literal({1, 3, 5})
    doctest_namespace['s3'] = hl.literal({'Alice', 'Bob', 'Charlie'})
    doctest_namespace['struct'] = hl.struct(a=5, b='Foo')
    doctest_namespace['tup'] = hl.literal(("a", 1, [1, 2, 3]))
    doctest_namespace['s'] = hl.literal('The quick brown fox')
    doctest_namespace['interval2'] = hl.Interval(3, 6)

    # Overview
    doctest_namespace['ht'] = hl.import_table("data/kt_example1.tsv", impute=True)
    doctest_namespace['mt'] = ds

    gnomad_data = ds.rows()
    doctest_namespace['gnomad_data'] = gnomad_data.select(gnomad_data.info.AF)

    # BGEN
    bgen = hl.import_bgen('data/example.8bits.bgen',
                          entry_fields=['GT', 'GP', 'dosage'])
    doctest_namespace['variants_table'] = bgen.rows()

    print("finished setting up doctest...")
    yield
    os.chdir(olddir)
Example #39
def import_gtf(path, reference_genome=None, skip_invalid_contigs=False, min_partitions=None) -> hl.Table:
    """Import a GTF file.

       The GTF file format is identical to the GFF version 2 file format,
       and so this function can be used to import GFF version 2 files as
       well.

       See https://www.ensembl.org/info/website/upload/gff.html for more
       details on the GTF/GFF2 file format.

       The :class:`.Table` returned by this function will be keyed by the
       ``interval`` row field and will include the following row fields:

       .. code-block:: text

           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'interval': interval<>

       There will also be corresponding fields for every tag found in the
       attribute field of the GTF file.

       Note
       ----

       This function will return an ``interval`` field of type :class:`.tinterval`
       constructed from the ``seqname``, ``start``, and ``end`` fields in the
       GTF file. This interval is inclusive of both the start and end positions
       in the GTF file. 

       If the ``reference_genome`` parameter is specified, the start and end
       points of the ``interval`` field will be of type :class:`.tlocus`.
       Otherwise, the start and end points of the ``interval`` field will be of
       type :class:`.tstruct` with fields ``seqname`` (type :class:`str`) and
       ``position`` (type :class:`.tint32`).

       Furthermore, if the ``reference_genome`` parameter is specified and
       ``skip_invalid_contigs`` is ``True``, this import function will skip
       lines in the GTF where ``seqname`` is not consistent with the reference
       genome specified.

       Example
       -------

       >>> ht = hl.experimental.import_gtf('data/test.gtf', 
       ...                                 reference_genome='GRCh37',
       ...                                 skip_invalid_contigs=True)

       >>> ht.describe()  # doctest: +NOTEST
       ----------------------------------------
       Global fields:
       None
       ----------------------------------------
       Row fields:
           'source': str
           'feature': str
           'score': float64
           'strand': str
           'frame': int32
           'gene_type': str
           'exon_id': str
           'havana_transcript': str
           'level': str
           'transcript_name': str
           'gene_status': str
           'gene_id': str
           'transcript_type': str
           'tag': str
           'transcript_status': str
           'gene_name': str
           'transcript_id': str
           'exon_number': str
           'havana_gene': str
           'interval': interval<locus<GRCh37>>
       ----------------------------------------
       Key: ['interval']
       ----------------------------------------

       Parameters
       ----------

       path : :obj:`str`
           File to import.
       reference_genome : :obj:`str` or :class:`.ReferenceGenome`, optional
           Reference genome to use.
       skip_invalid_contigs : :obj:`bool`
           If ``True`` and `reference_genome` is not ``None``, skip lines where
           ``seqname`` is not consistent with the reference genome.
       min_partitions : :obj:`int` or :obj:`None`
           Minimum number of partitions (passed to import_table).

       Returns
       -------
       :class:`.Table`
       """

    ht = hl.import_table(path,
                         min_partitions=min_partitions,
                         comment='#',
                         no_header=True,
                         types={'f3': hl.tint,
                                'f4': hl.tint,
                                'f5': hl.tfloat,
                                'f7': hl.tint},
                         missing='.',
                         delimiter='\t')

    ht = ht.rename({'f0': 'seqname',
                    'f1': 'source',
                    'f2': 'feature',
                    'f3': 'start',
                    'f4': 'end',
                    'f5': 'score',
                    'f6': 'strand',
                    'f7': 'frame',
                    'f8': 'attribute'})

    ht = ht.annotate(attribute=hl.dict(
        hl.map(lambda x: (x.split(' ')[0],
                          x.split(' ')[1].replace('"', '').replace(';$', '')),
               ht['attribute'].split('; '))))

    attributes = ht.aggregate(hl.agg.explode(lambda x: hl.agg.collect_as_set(x), ht['attribute'].keys()))

    ht = ht.transmute(**{x: hl.or_missing(ht['attribute'].contains(x),
                                          ht['attribute'][x])
                         for x in attributes if x})

    if reference_genome:
        if reference_genome == 'GRCh37':
            ht = ht.annotate(seqname=ht['seqname'].replace('^chr', ''))
        else:
            ht = ht.annotate(seqname=hl.case()
                                       .when(ht['seqname'].startswith('HLA'), ht['seqname'])
                                       .when(ht['seqname'].startswith('chrHLA'), ht['seqname'].replace('^chr', ''))
                                       .when(ht['seqname'].startswith('chr'), ht['seqname'])
                                       .default('chr' + ht['seqname']))
        if skip_invalid_contigs:
            valid_contigs = hl.literal(set(hl.get_reference(reference_genome).contigs))
            ht = ht.filter(valid_contigs.contains(ht['seqname']))
        ht = ht.transmute(interval=hl.locus_interval(ht['seqname'],
                                                     ht['start'],
                                                     ht['end'],
                                                     includes_start=True,
                                                     includes_end=True,
                                                     reference_genome=reference_genome))
    else:
        ht = ht.transmute(interval=hl.interval(hl.struct(seqname=ht['seqname'], position=ht['start']),
                                               hl.struct(seqname=ht['seqname'], position=ht['end']),
                                               includes_start=True,
                                               includes_end=True))

    ht = ht.key_by('interval')

    return ht
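As a follow-on to the docstring example above, the interval key makes overlap queries straightforward. A hedged sketch (the query locus is an arbitrary assumption):

    ht = hl.experimental.import_gtf('data/test.gtf',
                                    reference_genome='GRCh37',
                                    skip_invalid_contigs=True)
    genes = ht.filter(ht.feature == 'gene')
    # Intervals are inclusive on both ends, so boundary positions also match.
    genes.filter(genes.interval.contains(hl.locus('1', 100000, reference_genome='GRCh37'))).show()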
Example #40
    def test_export_plink_exprs(self):
        ds = get_dataset()
        fam_mapping = {'f0': 'fam_id', 'f1': 'ind_id', 'f2': 'pat_id', 'f3': 'mat_id',
                       'f4': 'is_female', 'f5': 'pheno'}
        bim_mapping = {'f0': 'contig', 'f1': 'varid', 'f2': 'cm_position',
                       'f3': 'position', 'f4': 'a1', 'f5': 'a2'}

        # Test default arguments
        out1 = new_temp_file()
        hl.export_plink(ds, out1)
        fam1 = (hl.import_table(out1 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))
        bim1 = (hl.import_table(out1 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(fam1.all((fam1.fam_id == "0") & (fam1.pat_id == "0") &
                                 (fam1.mat_id == "0") & (fam1.is_female == "0") &
                                 (fam1.pheno == "NA")))
        self.assertTrue(bim1.all((bim1.varid == bim1.contig + ":" + bim1.position + ":" + bim1.a2 + ":" + bim1.a1) &
                                 (bim1.cm_position == "0.0")))

        # Test non-default FAM arguments
        out2 = new_temp_file()
        hl.export_plink(ds, out2, ind_id=ds.s, fam_id=ds.s, pat_id="nope",
                        mat_id="nada", is_female=True, pheno=False)
        fam2 = (hl.import_table(out2 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

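        # PLINK .fam files encode sex as 0/1/2 (unknown/male/female) and
        # case-control phenotypes as 1/2 (control/case), hence "2" and "1" below.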
        self.assertTrue(fam2.all((fam2.fam_id == fam2.ind_id) & (fam2.pat_id == "nope") &
                                 (fam2.mat_id == "nada") & (fam2.is_female == "2") &
                                 (fam2.pheno == "1")))

        # Test quantitative phenotype
        out3 = new_temp_file()
        hl.export_plink(ds, out3, ind_id=ds.s, pheno=hl.float64(hl.len(ds.s)))
        fam3 = (hl.import_table(out3 + '.fam', no_header=True, impute=False, missing="")
                .rename(fam_mapping))

        self.assertTrue(fam3.all((fam3.fam_id == "0") & (fam3.pat_id == "0") &
                                 (fam3.mat_id == "0") & (fam3.is_female == "0") &
                                 (fam3.pheno != "0") & (fam3.pheno != "NA")))

        # Test non-default BIM arguments
        out4 = new_temp_file()
        hl.export_plink(ds, out4, varid="hello", cm_position=100)
        bim4 = (hl.import_table(out4 + '.bim', no_header=True, impute=False)
                .rename(bim_mapping))

        self.assertTrue(bim4.all((bim4.varid == "hello") & (bim4.cm_position == "100.0")))

        # Test call expr
        out5 = new_temp_file()
        ds_call = ds.annotate_entries(gt_fake=hl.call(0, 0))
        hl.export_plink(ds_call, out5, call=ds_call.gt_fake)
        ds_all_hom_ref = hl.import_plink(out5 + '.bed', out5 + '.bim', out5 + '.fam')
        nerrors = ds_all_hom_ref.aggregate_entries(hl.agg.count_where(~ds_all_hom_ref.GT.is_hom_ref()))
        self.assertTrue(nerrors == 0)

        # Test white-space in FAM id expr raises error
        with self.assertRaisesRegex(TypeError, "has spaces in the following values:"):
            hl.export_plink(ds, new_temp_file(), mat_id="hello world")

        # Test white-space in varid expr raises error
        with self.assertRaisesRegex(FatalError, "no white space allowed:"):
            hl.export_plink(ds, new_temp_file(), varid="hello world")
Example #41
    def test_ld_score_regression(self):

        ht_scores = hl.import_table(
            doctest_resource('ld_score_regression.univariate_ld_scores.tsv'),
            key='SNP', types={'L2': hl.tfloat, 'BP': hl.tint})

        ht_50_irnt = hl.import_table(
            doctest_resource('ld_score_regression.50_irnt.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_50_irnt = ht_50_irnt.annotate(
            chi_squared=ht_50_irnt['Z']**2,
            n=ht_50_irnt['N'],
            ld_score=ht_scores[ht_50_irnt['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_50_irnt['SNP']]['CHR'],
                           ht_scores[ht_50_irnt['SNP']]['BP']),
            alleles=hl.array([ht_50_irnt['A2'], ht_50_irnt['A1']]),
            phenotype='50_irnt')

        ht_50_irnt = ht_50_irnt.key_by(ht_50_irnt['locus'],
                                       ht_50_irnt['alleles'])

        ht_50_irnt = ht_50_irnt.select(ht_50_irnt['chi_squared'],
                                       ht_50_irnt['n'],
                                       ht_50_irnt['ld_score'],
                                       ht_50_irnt['phenotype'])

        ht_20160 = hl.import_table(
            doctest_resource('ld_score_regression.20160.sumstats.tsv'),
            key='SNP', types={'N': hl.tint, 'Z': hl.tfloat})

        ht_20160 = ht_20160.annotate(
            chi_squared=ht_20160['Z']**2,
            n=ht_20160['N'],
            ld_score=ht_scores[ht_20160['SNP']]['L2'],
            locus=hl.locus(ht_scores[ht_20160['SNP']]['CHR'],
                           ht_scores[ht_20160['SNP']]['BP']),
            alleles=hl.array([ht_20160['A2'], ht_20160['A1']]),
            phenotype='20160')

        ht_20160 = ht_20160.key_by(ht_20160['locus'],
                                   ht_20160['alleles'])

        ht_20160 = ht_20160.select(ht_20160['chi_squared'],
                                   ht_20160['n'],
                                   ht_20160['ld_score'],
                                   ht_20160['phenotype'])

        ht = ht_50_irnt.union(ht_20160)
        mt = ht.to_matrix_table(row_key=['locus', 'alleles'],
                                col_key=['phenotype'],
                                row_fields=['ld_score'],
                                col_fields=[])

        mt_tmp = new_temp_file()
        mt.write(mt_tmp, overwrite=True)
        mt = hl.read_matrix_table(mt_tmp)

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=mt['ld_score'],
            ld_score_expr=mt['ld_score'],
            chi_sq_exprs=mt['chi_squared'],
            n_samples_exprs=mt['n'],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results['50_irnt']['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results['50_irnt']['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results['20160']['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results['20160']['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results['20160']['snp_heritability_standard_error'],
            0.0416, places=4)

        ht = ht_50_irnt.annotate(
            chi_squared_50_irnt=ht_50_irnt['chi_squared'],
            n_50_irnt=ht_50_irnt['n'],
            chi_squared_20160=ht_20160[ht_50_irnt.key]['chi_squared'],
            n_20160=ht_20160[ht_50_irnt.key]['n'])

        ht_results = hl.experimental.ld_score_regression(
            weight_expr=ht['ld_score'],
            ld_score_expr=ht['ld_score'],
            chi_sq_exprs=[ht['chi_squared_50_irnt'],
                          ht['chi_squared_20160']],
            n_samples_exprs=[ht['n_50_irnt'],
                             ht['n_20160']],
            n_blocks=20,
            two_step_threshold=5,
            n_reference_panel_variants=1173569)

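        # When expressions are passed as lists from a Table, result phenotypes
        # are keyed by integer position (0, 1, ...) rather than by name.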
        results = {
            x['phenotype']: {
                'mean_chi_sq': x['mean_chi_sq'],
                'intercept_estimate': x['intercept']['estimate'],
                'intercept_standard_error': x['intercept']['standard_error'],
                'snp_heritability_estimate': x['snp_heritability']['estimate'],
                'snp_heritability_standard_error':
                    x['snp_heritability']['standard_error']}
            for x in ht_results.collect()}

        self.assertAlmostEqual(
            results[0]['mean_chi_sq'],
            3.4386, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_estimate'],
            0.7727, places=4)
        self.assertAlmostEqual(
            results[0]['intercept_standard_error'],
            0.2461, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_estimate'],
            0.3845, places=4)
        self.assertAlmostEqual(
            results[0]['snp_heritability_standard_error'],
            0.1067, places=4)

        self.assertAlmostEqual(
            results[1]['mean_chi_sq'],
            1.5209, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_estimate'],
            1.2109, places=4)
        self.assertAlmostEqual(
            results[1]['intercept_standard_error'],
            0.2238, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_estimate'],
            0.0486, places=4)
        self.assertAlmostEqual(
            results[1]['snp_heritability_standard_error'],
            0.0416, places=4)
Example #42
ds = ds.annotate_cols(pheno=hl.struct(age=hl.rand_norm(65, 10),
                                      height=hl.rand_norm(70, 10),
                                      blood_pressure=hl.rand_norm(120, 20),
                                      cohort_name="cohort1"),
                      cov=hl.struct(PC1=hl.rand_norm(0, 1)),
                      cov1=hl.rand_norm(0, 1),
                      cov2=hl.rand_norm(0, 1),
                      cohort="SIGMA")
ds = ds.annotate_globals(global_field_1=5,
                         global_field_2=10,
                         pli={'SCN1A': 0.999, 'SONIC': 0.014},
                         populations=['AFR', 'EAS', 'EUR', 'SAS', 'AMR', 'HIS'])
ds = ds.annotate_rows(gene=['TTN'])
ds = ds.annotate_cols(cohorts=['1kg'], pop='EAS')
ds.write('data/example.vds', overwrite=True)

lmmreg_ds = hl.variant_qc(hl.split_multi_hts(hl.import_vcf('data/sample.vcf.bgz')))
lmmreg_tsv = hl.import_table('data/example_lmmreg.tsv', 'Sample', impute=True)
lmmreg_ds = lmmreg_ds.annotate_cols(**lmmreg_tsv[lmmreg_ds['s']])
lmmreg_ds = lmmreg_ds.annotate_rows(use_in_kinship=lmmreg_ds.variant_qc.AF[1] > 0.05)
lmmreg_ds.write('data/example_lmmreg.vds', overwrite=True)

burden_ds = hl.import_vcf('data/example_burden.vcf')
burden_kt = hl.import_table('data/example_burden.tsv', key='Sample', impute=True)
burden_ds = burden_ds.annotate_cols(burden=burden_kt[burden_ds.s])
burden_ds = burden_ds.annotate_rows(weight=hl.float64(burden_ds.locus.position))
burden_ds = hl.variant_qc(burden_ds)
genekt = hl.import_locus_intervals('data/gene.interval_list')
burden_ds = burden_ds.annotate_rows(gene=genekt[burden_ds.locus])
burden_ds.write('data/example_burden.vds', overwrite=True)
Example #43
def ld_score(entry_expr,
             locus_expr,
             radius,
             coord_expr=None,
             annotation_exprs=None,
             block_size=None) -> Table:
    """Calculate LD scores.

    Example
    -------

    >>> # Load genetic data into MatrixTable
    >>> mt = hl.import_plink(bed='data/ldsc.bed',
    ...                      bim='data/ldsc.bim',
    ...                      fam='data/ldsc.fam')

    >>> # Create locus-keyed Table with numeric variant annotations
    >>> ht = hl.import_table('data/ldsc.annot',
    ...                      types={'BP': hl.tint,
    ...                             'binary': hl.tfloat,
    ...                             'continuous': hl.tfloat})
    >>> ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
    >>> ht = ht.key_by('locus')

    >>> # Annotate MatrixTable with external annotations
    >>> mt = mt.annotate_rows(binary_annotation=ht[mt.locus].binary,
    ...                       continuous_annotation=ht[mt.locus].continuous)

    >>> # Calculate LD scores using centimorgan coordinates
    >>> ht_scores = hl.experimental.ld_score(entry_expr=mt.GT.n_alt_alleles(),
    ...                                      locus_expr=mt.locus,
    ...                                      radius=1.0,
    ...                                      coord_expr=mt.cm_position,
    ...                                      annotation_exprs=[mt.binary_annotation,
    ...                                                        mt.continuous_annotation])

    >>> # Show results
    >>> ht_scores.show(3)

    .. code-block:: text

        +---------------+-------------------+-----------------------+-------------+
        | locus         | binary_annotation | continuous_annotation |  univariate |
        +---------------+-------------------+-----------------------+-------------+
        | locus<GRCh37> |           float64 |               float64 |     float64 |
        +---------------+-------------------+-----------------------+-------------+
        | 20:82079      |       1.15183e+00 |           7.30145e+01 | 1.60117e+00 |
        | 20:103517     |       2.04604e+00 |           2.75392e+02 | 4.69239e+00 |
        | 20:108286     |       2.06585e+00 |           2.86453e+02 | 5.00124e+00 |
        +---------------+-------------------+-----------------------+-------------+


    Warning
    -------
        :func:`.ld_score` will fail if ``entry_expr`` results in any missing
        values. The special float value ``nan`` is not considered a
        missing value.

    **Further reading**

    For more in-depth discussion of LD scores, see:

    - `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__
    - `Partitioning heritability by functional annotation using genome-wide association summary statistics (Finucane et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4626285/>`__

    Notes
    -----

    `entry_expr`, `locus_expr`, `coord_expr` (if specified), and
    `annotation_exprs` (if specified) must come from the same
    MatrixTable.


    Parameters
    ----------
    entry_expr : :class:`.NumericExpression`
        Expression for entries of genotype matrix
        (e.g. ``mt.GT.n_alt_alleles()``).
    locus_expr : :class:`.LocusExpression`
        Row-indexed locus expression.
    radius : :obj:`int` or :obj:`float`
        Radius of window for row values (in units of `coord_expr` if set,
        otherwise in units of basepairs).
    coord_expr : :class:`.Float64Expression`, optional
        Row-indexed numeric expression for the row value used to window
        variants. By default, the row value is given by the locus
        position.
    annotation_exprs : :class:`.NumericExpression` or :obj:`list` of :class:`.NumericExpression`, optional
        Annotation expression(s) to partition LD scores. Univariate
        annotation will always be included and does not need to be
        specified.
    block_size : :obj:`int`, optional
        Block size. Default given by :meth:`.BlockMatrix.default_block_size`.

    Returns
    -------
    :class:`.Table`
        Table keyed by `locus_expr` with LD scores for each variant and
        `annotation_expr`. The function will always return LD scores for
        the univariate (all SNPs) annotation."""

    mt = entry_expr._indices.source
    mt_locus_expr = locus_expr._indices.source

    if coord_expr is None:
        mt_coord_expr = mt_locus_expr
    else:
        mt_coord_expr = coord_expr._indices.source

    if not annotation_exprs:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr])
    else:
        check_mts = all([mt == mt_locus_expr,
                         mt == mt_coord_expr] +
                        [mt == x._indices.source
                         for x in wrap_to_list(annotation_exprs)])

    if not check_mts:
        raise ValueError("""ld_score: entry_expr, locus_expr, coord_expr
                            (if specified), and annotation_exprs (if
                            specified) must come from same MatrixTable.""")

    n = mt.count_cols()
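    # Squared sample correlations are biased upward; the standard LDSC
    # adjustment r2_adj = r2 * (n-1)/(n-2) - 1/(n-2) is approximately unbiased.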
    r2 = hl.row_correlation(entry_expr, block_size) ** 2
    r2_adj = ((n-1.0) / (n-2.0)) * r2 - (1.0 / (n-2.0))

    starts, stops = hl.linalg.utils.locus_windows(locus_expr,
                                                  radius,
                                                  coord_expr)
    r2_adj_sparse = r2_adj.sparsify_row_intervals(starts, stops)

    r2_adj_sparse_tmp = new_temp_file()
    r2_adj_sparse.write(r2_adj_sparse_tmp)
    r2_adj_sparse = BlockMatrix.read(r2_adj_sparse_tmp)

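    # The LD score of variant j for annotation c is sum_k r2_adj[j, k] * a[k, c];
    # the univariate score takes a to be the all-ones column (a plain row sum).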
    if not annotation_exprs:
        cols = ['univariate']
        col_idxs = {0: 'univariate'}
        l2 = r2_adj_sparse.sum(axis=1)
    else:
        ht = mt.select_rows(*wrap_to_list(annotation_exprs)).rows()
        ht = ht.annotate(univariate=hl.literal(1.0))
        names = [name for name in ht.row if name not in ht.key]

        ht_union = hl.Table.union(
            *[(ht.annotate(name=hl.str(x),
                           value=hl.float(ht[x]))
                 .select('name', 'value')) for x in names])
        mt_annotations = ht_union.to_matrix_table(
            row_key=list(ht_union.key),
            col_key=['name'])

        cols = mt_annotations.key_cols_by()['name'].collect()
        col_idxs = {i: cols[i] for i in range(len(cols))}

        a_tmp = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt_annotations.value, a_tmp)

        a = BlockMatrix.read(a_tmp)
        l2 = r2_adj_sparse @ a

    l2_bm_tmp = new_temp_file()
    l2_tsv_tmp = new_temp_file()
    l2.write(l2_bm_tmp, force_row_major=True)
    BlockMatrix.export(l2_bm_tmp, l2_tsv_tmp)

    ht_scores = hl.import_table(l2_tsv_tmp, no_header=True, impute=True)
    ht_scores = ht_scores.add_index()
    ht_scores = ht_scores.key_by('idx')
    ht_scores = ht_scores.rename({'f{:}'.format(i): col_idxs[i]
                                  for i in range(len(cols))})

    ht = mt.select_rows(__locus=locus_expr).rows()
    ht = ht.add_index()
    ht = ht.annotate(**ht_scores[ht.idx])
    ht = ht.key_by('__locus')
    ht = ht.select(*[x for x in ht_scores.row if x not in ht_scores.key])
    ht = ht.rename({'__locus': 'locus'})

    return ht
Example #44
    def test_linear_mixed_model_fastlmm(self):
        # FastLMM Test data is from all.bed, all.bim, all.fam, cov.txt, pheno_10_causals.txt:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/tree/master/tests/datasets/synth
        #
        # Data is filtered to chromosomes 1 and 3 and to samples 0-124 and 375-499 (2000 variants and 250 samples).
        #
        # Results are computed with single_snp (with LOCO) as in:
        #   https://github.com/MicrosoftGenomics/FaST-LMM/blob/master/doc/ipynb/FaST-LMM.ipynb

        n, m = 250, 1000  # per chromosome

        x_table = hl.import_table(resource('fastlmmCov.txt'), no_header=True, impute=True).key_by('f1')
        y_table = hl.import_table(resource('fastlmmPheno.txt'), no_header=True, impute=True, delimiter=' ').key_by('f1')

        mt = hl.import_plink(bed=resource('fastlmmTest.bed'),
                             bim=resource('fastlmmTest.bim'),
                             fam=resource('fastlmmTest.fam'),
                             reference_genome=None)
        mt = mt.annotate_cols(x=x_table[mt.col_key].f2)
        mt = mt.annotate_cols(y=y_table[mt.col_key].f2).cache()

        x = np.array([np.ones(n), mt.key_cols_by()['x'].collect()]).T
        y = np.array(mt.key_cols_by()['y'].collect())

        mt_chr1 = mt.filter_rows(mt.locus.contig == '1')
        mt_chr3 = mt.filter_rows(mt.locus.contig == '3')

        # testing chrom 1 for h2, betas, p-values
        h2_fastlmm = 0.14276125
        beta_fastlmm = [0.012202061, 0.037718282, -0.033572693, 0.29171541, -0.045644170]

        # FastLMM p-values do not agree to high precision because FastLMM regresses
        # x out of each SNP first and performs an F(1, dof)-test on (beta / se)^2
        # (a t-test), whereas Hail performs a likelihood ratio test.
        # We verify below that Hail's p-values remain fixed going forward.
        # fastlmm = [0.84650294, 0.57865098, 0.59050998, 1.6649473e-06, 0.46892059]
        pval_hail = [0.84543084, 0.57596760, 0.58788517, 1.4057279e-06, 0.46578204]

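        # gamma is the variance-component ratio sigma_g^2 / sigma_e^2,
        # equal to h2 / (1 - h2).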
        gamma_fastlmm = h2_fastlmm / (1 - h2_fastlmm)

        g = BlockMatrix.from_entry_expr(mt_chr1.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
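        # K = (n/m) * G_std @ G_std.T is the realized relatedness matrix; rotating
        # y and x by its eigenvectors (p = u.T) diagonalizes the model covariance,
        # reducing the LMM fit to a weighted linear regression.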
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.13770773  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        mt3_chr3_5var = mt_chr3.filter_rows(mt_chr3.locus.position < 2005)  # first 5
        a = BlockMatrix.from_entry_expr(mt3_chr3_5var.GT.n_alt_alleles()).to_numpy().T

        # FastLMM standardizes each variant to have mean 0 and variance 1.
        a = self._filter_and_standardize_cols(a) * np.sqrt(n)
        pa = p @ a

        model.fit(log_gamma=np.log(gamma_fastlmm))

        res = model.fit_alternatives_numpy(pa, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # low rank
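        # K has rank r < n, so only eigenvectors with nonzero eigenvalues are
        # kept; the nonzero eigenvalues of G_std.T @ G_std match those of
        # G_std @ G_std.T (up to the n/m scaling applied below).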
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        model.fit(log_gamma=np.log(gamma_fastlmm))

        pa = p @ a
        res = model.fit_alternatives_numpy(pa, a, return_pandas=True)

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        a_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(a.T).write(a_t_path, force_row_major=True)

        pa_t_path = utils.new_temp_file(suffix='bm')
        BlockMatrix.from_numpy(pa.T).write(pa_t_path, force_row_major=True)

        res = model.fit_alternatives(pa_t_path, a_t_path).to_pandas()

        assert np.allclose(res['beta'], beta_fastlmm)
        assert np.allclose(res['p_value'], pval_hail)

        # testing chrom 3 for h2
        h2_fastlmm = 0.36733240

        g = BlockMatrix.from_entry_expr(mt_chr3.GT.n_alt_alleles()).to_numpy().T
        g_std = self._filter_and_standardize_cols(g)

        # full rank
        k = (g_std @ g_std.T) * (n / m)
        s, u = np.linalg.eigh(k)
        p = u.T
        model = LinearMixedModel(p @ y, p @ x, s)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)

        h2_std_error = 0.17409641  # hard coded having checked against plot
        assert np.isclose(model.h_sq_standard_error, h2_std_error)

        h_sq_norm_lkhd = model.h_sq_normalized_lkhd()[1:-1]
        argmax = int(100 * h2_fastlmm)
        assert argmax <= np.argmax(h_sq_norm_lkhd) + 1 <= argmax + 1
        assert np.isclose(np.sum(h_sq_norm_lkhd), 1.0)

        # low rank
        ld = g_std.T @ g_std
        sl, v = np.linalg.eigh(ld)
        n_eigenvectors = int(np.sum(sl > 1e-10))
        assert n_eigenvectors < n
        sl = sl[-n_eigenvectors:]
        v = v[:, -n_eigenvectors:]
        s = sl * (n / m)
        p = (g_std @ (v / np.sqrt(sl))).T
        model = LinearMixedModel(p @ y, p @ x, s, y, x)
        model.fit()

        assert np.isclose(model.h_sq, h2_fastlmm)
        assert np.isclose(model.h_sq_standard_error, h2_std_error)
Example #45
    def test_ld_score(self):

        ht = hl.import_table(doctest_resource('ldsc.annot'),
                             types={'BP': hl.tint,
                                    'CM': hl.tfloat,
                                    'binary': hl.tint,
                                    'continuous': hl.tfloat})
        ht = ht.annotate(locus=hl.locus(ht.CHR, ht.BP))
        ht = ht.key_by('locus')

        mt = hl.import_plink(bed=doctest_resource('ldsc.bed'),
                             bim=doctest_resource('ldsc.bim'),
                             fam=doctest_resource('ldsc.fam'))
        mt = mt.annotate_rows(binary=ht[mt.locus].binary,
                              continuous=ht[mt.locus].continuous)

        ht_univariate = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position)

        ht_annotated = hl.experimental.ld_score(
            entry_expr=mt.GT.n_alt_alleles(),
            locus_expr=mt.locus,
            radius=1.0,
            coord_expr=mt.cm_position,
            annotation_exprs=[mt.binary,
                              mt.continuous])

        univariate = ht_univariate.aggregate(hl.struct(
            chr20=hl.agg.filter(
                (ht_univariate.locus.contig == '20') &
                (ht_univariate.locus.position == 82079),
                hl.agg.collect(ht_univariate.univariate))[0],
            chr22=hl.agg.filter(
                (ht_univariate.locus.contig == '22') &
                (ht_univariate.locus.position == 16894090),
                hl.agg.collect(ht_univariate.univariate))[0],
            mean=hl.agg.mean(ht_univariate.univariate)))

        self.assertAlmostEqual(univariate.chr20, 1.601, places=3)
        self.assertAlmostEqual(univariate.chr22, 1.140, places=3)
        self.assertAlmostEqual(univariate.mean, 3.507, places=3)

        annotated = ht_annotated.aggregate(
            hl.struct(
                chr20=hl.struct(binary=hl.agg.filter(
                    (ht_annotated.locus.contig == '20') &
                    (ht_annotated.locus.position == 82079),
                    hl.agg.collect(ht_annotated.binary))[0],
                                continuous=hl.agg.filter(
                                    (ht_annotated.locus.contig == '20') &
                                    (ht_annotated.locus.position == 82079),
                                    hl.agg.collect(ht_annotated.continuous))[0]),
                chr22=hl.struct(
                    binary=hl.agg.filter(
                        (ht_annotated.locus.contig == '22') &
                        (ht_annotated.locus.position == 16894090),
                        hl.agg.collect(ht_annotated.binary))[0],
                    continuous=hl.agg.filter(
                        (ht_annotated.locus.contig == '22') &
                        (ht_annotated.locus.position == 16894090),
                        hl.agg.collect(ht_annotated.continuous))[0]),
                mean_stats=hl.struct(binary=hl.agg.mean(ht_annotated.binary),
                                     continuous=hl.agg.mean(ht_annotated.continuous))))

        self.assertAlmostEqual(annotated.chr20.binary, 1.152, places=3)
        self.assertAlmostEqual(annotated.chr20.continuous, 73.014, places=3)
        self.assertAlmostEqual(annotated.chr22.binary, 1.107, places=3)
        self.assertAlmostEqual(annotated.chr22.continuous, 102.174, places=3)
        self.assertAlmostEqual(annotated.mean_stats.binary, 0.965, places=3)
        self.assertAlmostEqual(annotated.mean_stats.continuous, 176.528, places=3)
Example #46
    def test_read_back_same_as_exported(self):
        t, _ = create_all_values_datasets()
        tmp_file = new_temp_file(prefix="test", suffix=".tsv")
        t.export(tmp_file)
        t_read_back = hl.import_table(tmp_file, types=dict(t.row.dtype)).key_by('idx')
        self.assertTrue(t.select_globals()._same(t_read_back, tolerance=1e-4, absolute=True))
Example #47
def get_movie_lens(output_dir, overwrite: bool = False):
    """Download public Movie Lens dataset.

    Notes
    -----
    The download is about 6M.

    See the
    `MovieLens website <https://grouplens.org/datasets/movielens/100k/>`__
    for more information about this dataset.

    Parameters
    ----------
    output_dir
        Directory in which to write data.
    overwrite
        If ``True``, overwrite existing files/directories at those locations.
    """

    jhc = Env.hc()._jhc

    _mkdir(jhc, output_dir)

    paths = [os.path.join(output_dir, x) for x in ['movies.ht', 'ratings.ht', 'users.ht']]
    if overwrite or any(not Env.jutils().dirExists(jhc, f) for f in paths):
        init_temp_dir()
        source = resources['movie_lens_100k']
        tmp_path = os.path.join(tmp_dir, 'ml-100k.zip')
        info(f'downloading MovieLens-100k data ...\n'
             f'  Source: {source}')
        urlretrieve(source, tmp_path)
        with zipfile.ZipFile(tmp_path, 'r') as z:
            z.extractall(tmp_dir)

        user_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.user')
        movie_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.item')
        ratings_table_path = os.path.join(tmp_dir, 'ml-100k', 'u.data')
        assert os.path.exists(user_table_path)
        assert os.path.exists(movie_table_path)
        assert os.path.exists(ratings_table_path)

        user_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(user_table_path), 'txt')
        movie_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(movie_table_path), 'txt')
        ratings_cluster_readable = Env.jutils().copyToTmp(jhc, local_path_uri(ratings_table_path), 'txt')

        [movies_path, ratings_path, users_path] = paths

        genres = ['Action', 'Adventure', 'Animation',
                  "Children's", 'Comedy', 'Crime',
                  'Documentary', 'Drama', 'Fantasy',
                  'Film-Noir', 'Horror', 'Musical',
                  'Mystery', 'Romance', 'Sci-Fi',
                  'Thriller', 'War', 'Western']

        # utility functions for importing movies
        def field_to_array(ds, field):
            return hl.cond(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))

        def fields_to_array(ds, fields):
            return hl.flatten(hl.array([field_to_array(ds, f) for f in fields]))
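        # e.g. a movie flagged Action=1 and Drama=1 (all other genre flags 0)
        # maps to the genre list ['Action', 'Drama'].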

        def rename_columns(ht, new_names):
            return ht.rename({k: v for k, v in zip(ht.row, new_names)})

        info(f'importing users table and writing to {users_path} ...')

        users = rename_columns(
            hl.import_table(user_cluster_readable, key=['f0'], no_header=True, impute=True, delimiter='|'),
            ['id', 'age', 'sex', 'occupation', 'zipcode'])
        users.write(users_path, overwrite=True)

        info(f'importing movies table and writing to {movies_path} ...')

        movies = hl.import_table(movie_cluster_readable, key=['f0'], no_header=True, impute=True, delimiter='|')
        movies = rename_columns(movies,
                                ['id', 'title', 'release date', 'video release date', 'IMDb URL', 'unknown'] + genres)
        movies = movies.drop('release date', 'video release date', 'unknown', 'IMDb URL')
        movies = movies.transmute(genres=fields_to_array(movies, genres))
        movies.write(movies_path, overwrite=True)

        info(f'importing ratings table and writing to {ratings_path} ...')

        ratings = hl.import_table(ratings_cluster_readable, no_header=True, impute=True)
        ratings = rename_columns(ratings,
                                 ['user_id', 'movie_id', 'rating', 'timestamp'])
        ratings = ratings.drop('timestamp')
        ratings.write(ratings_path, overwrite=True)

    else:
        info('Movie Lens files found!')
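
A short usage sketch (output directory is hypothetical); the function writes users.ht, movies.ht, and ratings.ht under ``output_dir``, which can be read back with ``hl.read_table``:

get_movie_lens('data/movie_lens')  # hypothetical output directory
users = hl.read_table('data/movie_lens/users.ht')
movies = hl.read_table('data/movie_lens/movies.ht')
ratings = hl.read_table('data/movie_lens/ratings.ht')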