def get_info(split: bool = True) -> VersionedTableResource:
    """
    Build the versioned resource for the gnomAD v3 info Table.

    One TableResource is registered for every entry of ``RELEASES``, and
    ``CURRENT_RELEASE`` is passed as the default version.

    :param split: Whether to return the split or multi-allelic version of the resource
    :return: gnomAD v3 info VersionedTableResource
    """
    # Split tables carry an extra ".split" marker right before the ".ht" extension.
    suffix = ".split" if split else ""

    versions = {}
    for release in RELEASES:
        versions[release] = TableResource(
            path=f"{_annotations_root(release)}/gnomad_genomes_v{release}_info{suffix}.ht"
        )

    return VersionedTableResource(CURRENT_RELEASE, versions)
def get_rf(
    data: str = "rf_result",
    run_hash: Optional[str] = None,
) -> Union[str, TableResource]:
    """
    Locate the requested random-forest (RF) data for a given run.

    ``data`` can take the following values:

        - 'training': training data for the run
        - 'model': pyspark pipeline RF model
        - 'rf_result' (default): HT containing the result of RF filtering

    For 'model' a plain string path ending in ``.model`` is returned; for any
    other value a TableResource pointing at ``<data>.ht`` is returned.

    :param str data: One of 'training', 'model' or 'rf_result' (default)
    :param str run_hash: Hash of RF run to load. It is interpolated into the
        path as-is, so the default ``None`` ends up as the literal text "None".
    :return: Path (for 'model') or TableResource (otherwise) for the desired RF data
    """
    # Everything belonging to one run lives under the same per-hash directory.
    run_dir = f"{tmp_dir}/models/{run_hash}"

    if data != "model":
        return TableResource(f"{run_dir}/{data}.ht")

    return f"{run_dir}/{data}.model"
def coverage(data_type: str) -> VersionedTableResource:
    """
    Build the versioned coverage Table resource for a data type.

    :param data_type: One of "exomes" or "genomes"
    :return: Coverage Table
    :raises DataException: If ``data_type`` is not a member of ``DATA_TYPES``
    """
    if data_type not in DATA_TYPES:
        raise DataException(f'{data_type} not in {DATA_TYPES}, please select a data type from {DATA_TYPES}')

    # Exomes and genomes are released independently, so each has its own
    # default release and its own list of available releases.
    current_release, releases = (
        (CURRENT_EXOME_RELEASE, EXOME_RELEASES)
        if data_type == "exomes"
        else (CURRENT_GENOME_RELEASE, GENOME_RELEASES)
    )

    versions = {}
    for release in releases:
        versions[release] = TableResource(
            path=_public_coverage_ht_path(data_type, release)
        )

    return VersionedTableResource(current_release, versions)
def _import_clinvar(**kwargs) -> hl.Table: clinvar = import_sites_vcf(**kwargs) clinvar = clinvar.filter( hl.len(clinvar.alleles) > 1 ) # Get around problematic single entry in alleles array in the clinvar vcf clinvar = vep_or_lookup_vep(clinvar, reference="GRCh38") return clinvar # Resources with no versioning needed purcell_5k_intervals = TableResource( path= "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht", import_func=_import_purcell_5k, import_args={ "path": "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.interval_list", }, ) na12878_giab = MatrixTableResource( path= "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt", import_func=hl.import_vcf, import_args={ "path": "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz", "force_bgz": True, "min_partitions": 100, "reference_genome": "GRCh38",
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of the given population.

    The path is resolved by ``_ld_scores_path`` with the data type fixed to "genomes".

    :param pop: Population passed through to ``_ld_scores_path``
    :return: TableResource pointing at the LD scores path
    """
    scores_path = _ld_scores_path("genomes", pop)
    return TableResource(path=scores_path)
""" Gets the path to the finalized sample metadata information after sample QC :param version: gnomAD release version :param meta_version: metadata version to return :return: String path to the finalized metadata """ return ( f"{_meta_root_path(version)}/gnomad_v{version}_metadata_v{meta_version}.tsv.gz" ) _meta_versions = { "3.1": TableResource( path= "gs://gnomad/metadata/genomes_v3.1/gnomad_v3.1_sample_qc_metadata.ht"), "3": TableResource( path="gs://gnomad/metadata/genomes_v3/gnomad_v3_metadata_2019-09-27.ht" ), } _project_meta_versions = { "3.1": TableResource( path="gs://gnomad/metadata/genomes_v3.1/v3.1_project_meta.ht"), "3": TableResource( path="gs://gnomad/metadata/genomes_v3/09-09-2019_v3_project_meta.ht", import_func=hl.import_table,
# NA12878 GIAB high-confidence calls (GRCh37), imported from VCF with `hl.import_vcf`.
na12878_giab = MatrixTableResource(
    path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt",
    import_func=hl.import_vcf,
    import_args=dict(
        path="gs://gnomad-public/resources/grch37/na12878/NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz",
        force_bgz=True,
        min_partitions=100,
        reference_genome="GRCh37",
    ),
)

# HapMap 3.3 sites (b37), imported from VCF with `import_sites_vcf`.
hapmap = TableResource(
    path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.ht",
    import_func=import_sites_vcf,
    import_args=dict(
        path="gs://gnomad-public/resources/grch37/hapmap/hapmap_3.3.b37.vcf.bgz",
        force_bgz=True,
        min_partitions=100,
        reference_genome="GRCh37",
    ),
)

# 1000 Genomes Omni 2.5 sites (b37), imported from VCF with `import_sites_vcf`.
kgp_omni = TableResource(
    path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.ht",
    import_func=import_sites_vcf,
    import_args=dict(
        path="gs://gnomad-public/resources/grch37/kgp/1000G_omni2.5.b37.vcf.bgz",
        force_bgz=True,
        min_partitions=100,
        reference_genome="GRCh37",
    ),
)
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of the given population.

    The path is resolved by ``_ld_index_path`` with the data type fixed to "genomes".

    :param pop: Population passed through to ``_ld_index_path``
    :return: TableResource pointing at the LD index path
    """
    index_path = _ld_index_path("genomes", pop)
    return TableResource(path=index_path)
version: str = CURRENT_RELEASE) -> str: """ Provides the path to the transmitted singleton VCF used as input to VQSR :param bool adj: Whether to use adj genotypes :param version: Version of transmitted singleton VCF path to return :return: """ return f'{_annotations_root(version)}/transmitted_singletons_{"adj" if adj else "raw"}.vcf.bgz' last_END_position = VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{_annotations_root(release)}/gnomad_genomes_v{release}_last_END_positions.ht" ) for release in RELEASES }, ) freq = VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{_annotations_root(release)}/gnomad_genomes_v{release}.frequencies.ht" ) for release in RELEASES }, )
:param split: Whether to return the split or multi-allelic version of the resource :return: gnomAD v3 info TableResource """ path = '{}/gnomad_genomes_v3_info{}.ht'.format(ANNOTATIONS_ROOT, '.split' if split else '') return TableResource(path) def get_filters(model_id: str, split: bool = True) -> TableResource: """ Gets the specified filtering annotation resource. :param model_id: Filtering model id :param split: Split or multi-allelic version of the filtering file :return: Filtering annotation file """ path = '{}/{}_filtering{}.ht'.format(ANNOTATIONS_ROOT, model_id, '.split' if split else '') return TableResource(path) last_END_position = TableResource( f'{ANNOTATIONS_ROOT}/gnomad_genomes_v3_last_END_positions.ht') freq = TableResource(f'{ANNOTATIONS_ROOT}/gnomad_genomes_v3.frequencies.ht') qual_hist = TableResource( f'{ANNOTATIONS_ROOT}/gnomad_genomes_v3.qual_hists.ht') vep = TableResource(f'{ANNOTATIONS_ROOT}/gnomad_genomes_v3_vep.ht') info_vcf_path = f'{ANNOTATIONS_ROOT}/gnomad_genomes_v3_info.vcf.bgz' qc_ac = TableResource(f'{ANNOTATIONS_ROOT}/gnomad_genomes_qc_ac.ht') fam_stats = TableResource(f'{ANNOTATIONS_ROOT}/gnomad_genomes_qc_fam_stats.ht')
def ld_index(pop: str) -> TableResource:
    """
    Get the resource for the LD indices of the given population.

    The path is resolved by ``_ld_index_path`` with the data type fixed to 'genomes'.

    :param pop: Population passed through to ``_ld_index_path``
    :return: TableResource pointing at the LD index path
    """
    index_path = _ld_index_path('genomes', pop)
    return TableResource(path=index_path)
def vep(data_type) -> TableResource:
    """
    Get the resource for the 'vep' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'vep' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'vep')
    return TableResource(path=ht_path)
def omes_by_platform_concordance(data_type) -> TableResource:
    """
    Get the resource for the 'omes_by_platform_concordance' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'omes_by_platform_concordance' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'omes_by_platform_concordance')
    return TableResource(path=ht_path)
}, ) def get_rf_result(model_id: Optional[str] = None) -> VersionedTableResource: """ Get the results of RF filtering for a given run :param model_id: RF run to load :return: VersionedTableResource for RF filtered data """ return VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{get_variant_qc_root(release)}/rf/models/{model_id}/rf_result.ht" ) for release in RELEASES }, ) final_filter = VersionedTableResource( CURRENT_RELEASE, { release: TableResource(f"{get_variant_qc_root(release)}/final_filter.ht") for release in RELEASES }, )
def syndip_concordance(data_type) -> TableResource:
    """
    Get the resource for the 'syndip_concordance' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'syndip_concordance' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'syndip_concordance')
    return TableResource(path=ht_path)
def NA12878_concordance(data_type) -> TableResource:
    """
    Get the resource for the 'NA12878_concordance' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'NA12878_concordance' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'NA12878_concordance')
    return TableResource(path=ht_path)
def rf(data_type) -> TableResource:
    """
    Get the resource for the 'rf' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'rf' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'rf')
    return TableResource(path=ht_path)
def frequencies(data_type) -> TableResource:
    """
    Get the resource for the 'frequencies' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'frequencies' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'frequencies')
    return TableResource(path=ht_path)
def family_stats(data_type) -> TableResource:
    """
    Get the resource for the 'family_stats' annotation Table of a data type.

    :param data_type: Data type forwarded to ``_annotations_ht_path``
        (presumably "exomes" or "genomes" -- confirm against that helper)
    :return: TableResource pointing at the 'family_stats' annotations path
    """
    ht_path = _annotations_ht_path(data_type, 'family_stats')
    return TableResource(path=ht_path)
def get_score_quantile_bins(model_id: str, aggregated: bool) -> TableResource:
    """
    Get the resource for the score bin Table of a filtering model.

    The Table lives directly under ``tmp_dir`` and is named
    ``<model_id>.binned.ht`` when ``aggregated`` is truthy, otherwise
    ``<model_id>.rank.ht``.

    :param model_id: Filtering model id used as the file name stem
    :param aggregated: Whether to point at the 'binned' Table rather than the 'rank' Table
    :return: TableResource for the selected Table
    """
    if aggregated:
        kind = 'binned'
    else:
        kind = 'rank'
    return TableResource(f'{tmp_dir}/{model_id}.{kind}.ht')
""" relatedness table annotated with get_relationship_expr. :return: Annotated relatedness table """ relatedness_ht = relatedness.ht() return relatedness_ht.annotate(relationship=get_relationship_expr( kin_expr=relatedness_ht.kin, ibd0_expr=relatedness_ht.ibd0, ibd1_expr=relatedness_ht.ibd1, ibd2_expr=relatedness_ht.ibd2, )) # QC Sites (gnomAD v2 QC sites, lifted over) gnomad_v2_qc_sites = TableResource( "gs://gnomad-public/resources/grch38/gnomad_v2_qc_sites_b38.ht") # Dense MT of samples at QC sites qc = VersionedMatrixTableResource( CURRENT_RELEASE, { release: MatrixTableResource( f"gs://gnomad/sample_qc/mt/genomes_v{release}/gnomad_v{release}_qc_mt_v2_sites_dense.mt" ) for release in RELEASES }) # PC relate PCA scores pc_relate_pca_scores = VersionedTableResource( CURRENT_RELEASE, { release: TableResource( f"{get_sample_qc_root(release)}/gnomad_v{release}_qc_mt_v2_sites_pc_scores.ht"
def ld_scores(pop: str) -> TableResource:
    """
    Get the resource for the LD scores of the given population.

    The path is resolved by ``_ld_scores_path`` with the data type fixed to 'genomes'.

    :param pop: Population passed through to ``_ld_scores_path``
    :return: TableResource pointing at the LD scores path
    """
    scores_path = _ld_scores_path('genomes', pop)
    return TableResource(path=scores_path)
import hail as hl

from gnomad.resources.resource_utils import (
    PedigreeResource,
    TableResource,
    VersionedPedigreeResource,
)

# Samples metadata
META_ROOT = "gs://gnomad/metadata/genomes_v3"

# Sample metadata as a Hail Table, plus the path of its gzipped TSV counterpart.
meta = TableResource(f"{META_ROOT}/gnomad_v3_metadata_2019-09-27.ht")
meta_tsv_path = f"{META_ROOT}/gnomad_v3_metadata_2019-09-27.tsv.gz"

# Project metadata: no Table path is given here, only the import function
# (`hl.import_table`) and the arguments it is to be called with.
project_meta = TableResource(
    import_func=hl.import_table,
    import_args=dict(
        path=f"{META_ROOT}/09-09-2019_v3_project_meta.txt",
        impute=True,
        key="s",
        min_partitions=100,
    ),
)

# Pedigree files, available as 'raw' and 'final' versions ('final' is the default).
pedigree = VersionedPedigreeResource(
    "final",  # TODO: Make sure "final" is the best label once the family scripts are in
    dict(
        raw=PedigreeResource(f"{META_ROOT}/gnomad_v3_raw.fam", delimiter="\t"),
        final=PedigreeResource(f"{META_ROOT}/gnomad_v3.fam", delimiter="\t"),
    ),
)

# Trio files, available as 'raw' and 'final' versions ('final' is the default).
# TODO: Should this be merged with Pedigree into a single resource?
trios = VersionedPedigreeResource(
    "final",  # TODO: Make sure "final" is the best label once the family scripts are in
    dict(
        raw=PedigreeResource(f"{META_ROOT}/gnomad_v3_trios_raw.fam"),
        final=PedigreeResource(f"{META_ROOT}/gnomad_v3_trios.fam"),
    ),
)