Esempio n. 1
0
def get_callset_truth_data(
        truth_sample: str,
        mt: bool = True) -> Union[MatrixTableResource, TableResource]:
    """
    Build the versioned resource for truth sample data subset from the full callset.

    One resource is registered for every release in `RELEASES`, located at
    `<variant QC root of the release>/truth_samples/<truth_sample>` with an
    `.mt` or `.ht` extension. `CURRENT_RELEASE` is passed as the first
    argument of the versioned resource.

    :param truth_sample: Name of the truth sample
    :param mt: If True (default), return the truth sample MatrixTable resource
        (subset from the callset); otherwise return the merged truth sample Table
        resource that includes both the truth data and the data from the callset
    :return: A VersionedMatrixTableResource when `mt` is truthy, otherwise a
        VersionedTableResource
    """
    if not mt:
        # Merged truth sample Tables, one per release.
        merged_tables = {}
        for release in RELEASES:
            ht_path = f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.ht"
            merged_tables[release] = TableResource(ht_path)
        return VersionedTableResource(CURRENT_RELEASE, merged_tables)

    # Truth sample MatrixTables, one per release.
    subset_mts = {}
    for release in RELEASES:
        mt_path = f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.mt"
        subset_mts[release] = MatrixTableResource(mt_path)
    return VersionedMatrixTableResource(CURRENT_RELEASE, subset_mts)
Esempio n. 2
0
def hgdp_1kg_subset(dense: bool = False) -> VersionedMatrixTableResource:
    """
    Build the versioned HGDP + 1KG subset release MatrixTable resource.

    Every release in `RELEASES` except "3" gets an entry; `CURRENT_RELEASE`
    is passed as the first argument of the versioned resource.

    :param dense: If True, point at the dense MT; if False, at the sparse MT
    :return: VersionedMatrixTableResource holding one MatrixTableResource per release
    """
    density_suffix = "_dense" if dense else "_sparse"

    subset_by_release = {}
    for release in RELEASES:
        if release == "3":
            # No subset entry is registered for release "3".
            continue
        subset_by_release[release] = MatrixTableResource(
            f"gs://gnomad/release/{release}/mt/gnomad.genomes.v{release}.hgdp_1kg_subset{density_suffix}.mt"
        )

    return VersionedMatrixTableResource(CURRENT_RELEASE, subset_by_release)
Esempio n. 3
0
# Current version for each kind of release; the exome entry is an empty string.
CURRENT_EXOME_RELEASE = ""
CURRENT_GENOME_RELEASE = "3.0"
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"

# Known versions for each kind of release.
EXOME_RELEASES = []
GENOME_RELEASES = ["3.0"]
# Coverage versions: every genome version plus "3.0.1" (a new, independent list).
GENOME_COVERAGE_RELEASES = [*GENOME_RELEASES, "3.0.1"]

DATA_TYPES = ["genomes"]

# Population labels for genomes.
GENOME_POPS = [
    "AFR",
    "AMI",
    "AMR",
    "ASJ",
    "EAS",
    "FIN",
    "NFE",
    "SAS",
    "OTH",
]

# Versioned MatrixTable resource for the syndip truth set; "3.0" is both the
# only registered version and the default.
gnomad_syndip = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt"
        ),
    },
)

# Versioned MatrixTable resource for the NA12878 truth set; "3.0" is both the
# only registered version and the default.
na12878 = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_na12878.mt"
        ),
    },
)


def _public_release_ht_path(data_type: str, version: str) -> str:
Esempio n. 4
0
# Table resource for the Purcell 5k intervals (GRCh38 resources bucket).
# `_import_purcell_5k` is registered as the import function, with the
# interval-list path supplied through `import_args`.
purcell_5k_intervals = TableResource(
    path="gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.ht",
    import_func=_import_purcell_5k,
    import_args={
        "path": "gs://gnomad-public/resources/grch38/purcell_5k_intervals/purcell5k.interval_list",
    },
)

# MatrixTable resource for the NA12878 GIAB high-confidence calls (GRCh38).
# `hl.import_vcf` is registered as the import function, with its arguments
# (source VCF path and import options) supplied through `import_args`.
na12878_giab = MatrixTableResource(
    path="gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.mt",
    import_func=hl.import_vcf,
    import_args={
        "path": "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_PGandRTGphasetransfer.vcf.gz",
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh38",
    },
)

na12878_giab_hc_intervals = TableResource(
    path=
    "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7_hc_regions.ht",
    import_func=hl.import_bed,
    import_args={
        "path":
        "gs://gnomad-public/resources/grch38/na12878/HG001_GRCh38_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-10X-SOLID_CHROM1-X_v.3.3.2_highconf_nosomaticdel_noCENorHET7.bed",
        "reference_genome": "GRCh38",
        "skip_invalid_intervals": True,
Esempio n. 5
0
            n_unsplit_alleles=hl.len(mt.alleles),
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt


# Versioned MatrixTable resource for the gnomAD v3 genotypes, keyed by release
# ("3" and "3.1"); `CURRENT_RELEASE` is passed as the first argument.
gnomad_v3_genotypes = VersionedMatrixTableResource(
    CURRENT_RELEASE,
    {
        "3": MatrixTableResource(
            "gs://gnomad/raw/hail-0.2/mt/genomes_v3/gnomad_genomes_v3.repartitioned.mt"
        ),
        "3.1": MatrixTableResource(
            "gs://gnomad/raw/genomes/3.1/gnomad_v3.1_sparse_unsplit.repartitioned.mt"
        ),
    },
)


def qc_temp_prefix(version: str = CURRENT_RELEASE) -> str:
    """
    Returns path to temporary QC bucket.

    :param version: Version of annotation path to return
    :return: Path to bucket with temporary QC data
Esempio n. 6
0
from gnomad.resources.resource_utils import (
    MatrixTableResource,
    TableResource,
    VersionedMatrixTableResource,
    VersionedTableResource,
    import_sites_vcf,
)
import hail as hl

# MatrixTable resource for the NA12878 GIAB high-confidence calls (GRCh37).
# `hl.import_vcf` is registered as the import function, with its arguments
# supplied through `import_args`. Long paths are split into adjacent string
# literals, which Python joins at compile time into the same single string.
na12878_giab = MatrixTableResource(
    path=(
        "gs://gnomad-public/resources/grch37/na12878/"
        "NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.mt"
    ),
    import_func=hl.import_vcf,
    import_args={
        "path": (
            "gs://gnomad-public/resources/grch37/na12878/"
            "NA12878_GIAB_highconf_CG-IllFB-IllGATKHC-Ion-Solid-10X_CHROM1-X_v3.3_highconf.vcf.bgz"
        ),
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)

# Table resource for the HapMap 3.3 sites (GRCh37). `import_sites_vcf` is
# registered as the import function, with its arguments supplied through
# `import_args`. Paths are split into adjacent string literals, which Python
# joins at compile time into the same single string.
hapmap = TableResource(
    path=(
        "gs://gnomad-public/resources/grch37/hapmap/"
        "hapmap_3.3.b37.ht"
    ),
    import_func=import_sites_vcf,
    import_args={
        "path": (
            "gs://gnomad-public/resources/grch37/hapmap/"
            "hapmap_3.3.b37.vcf.bgz"
        ),
        "force_bgz": True,
        "min_partitions": 100,
        "reference_genome": "GRCh37",
    },
)
Esempio n. 7
0
        kin_expr=relatedness_ht.kin,
        ibd0_expr=relatedness_ht.ibd0,
        ibd1_expr=relatedness_ht.ibd1,
        ibd2_expr=relatedness_ht.ibd2,
    ))


# QC sites Table: the gnomAD v2 QC sites, lifted over (stored under the GRCh38 resources path)
gnomad_v2_qc_sites = TableResource(
    "gs://gnomad-public/resources/grch38/gnomad_v2_qc_sites_b38.ht"
)

# Dense MatrixTable of samples at the QC sites: one entry per release in
# RELEASES, with CURRENT_RELEASE passed as the first argument.
qc = VersionedMatrixTableResource(
    CURRENT_RELEASE,
    {
        version: MatrixTableResource(
            f"gs://gnomad/sample_qc/mt/genomes_v{version}/gnomad_v{version}_qc_mt_v2_sites_dense.mt"
        )
        for version in RELEASES
    },
)

# PC-relate PCA scores Table: one entry per release in RELEASES, stored under
# that release's sample QC root, with CURRENT_RELEASE passed as the first argument.
pc_relate_pca_scores = VersionedTableResource(
    CURRENT_RELEASE,
    {
        version: TableResource(
            f"{get_sample_qc_root(version)}/gnomad_v{version}_qc_mt_v2_sites_pc_scores.ht"
        )
        for version in RELEASES
    },
)

# PC relate results
relatedness = VersionedTableResource(