Esempio n. 1
0
def get_callset_truth_data(
        truth_sample: str,
        mt: bool = True) -> Union[MatrixTableResource, TableResource]:
    """
    Build the versioned resource for a truth sample that was subset from the full callset.

    When `mt` is truthy the result points at the truth sample MatrixTable
    (``<variant QC root>/truth_samples/<truth_sample>.mt``); otherwise it points at the
    merged truth sample Table (``.ht``) that combines the truth data with the callset data.
    One entry is registered per release in `RELEASES`, and `CURRENT_RELEASE` is the
    default version.

    :param truth_sample: Name of the truth sample (used as the file stem)
    :param mt: Whether to return the MatrixTable resource (default) rather than the Table resource
    :return: Versioned MatrixTable resource if `mt`, otherwise versioned Table resource
    """
    # Guard clause: handle the Table (merged truth + callset) case first.
    if not mt:
        merged_tables = {
            release: TableResource(
                f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.ht"
            )
            for release in RELEASES
        }
        return VersionedTableResource(CURRENT_RELEASE, merged_tables)

    subset_mts = {
        release: MatrixTableResource(
            f"{get_variant_qc_root(release)}/truth_samples/{truth_sample}.mt"
        )
        for release in RELEASES
    }
    return VersionedMatrixTableResource(CURRENT_RELEASE, subset_mts)
Esempio n. 2
0
def hgdp_1kg_subset(dense: bool = False) -> VersionedMatrixTableResource:
    """
    Build the versioned resource for the HGDP + 1KG subset release MatrixTable.

    Every release in `RELEASES` except "3" gets an entry; `CURRENT_RELEASE` is the
    default version.

    :param dense: If truthy, point at the dense MT; otherwise point at the sparse MT
    :return: Versioned MatrixTable resource for the requested subset flavour
    """
    # The dense/sparse choice only affects the file-name suffix, so resolve it once.
    density_suffix = "_dense" if dense else "_sparse"

    subset_by_release = {
        release: MatrixTableResource(
            f"gs://gnomad/release/{release}/mt/gnomad.genomes.v{release}.hgdp_1kg_subset{density_suffix}.mt"
        )
        for release in RELEASES
        if release != "3"
    }
    return VersionedMatrixTableResource(CURRENT_RELEASE, subset_by_release)
Esempio n. 3
0
# Exomes: no release is registered here, hence the empty string and empty list.
EXOME_RELEASES = []
CURRENT_EXOME_RELEASE = ""

# Genomes.
GENOME_RELEASES = ["3.0"]
CURRENT_GENOME_RELEASE = "3.0"

# Genome coverage: every genome release plus the additional "3.0.1" release.
GENOME_COVERAGE_RELEASES = [*GENOME_RELEASES, "3.0.1"]
CURRENT_GENOME_COVERAGE_RELEASE = "3.0.1"

# Data types listed for this module.
DATA_TYPES = ["genomes"]

# Population codes for genome data (meaning taken from the constant's name).
GENOME_POPS = [
    "AFR",
    "AMI",
    "AMR",
    "ASJ",
    "EAS",
    "FIN",
    "NFE",
    "SAS",
    "OTH",
]

# Versioned MatrixTable resource stored under the truth-sets bucket for syndip;
# "3.0" is both the only registered version and the default.
gnomad_syndip = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_syndip.b38.mt"
        )
    },
)

# Versioned MatrixTable resource stored under the truth-sets bucket for NA12878;
# "3.0" is both the only registered version and the default.
na12878 = VersionedMatrixTableResource(
    default_version="3.0",
    versions={
        "3.0": MatrixTableResource(
            path="gs://gnomad-public/truth-sets/hail-0.2/gnomad_v3_na12878.mt"
        )
    },
)

Esempio n. 4
0
        GnomadPublicTableResource(
            path=
            "gs://gnomad-public-requester-pays/resources/context/grch38_context_vep_annotated.v101.ht",
        ),
    },
)

# Versioned MatrixTable resource for syndip (GRCh38); "20180222" is the only
# registered version and the default.
# NOTE(review): import_func/import_args name the VCF importer and its arguments,
# presumably used to (re)generate the MT at `path` -- confirm against the resource class.
syndip = VersionedMatrixTableResource(
    default_version="20180222",
    versions={
        "20180222": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch38/syndip/syndip.b38_20180222.mt",
            import_func=hl.import_vcf,
            import_args={
                "path": "gs://gnomad-public-requester-pays/resources/grch38/syndip/full.38.20180222.vcf.gz",
                "force_bgz": True,
                "min_partitions": 100,
                "reference_genome": "GRCh38",
            },
        )
    },
)

syndip_hc_intervals = VersionedTableResource(
    default_version="20180222",
    versions={
        "20180222":
        GnomadPublicTableResource(
            path=
Esempio n. 5
0
            mixed_site=(hl.len(mt.alleles) > 2)
            & hl.any(lambda a: hl.is_indel(mt.alleles[0], a), mt.alleles[1:])
            & hl.any(lambda a: hl.is_snp(mt.alleles[0], a), mt.alleles[1:]),
        )
        mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)

    return mt


# Versioned MatrixTable resource for the gnomAD v3 genotype data under gs://gnomad/raw.
# Releases "3" and "3.1" are registered; CURRENT_RELEASE selects the default.
gnomad_v3_genotypes = VersionedMatrixTableResource(
    CURRENT_RELEASE,
    {
        "3": MatrixTableResource(
            "gs://gnomad/raw/hail-0.2/mt/genomes_v3/gnomad_genomes_v3.repartitioned.mt"
        ),
        "3.1": MatrixTableResource(
            "gs://gnomad/raw/genomes/3.1/gnomad_v3.1_sparse_unsplit.repartitioned.mt"
        ),
    },
)


def qc_temp_prefix(version: str = CURRENT_RELEASE) -> str:
    """
    Returns path to temporary QC bucket.

    :param version: Version of annotation path to return
    :return: Path to bucket with temporary QC data
    """
)

# Versioned MatrixTable resource for the 1000 Genomes phase 3 genotypes (GRCh37).
# Two versions are registered, with "phase_3_split" as the default. Both versions
# list the same source VCF glob and identical import arguments; they differ only
# in the MatrixTable path (".genotypes.split.mt" vs ".genotypes.mt").
# NOTE(review): presumably the "split" version has multi-allelic sites split after
# import -- that step is not visible here, confirm where the MT is written.
kgp_phase_3 = VersionedMatrixTableResource(
    default_version="phase_3_split",
    versions={
        # Default version: MatrixTable stored at the ".split.mt" path.
        "phase_3_split": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.split.mt",
            import_func=hl.import_vcf,
            import_args={
                # Glob over the per-chromosome VCFs in the public bucket.
                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 300,
                "reference_genome": "GRCh37",
            },
        ),
        # Alternate version: same import source, MatrixTable stored at the plain ".mt" path.
        "phase_3": GnomadPublicMatrixTableResource(
            path="gs://gnomad-public-requester-pays/resources/grch37/kgp/1000Genomes_phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.mt",
            import_func=hl.import_vcf,
            import_args={
                "path": "gs://genomics-public-data/1000-genomes-phase-3/vcf-20150220/ALL.chr*.phase3_shapeit2_mvncall_integrated_v5a.20130502.genotypes.vcf",
                "force_bgz": True,
                "skip_invalid_loci": True,
                "min_partitions": 300,
                "reference_genome": "GRCh37",
            },
        ),
    },
)

kgp = VersionedTableResource(
Esempio n. 7
0
        kin_expr=relatedness_ht.kin,
        ibd0_expr=relatedness_ht.ibd0,
        ibd1_expr=relatedness_ht.ibd1,
        ibd2_expr=relatedness_ht.ibd2,
    ))


# QC sites: the gnomAD v2 QC sites, lifted over (stored under the grch38 resources path).
gnomad_v2_qc_sites = TableResource(
    "gs://gnomad-public/resources/grch38/gnomad_v2_qc_sites_b38.ht"
)

# Dense MT of samples at QC sites.
# One MatrixTable path is registered per release in RELEASES; CURRENT_RELEASE is the default.
qc = VersionedMatrixTableResource(
    CURRENT_RELEASE,
    {
        release: MatrixTableResource(
            f"gs://gnomad/sample_qc/mt/genomes_v{release}/gnomad_v{release}_qc_mt_v2_sites_dense.mt"
        )
        for release in RELEASES
    },
)

# PC relate PCA scores.
# One Table path under the sample QC root is registered per release in RELEASES;
# CURRENT_RELEASE is the default.
pc_relate_pca_scores = VersionedTableResource(
    CURRENT_RELEASE,
    {
        release: TableResource(
            f"{get_sample_qc_root(release)}/gnomad_v{release}_qc_mt_v2_sites_pc_scores.ht"
        )
        for release in RELEASES
    },
)

# PC relate results
relatedness = VersionedTableResource(