Exemple #1
0
def add_exac_from_vds(hail_context, vds, genome_version, root="va.exac", top_level_fields=TOP_LEVEL_FIELDS, info_fields=INFO_FIELDS, verbose=True):
    """Annotate the variants of `vds` with fields from the ExAC sites VDS.

    Args:
        hail_context: object whose `read(path)` loads the ExAC VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which ExAC VDS path is read.
        root: annotation root handed to the expression builder.
        top_level_fields: fields passed to the expression builder with
            `other_source_root="vds"`.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        verbose: when true, print each generated expression.

    Returns:
        The result of annotating `vds` with the top-level expression and then
        with the info expression (two separate `annotate_variants_vds` calls).

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        source_path = 'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vds'
    else:
        source_path = 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.vds'

    reference_vds = hail_context.read(source_path)
    reference_vds = reference_vds.split_multi()

    # Build (and optionally print) both expressions before annotating anything.
    annotation_exprs = []
    for source_fields, source_root in ((top_level_fields, "vds"), (info_fields, "vds.info")):
        annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
            root=root,
            other_source_fields=source_fields,
            other_source_root=source_root,
        )
        if verbose:
            print(annotation_expr)
        annotation_exprs.append(annotation_expr)

    annotated_vds = vds
    for annotation_expr in annotation_exprs:
        annotated_vds = annotated_vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
    return annotated_vds
Exemple #2
0
def add_mpc_from_vds(hail_context,
                     vds,
                     genome_version,
                     root="va.mpc",
                     info_fields=MPC_INFO_FIELDS,
                     verbose=True):
    """Add MPC annotations [Samocha 2017] to the vds.

    Args:
        hail_context: object whose `read(path)` loads the MPC VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which MPC VDS path is read.
        root: annotation root handed to the expression builder.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the MPC VDS.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    known_paths = (
        ("37", 'gs://seqr-reference-data/GRCh37/MPC/fordist_constraint_official_mpc_values.vds'),
        ("38", 'gs://seqr-reference-data/GRCh38/MPC/fordist_constraint_official_mpc_values.liftover.GRCh38.vds'),
    )
    for build, candidate_path in known_paths:
        if genome_version == build:
            source_path = candidate_path
            break
    else:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    reference_vds = hail_context.read(source_path)
    reference_vds = reference_vds.split_multi()

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #3
0
def add_clinvar_from_vds(hail_context,
                         vds,
                         genome_version,
                         root="va.clinvar",
                         info_fields=CLINVAR_FIELDS,
                         verbose=True):
    """Add clinvar annotations to the vds.

    The single-allele and multi-allele ClinVar VCFs for the requested genome
    build are imported together through `hail_context.import_vcf`.

    Args:
        hail_context: object whose `import_vcf` loads the ClinVar VCFs.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which pair of VCFs is imported.
        root: annotation root handed to the expression builder.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the imported ClinVar data.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        vcf_paths = [
            'gs://seqr-reference-data/GRCh37/clinvar/clinvar_alleles.single.b37.vcf.gz',
            'gs://seqr-reference-data/GRCh37/clinvar/clinvar_alleles.multi.b37.vcf.gz',
        ]
    else:
        vcf_paths = [
            'gs://seqr-reference-data/GRCh38/clinvar/clinvar_alleles.single.b38.vcf.gz',
            'gs://seqr-reference-data/GRCh38/clinvar/clinvar_alleles.multi.b38.vcf.gz',
        ]

    reference_vds = hail_context.import_vcf(vcf_paths, force_bgz=True, min_partitions=1000)

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #4
0
def add_cadd_to_vds(hail_context, vds, genome_version, root="va.cadd", info_fields=CADD_FIELDS, subset=None, verbose=True):
    """Add CADD scores to the vds.

    Args:
        hail_context: object whose `read(path)` loads the CADD VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; interpolated into the CADD VDS path.
        root: annotation root handed to the expression builder.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        subset: optional interval string; when truthy, the CADD VDS is
            filtered to `hail.Interval.parse(subset)` before annotating.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the CADD VDS.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(annotation_expr)

    path_template = "gs://seqr-reference-data/GRCh%(genome_version)s/CADD/CADD_snvs_and_indels.vds"
    cadd_vds_path = path_template % {"genome_version": genome_version}

    logger.info("==> Reading in CADD: %s" % cadd_vds_path)
    reference_vds = hail_context.read(cadd_vds_path)

    if subset:
        # hail is imported lazily, only when an interval filter is requested.
        import hail
        interval = hail.Interval.parse(subset)
        reference_vds = reference_vds.filter_intervals(interval)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #5
0
def add_cadd_from_vds(hail_context, vds, genome_version, root="va.cadd", info_fields=CADD_FIELDS, verbose=True):
    """Add CADD scores to the vds.

    The SNV and indel CADD datasets for the requested genome build are read
    together with a single `hail_context.read` call.

    Args:
        hail_context: object whose `read(paths)` loads the CADD datasets.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which pair of paths is read.
        root: annotation root handed to the expression builder.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the CADD data.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    paths_by_build = (
        ("37", [
            'gs://seqr-reference-data/GRCh37/CADD/whole_genome_SNVs.vds',
            'gs://seqr-reference-data/GRCh37/CADD/InDels.vds',
        ]),
        ("38", [
            'gs://seqr-reference-data/GRCh38/CADD/whole_genome_SNVs.liftover.GRCh38.vds',
            'gs://seqr-reference-data/GRCh38/CADD/InDels.liftover.GRCh38.vds',
        ]),
    )
    for build, candidate_paths in paths_by_build:
        if genome_version == build:
            source_paths = candidate_paths
            break
    else:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    reference_vds = hail_context.read(source_paths)
    reference_vds = reference_vds.split_multi()

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
def add_topmed_to_vds(hail_context,
                      vds,
                      genome_version,
                      root="va.topmed",
                      fields=TOPMED_FIELDS,
                      subset=None,
                      verbose=True):
    """Add TOPMed annotations to the vds.

    Only the GRCh38 dataset is wired up; requesting "37" raises.

    Args:
        hail_context: object whose `read(path)` loads the TOPMed VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: must be "38" ("37" is rejected as not yet available).
        root: annotation root handed to the expression builder.
        fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        subset: optional interval string; when truthy, the TOPMed VDS is
            filtered to `hail.Interval.parse(subset)` before annotating.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the TOPMed VDS.

    Raises:
        ValueError: if `genome_version` is "37" (not yet available) or any
            value other than "38".
    """
    if genome_version == "37":
        raise ValueError("Not yet available")
    if not genome_version == "38":
        raise ValueError("Invalid genome_version: " + str(genome_version))

    source_path = 'gs://seqr-reference-data/GRCh38/TopMed/ALL.TOPMed_freeze5_hg38_dbSNP.vds'

    reference_vds = hail_context.read(source_path)
    reference_vds = reference_vds.split_multi()

    if subset:
        # hail is imported lazily, only when an interval filter is requested.
        import hail
        interval = hail.Interval.parse(subset)
        reference_vds = reference_vds.filter_intervals(interval)

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #7
0
def add_1kg_phase3_from_vds(hail_context,
                            vds,
                            genome_version,
                            root="va.g1k",
                            fields=G1K_FIELDS,
                            verbose=True):
    """Add 1000 Genomes phase 3 annotations to the vds.

    Args:
        hail_context: object whose `read(path)` loads the 1000 Genomes VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which sites VDS is read.
        root: annotation root handed to the expression builder.
        fields: fields passed to the expression builder with
            `other_source_root="vds.g1k"`.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the 1000 Genomes VDS.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        source_path = 'gs://seqr-reference-data/GRCh37/1kg/1kg.wgs.phase3.20130502.GRCh37_sites.vds'
    else:
        source_path = 'gs://seqr-reference-data/GRCh38/1kg/1kg.wgs.phase3.20170504.GRCh38_sites.vds'

    reference_vds = hail_context.read(source_path)
    reference_vds = reference_vds.split_multi()

    # NOTE(review): sibling helpers read from "vds.info"; this one uses
    # "vds.g1k" - confirm that matches the schema of the 1kg sites VDS.
    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=fields,
        other_source_root="vds.g1k",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #8
0
def add_dbnsfp_to_vds(hail_context,
                      vds,
                      genome_version,
                      root="va.dbnsfp",
                      subset=None,
                      verbose=True):
    """Add dbNSFP fields to the VDS.

    The dbNSFP dataset path and the field schema both depend on the genome
    build: "37" uses the v2.9.3 path with DBNSFP_SCHEMA_37, "38" uses the
    v3.5 path with DBNSFP_SCHEMA_38.

    Args:
        hail_context: object whose `read(path)` loads the dbNSFP VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38".
        root: annotation root handed to the expression builder.
        subset: optional interval string; when truthy, the dbNSFP VDS is
            filtered to `hail.Interval.parse(subset)` before annotating.
        verbose: when true, print the generated expression.

    Returns:
        The result of `vds.annotate_variants_vds` with the dbNSFP VDS.

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    # Validate first so an unsupported build fails before anything is read.
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: " + str(genome_version))

    if genome_version == "37":
        source_path = "gs://seqr-reference-data/GRCh37/dbNSFP/v2.9.3/dbNSFP2.9.3_variant.vds"
        source_schema = DBNSFP_SCHEMA_37
    else:
        source_path = "gs://seqr-reference-data/GRCh38/dbNSFP/v3.5/dbNSFP3.5a_variant.vds"
        source_schema = DBNSFP_SCHEMA_38

    reference_vds = hail_context.read(source_path)

    if subset:
        # hail is imported lazily, only when an interval filter is requested.
        import hail
        interval = hail.Interval.parse(subset)
        reference_vds = reference_vds.filter_intervals(interval)

    annotation_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=source_schema,
        other_source_root="vds",
    )

    if verbose:
        print(annotation_expr)

    return vds.annotate_variants_vds(reference_vds, expr=annotation_expr)
Exemple #9
0
def add_gnomad_to_vds(hail_context, vds, genome_version, exomes_or_genomes, root=None, top_level_fields=TOP_LEVEL_FIELDS, info_fields=INFO_FIELDS, subset=None, verbose=True):
    """Annotate the variants of `vds` with gnomAD exomes or genomes fields.

    Args:
        hail_context: object whose `read(path)` loads the gnomAD VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38".
        exomes_or_genomes: "exomes" or "genomes"; together with
            `genome_version` it forms the key looked up in GNOMAD_VDS_PATHS.
        root: annotation root; defaults to "va.gnomad_<exomes_or_genomes>".
        top_level_fields: fields passed to the expression builder with
            `other_source_root="vds"`.
        info_fields: newline-separated fields passed to the expression builder
            with `other_source_root="vds.info"`.
        subset: optional interval string; when truthy, the gnomAD VDS is
            filtered to `hail.Interval.parse(subset)` before annotating.
        verbose: when true, print each generated expression.

    Returns:
        The result of a single `vds.annotate_variants_vds` call that applies
        the top-level and info expressions joined with ", ".

    Raises:
        ValueError: if `genome_version` or `exomes_or_genomes` is not one of
            the accepted values.
    """
    if genome_version not in ("37", "38"):
        raise ValueError("Invalid genome_version: %s. Must be '37' or '38'" % str(genome_version))

    if exomes_or_genomes not in ("exomes", "genomes"):
        # Report the argument that actually failed validation.
        raise ValueError("Invalid exomes_or_genomes: %s. Must be 'exomes' or 'genomes'" % str(exomes_or_genomes))

    if root is None:
        root = "va.gnomad_%s" % exomes_or_genomes

    gnomad_vds_path = GNOMAD_VDS_PATHS["%s_%s" % (exomes_or_genomes, genome_version)]

    gnomad_vds = hail_context.read(gnomad_vds_path).split_multi()

    if subset:
        # hail is imported lazily, only when an interval filter is requested.
        import hail
        gnomad_vds = gnomad_vds.filter_intervals(hail.Interval.parse(subset))

    if exomes_or_genomes == "genomes":
        # remove any *SAS* fields from genomes since South Asian population only defined for exomes
        info_fields = "\n".join(field for field in info_fields.split("\n") if "SAS" not in field)

    top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_fields_expr)

    info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )
    if verbose:
        print(info_fields_expr)

    combined_expr = ", ".join([top_fields_expr, info_fields_expr])
    return vds.annotate_variants_vds(gnomad_vds, expr=combined_expr)
Exemple #10
0
def compute_minimal_schema(vds, analysis_type):
    """Reduce the variant annotations of `vds` to a minimal set of fields.

    A `va.docId` annotation is added first; then an expression rooted at
    `va.clean` is built for the fields selected by `analysis_type`, applied,
    and finally `va` is replaced by `va.clean`.

    Args:
        vds: dataset providing `annotate_variants_expr`.
        analysis_type: "GATK_VARIANTS", "MANTA_SVS" or "JULIA_SVS".

    Returns:
        The dataset returned by the final `annotate_variants_expr` call.

    Raises:
        ValueError: if `analysis_type` is not one of the accepted values
            (raised after the docId annotation has been requested).
    """
    # Computed annotation: the document id for each variant.
    doc_id_expr = "va.docId = %s" % get_expr_for_variant_id(512)
    vds = vds.annotate_variants_expr([doc_id_expr])

    # Choose the top-level fields to keep; no info fields are kept either way.
    if analysis_type == "GATK_VARIANTS":
        kept_top_level_fields = """
            docId: String,
            wasSplit: Boolean,
            aIndex: Int,
        """
    elif analysis_type in ("MANTA_SVS", "JULIA_SVS"):
        kept_top_level_fields = """
            docId: String,
        """
    else:
        raise ValueError("Unexpected analysis_type: %s" % analysis_type)

    clean_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root="va.clean",
        top_level_fields=kept_top_level_fields,
        info_fields="",
    )
    vds = vds.annotate_variants_expr(expr=clean_expr)
    return vds.annotate_variants_expr("va = va.clean")
Exemple #11
0
             AN: Int,
             --- BaseQRankSum: Double,
             --- ClippingRankSum: Double,
             DP: Int,
             FS: Double,
             InbreedingCoeff: Double,
             MQ: Double,
             --- MQRankSum: Double,
             QD: Double,
             --- ReadPosRankSum: Double,
             VQSLOD: Double,
             culprit: String,
        """
}

# Build the annotate-variants expression from INPUT_SCHEMA, rooted at va.clean.
expr = convert_vds_schema_string_to_annotate_variants_expr(root="va.clean", **INPUT_SCHEMA)

# Apply the expression, then replace the variant annotations (va) with va.clean.
vds = vds.annotate_variants_expr(expr=expr)
vds = vds.annotate_variants_expr("va = va.clean")


# add reference data
CLINVAR_INFO_FIELDS = """
    MEASURESET_TYPE: String,
    MEASURESET_ID: String,
    RCV: String,
    ALLELE_ID: String,
    CLINICAL_SIGNIFICANCE: String,
    PATHOGENIC: String,
    BENIGN: String,
    CONFLICTED: String,
def add_exac_to_vds(hail_context,
                    vds,
                    genome_version,
                    root="va.exac",
                    top_level_fields=TOP_LEVEL_FIELDS,
                    info_fields=INFO_FIELDS,
                    subset=None,
                    verbose=True):
    """Annotate the variants of `vds` with ExAC fields, including computed AFs.

    Args:
        hail_context: object whose `read(path)` loads the ExAC VDS.
        vds: dataset to annotate; must provide `annotate_variants_vds`.
        genome_version: "37" or "38"; selects which ExAC VDS path is read.
        root: annotation root handed to the expression builder.
        top_level_fields: fields passed to the expression builder with
            `other_source_root="vds"`.
        info_fields: fields passed to the expression builder with
            `other_source_root="vds.info"`.
        subset: optional interval string; when truthy, the ExAC VDS is
            filtered to `hail.Interval.parse(subset)` before annotating.
        verbose: when true, print the generated expressions and the resulting
            variant schema.

    Returns:
        The result of a single `vds.annotate_variants_vds` call that applies
        the top-level and info expressions joined with ", ".

    Raises:
        ValueError: if `genome_version` is neither "37" nor "38".
    """
    if genome_version == "37":
        exac_vds_path = 'gs://seqr-reference-data/GRCh37/gnomad/ExAC.r1.sites.vds'
    elif genome_version == "38":
        exac_vds_path = 'gs://seqr-reference-data/GRCh38/gnomad/ExAC.r1.sites.liftover.b38.vds'
    else:
        raise ValueError("Invalid genome_version: " + str(genome_version))

    exac_vds = hail_context.read(exac_vds_path).split_multi()

    if subset:
        # hail is imported lazily, only when an interval filter is requested.
        import hail
        exac_vds = exac_vds.filter_intervals(hail.Interval.parse(subset))

    # ExAC VCF doesn't contain AF fields, so compute them here.
    # Each AF is AC / AN for the allele at va.aIndex-1, or NA when AN is not > 0.
    # The POPMAX values are compared against the string "NA" and converted with
    # toInt() before dividing.
    exac_vds = exac_vds.annotate_variants_expr("""
          va.info.AF_AFR = if(va.info.AN_AFR > 0) va.info.AC_AFR[va.aIndex-1] / va.info.AN_AFR else NA:Float,
          va.info.AF_AMR = if(va.info.AN_AMR > 0) va.info.AC_AMR[va.aIndex-1] / va.info.AN_AMR else NA:Float,
          va.info.AF_EAS = if(va.info.AN_EAS > 0) va.info.AC_EAS[va.aIndex-1] / va.info.AN_EAS else NA:Float,
          va.info.AF_FIN = if(va.info.AN_FIN > 0) va.info.AC_FIN[va.aIndex-1] / va.info.AN_FIN else NA:Float,
          va.info.AF_NFE = if(va.info.AN_NFE > 0) va.info.AC_NFE[va.aIndex-1] / va.info.AN_NFE else NA:Float,
          va.info.AF_OTH = if(va.info.AN_OTH > 0) va.info.AC_OTH[va.aIndex-1] / va.info.AN_OTH else NA:Float,
          va.info.AF_SAS = if(va.info.AN_SAS > 0) va.info.AC_SAS[va.aIndex-1] / va.info.AN_SAS else NA:Float,
          va.info.AF_POPMAX = if(va.info.AN_POPMAX[va.aIndex-1] != "NA" && va.info.AN_POPMAX[va.aIndex-1].toInt() > 0) va.info.AC_POPMAX[va.aIndex-1].toInt() / va.info.AN_POPMAX[va.aIndex-1].toInt() else NA:Float
    """)

    top_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=top_level_fields,
        other_source_root="vds",
    )
    if verbose:
        print(top_fields_expr)

    info_fields_expr = convert_vds_schema_string_to_annotate_variants_expr(
        root=root,
        other_source_fields=info_fields,
        other_source_root="vds.info",
    )

    if verbose:
        print(info_fields_expr)

    vds = (vds.annotate_variants_vds(exac_vds,
                                     expr=", ".join(
                                         [top_fields_expr, info_fields_expr])))

    if verbose:
        # Schema dump is diagnostic output, so it follows the verbose flag
        # like the expression prints above.
        from pprint import pprint
        pprint(vds.variant_schema)

    return vds