Example #1
def prepare_variant_results(results_url, annotations_url):
    variant_results = hl.read_table(results_url)

    # Get unique variants from results table
    variants = variant_results.group_by(variant_results.locus,
                                        variant_results.alleles).aggregate()

    # Select AC/AF numbers for the alternate allele
    variant_results = variant_results.annotate(
        ac_case=variant_results.ac_case[1],
        af_case=variant_results.af_case[1],
        ac_ctrl=variant_results.ac_ctrl[1],
        af_ctrl=variant_results.af_ctrl[1],
    )

    # Rename analysis groups to be Elasticsearch-friendly
    variant_results = variant_results.annotate(
        analysis_group=GROUP_NAMES[variant_results.analysis_group])

    # Annotate variants with a struct for each analysis group
    variants = variants.annotate(groups=hl.struct())
    analysis_groups = variant_results.aggregate(
        hl.agg.collect_as_set(variant_results.analysis_group))
    for group in analysis_groups:
        group_results = variant_results.filter(
            variant_results.analysis_group == group).drop(
                "analysis_group", "variant_id")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.locus, variants.alleles]}))

    # Merge variant annotations for canonical transcripts
    variant_annotations = hl.read_table(annotations_url)
    variant_annotations = variant_annotations.drop("variant_id")
    variant_annotations = variant_annotations.filter(
        variant_annotations.transcript_id ==
        variant_annotations.canonical_transcript_id)
    variants = variants.annotate(**variant_annotations[variants.locus,
                                                       variants.alleles])

    variants = variants.annotate(
        chrom=variants.locus.contig[3:],
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
    )
    variants = variants.annotate(
        variant_id=variants.chrom + "-" + hl.str(variants.pos) + "-" +
        variants.alleles[0] + "-" + variants.alleles[1])

    return variants
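
A minimal driver sketch for the function above, assuming GROUP_NAMES and the x_position helper are defined in the same module. The bucket paths and the group-name mapping are placeholders, not part of the original pipeline.

import hail as hl

hl.init(log="/tmp/hail.log")

# Hypothetical mapping from raw analysis group names to Elasticsearch-friendly ones
GROUP_NAMES = hl.dict({"EUR Case-Control": "eur_case_control", "All": "all"})

variants = prepare_variant_results(
    "gs://my-bucket/variant_results.ht",      # hypothetical path
    "gs://my-bucket/variant_annotations.ht",  # hypothetical path
)
variants.write("gs://my-bucket/variants.ht", overwrite=True)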
Example #2
def format_coverage_table(ds):
    ds = ds.select(
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        mean=ds.mean,
        median=ds.median,
        over1=ds.over_1,
        over5=ds.over_5,
        over10=ds.over_10,
        over15=ds.over_15,
        over20=ds.over_20,
        over25=ds.over_25,
        over30=ds.over_30,
        over50=ds.over_50,
        over100=ds.over_100,
    )

    ds = ds.key_by().drop("locus")

    return ds
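
format_coverage_table (like most examples here) leans on normalized_contig and x_position, which are defined elsewhere in the pipeline. Plausible sketches, assuming the usual gnomAD convention of encoding a locus as contig_number * 10^9 + position, with X/Y/MT mapped to 23/24/25:

import hail as hl

def normalized_contig(locus):
    # Strip any "chr" prefix so GRCh37 and GRCh38 contigs look alike
    return locus.contig.replace("^chr", "")

def x_position(locus):
    # Encode (contig, position) as a single sortable 64-bit integer
    contig = normalized_contig(locus)
    contig_number = (hl.case()
                     .when(contig == "X", 23)
                     .when(contig == "Y", 24)
                     .when(contig[0] == "M", 25)
                     .default(hl.int(contig)))
    return hl.int64(contig_number) * 1_000_000_000 + locus.position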
Example #3
def format_variants_table(ds):

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ###############
    # Frequencies #
    ###############

    g = hl.eval(ds.globals)

    freq_index_tree = get_freq_index_tree(g.freq_index_dict)

    subsets = list(freq_index_tree.keys())

    ds = ds.annotate(
        **{
            subset: hl.struct(
                # Adjusted frequencies
                AC_adj=freq_expression(ds, "AC", freq_index_tree[subset]),
                AN_adj=freq_expression(ds, "AN", freq_index_tree[subset]),
                AF_adj=freq_expression(ds, "AF", freq_index_tree[subset]),
                nhomalt_adj=freq_expression(ds, "homozygote_count", freq_index_tree[subset]),
                # Raw frequencies
                AC_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                AN_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AN,
                AF_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AF,
                nhomalt_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].homozygote_count,
                # Popmax
                popmax=ds.popmax[g.popmax_index_dict[subset]].pop,
                AC_popmax=ds.popmax[g.popmax_index_dict[subset]].AC,
                AN_popmax=ds.popmax[g.popmax_index_dict[subset]].AN,
                AF_popmax=ds.popmax[g.popmax_index_dict[subset]].AF,
                nhomalt_popmax=ds.popmax[g.popmax_index_dict[subset]].homozygote_count,
            )
            for subset in subsets
        }
    )

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_index_tree = collections.defaultdict(dict)
    for labels_combo, index in g.faf_index_dict.items():
        labels = labels_combo.split("_")
        # Subset labels contain an _, so rebuild those after splitting them
        if labels[0] == "non":
            labels = ["_".join(labels[0:2])] + labels[2:]

        if len(labels) == 2:
            [subset, pop] = labels
            faf_index_tree[subset][pop] = index
        else:
            assert len(labels) == 1
            subset = labels[0]
            faf_index_tree[subset]["total"] = index

    ds = ds.annotate(
        **{
            subset: ds[subset].annotate(
                faf95_adj=hl.struct(**{pop: ds.faf[index].faf95 for pop, index in faf_index_tree[subset].items()}),
                faf99_adj=hl.struct(**{pop: ds.faf[index].faf99 for pop, index in faf_index_tree[subset].items()}),
            )
            for subset in subsets
        }
    )

    ds = ds.drop("freq", "popmax", "faf")

    ##############
    # Histograms #
    ##############

    # Extract overall age distribution
    ds = ds.transmute(
        gnomad_age_hist_het=ds.age_hist_het[g.age_index_dict["gnomad"]],
        gnomad_age_hist_hom=ds.age_hist_hom[g.age_index_dict["gnomad"]],
    )

    # Convert lists of numbers in histograms into pipe-delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"), bin_edges=hl.delimit(ds[field].bin_edges, "|")
            )
            for field in [
                "ab_hist_alt",
                "dp_hist_all",
                "dp_hist_alt",
                "gq_hist_all",
                "gq_hist_alt",
                "gnomad_age_hist_het",
                "gnomad_age_hist_hom",
            ]
        }
    )

    ###########################
    # Quality metrics / flags #
    ###########################

    # Use the same fields as the VCFs
    # Based on https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
    ds = ds.transmute(
        BaseQRankSum=ds.allele_info.BaseQRankSum,
        ClippingRankSum=ds.allele_info.ClippingRankSum,
        DP=ds.allele_info.DP,
        FS=ds.info_FS,
        InbreedingCoeff=ds.info_InbreedingCoeff,
        MQ=ds.info_MQ,
        MQRankSum=ds.info_MQRankSum,
        QD=ds.info_QD,
        ReadPosRankSum=ds.info_ReadPosRankSum,
        rf_negative_label=ds.fail_hard_filters,
        rf_positive_label=ds.tp,
        rf_tp_probability=ds.rf_probability,
        SOR=ds.info_SOR,
        VQSLOD=ds.allele_info.VQSLOD,
        VQSR_culprit=ds.allele_info.culprit,
        VQSR_NEGATIVE_TRAIN_SITE=ds.info_NEGATIVE_TRAIN_SITE,
        VQSR_POSITIVE_TRAIN_SITE=ds.info_POSITIVE_TRAIN_SITE,
    )

    # These fields are left unaltered at the top level
    #
    # allele_type
    # decoy
    # has_star
    # lcr
    # n_alt_alleles
    # nonpar
    # pab_max
    # rf_label
    # rf_train
    # segdup
    # transmitted_singleton
    # variant_type
    # was_mixed

    # TODO: Remove this, leave these at top level
    ds = ds.transmute(
        allele_info=hl.struct(
            BaseQRankSum=ds.BaseQRankSum,
            ClippingRankSum=ds.ClippingRankSum,
            DP=ds.DP,
            FS=ds.FS,
            InbreedingCoeff=ds.InbreedingCoeff,
            MQ=ds.MQ,
            MQRankSum=ds.MQRankSum,
            QD=ds.QD,
            ReadPosRankSum=ds.ReadPosRankSum,
            SOR=ds.SOR,
            VQSLOD=ds.VQSLOD,
            VQSR_culprit=ds.VQSR_culprit,
            VQSR_NEGATIVE_TRAIN_SITE=ds.VQSR_NEGATIVE_TRAIN_SITE,
            VQSR_POSITIVE_TRAIN_SITE=ds.VQSR_POSITIVE_TRAIN_SITE,
        )
    )

    ###################
    # VEP annotations #
    ###################

    ds = ds.annotate(sortedTranscriptConsequences=sorted_transcript_consequences_v2(ds.vep))

    ds = ds.drop("vep")

    #########
    # Flags #
    #########

    # TODO: Leave these at the top level
    ds = ds.transmute(flags=hl.struct(lcr=ds.lcr, segdup=ds.segdup))

    # TODO: Remove this, these flags are calculated on the fly
    ds = ds.annotate(
        flags=ds.flags.annotate(
            lc_lof=get_expr_for_variant_lc_lof_flag(ds.sortedTranscriptConsequences),
            lof_flag=get_expr_for_variant_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
        sortedTranscriptConsequences=hl.bind(
            lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: ds.sortedTranscriptConsequences.map(
                lambda csq: csq.annotate(
                    flags=hl.struct(
                        lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                        lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                        lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                        lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                        nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                    )
                )
            ),
            get_expr_for_genes_with_lc_lof_flag(ds.sortedTranscriptConsequences),
            get_expr_for_genes_with_loftee_flag_flag(ds.sortedTranscriptConsequences),
        ),
    )

    #################
    # Unused fields #
    #################

    # These fields were not in the 2.1.1 browser Hail table

    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "info_DP",
        "mills",
        "n_nonref",
        "omni",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ########
    # Keys #
    ########

    # Drop key fields
    ds = ds.key_by().drop("locus", "alleles")

    return ds
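
This example assumes get_freq_index_tree and freq_expression, which are defined elsewhere. Hedged sketches, assuming the gnomAD 2.1 labeling scheme for freq_index_dict keys ("<subset>", "<subset>_<pop>", "<subset>_raw"); the same underscore-aware rebuild used for faf_index_tree above applies:

import collections

import hail as hl

def get_freq_index_tree(freq_index_dict):
    # Build {subset: {population or "total": index}} from the flat label -> index dict
    tree = collections.defaultdict(dict)
    for label, index in freq_index_dict.items():
        parts = label.split("_")
        # Subset names like "non_neuro" contain an underscore; rebuild them
        if parts[0] == "non":
            parts = ["_".join(parts[:2])] + parts[2:]
        if parts[-1] == "raw":
            continue  # raw frequencies are read directly via freq_index_dict
        subset, pop = parts[0], "_".join(parts[1:])
        tree[subset][pop or "total"] = index
    return tree

def freq_expression(ds, field, subset_indices):
    # One value of `field` per population (plus "total") within a subset
    return hl.struct(**{pop: ds.freq[index][field] for pop, index in subset_indices.items()})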
Example #4
def import_mnv_file(path, **kwargs):
    column_types = {
        "AC_mnv_ex": hl.tint,
        "AC_mnv_gen": hl.tint,
        "AC_mnv": hl.tint,
        "AC_snp1_ex": hl.tint,
        "AC_snp1_gen": hl.tint,
        "AC_snp1": hl.tint,
        "AC_snp2_ex": hl.tint,
        "AC_snp2_gen": hl.tint,
        "AC_snp2": hl.tint,
        "AN_snp1_ex": hl.tfloat,
        "AN_snp1_gen": hl.tfloat,
        "AN_snp2_ex": hl.tfloat,
        "AN_snp2_gen": hl.tfloat,
        "categ": hl.tstr,
        "filter_snp1_ex": hl.tarray(hl.tstr),
        "filter_snp1_gen": hl.tarray(hl.tstr),
        "filter_snp2_ex": hl.tarray(hl.tstr),
        "filter_snp2_gen": hl.tarray(hl.tstr),
        "gene_id": hl.tstr,
        "gene_name": hl.tstr,
        "locus.contig": hl.tstr,
        "locus.position": hl.tint,
        "mnv_amino_acids": hl.tstr,
        "mnv_codons": hl.tstr,
        "mnv_consequence": hl.tstr,
        "mnv_lof": hl.tstr,
        "mnv": hl.tstr,
        "n_homhom_ex": hl.tint,
        "n_homhom_gen": hl.tint,
        "n_homhom": hl.tint,
        "n_indv_ex": hl.tint,
        "n_indv_gen": hl.tint,
        "n_indv": hl.tint,
        "snp1_amino_acids": hl.tstr,
        "snp1_codons": hl.tstr,
        "snp1_consequence": hl.tstr,
        "snp1_lof": hl.tstr,
        "snp1": hl.tstr,
        "snp2_amino_acids": hl.tstr,
        "snp2_codons": hl.tstr,
        "snp2_consequence": hl.tstr,
        "snp2_lof": hl.tstr,
        "snp2": hl.tstr,
        "transcript_id": hl.tstr,
    }

    ds = hl.import_table(path,
                         key="mnv",
                         missing="",
                         types=column_types,
                         **kwargs)

    ds = ds.transmute(locus=hl.locus(ds["locus.contig"], ds["locus.position"]))

    ds = ds.transmute(
        contig=normalized_contig(ds.locus),
        pos=ds.locus.position,
        xpos=x_position(ds.locus),
    )

    ds = ds.annotate(ref=ds.mnv.split("-")[2],
                     alt=ds.mnv.split("-")[3],
                     variant_id=ds.mnv)

    ds = ds.annotate(snp1_copy=ds.snp1, snp2_copy=ds.snp2)
    ds = ds.transmute(constituent_snvs=[
        hl.bind(
            lambda variant_id_parts: hl.struct(
                variant_id=ds[f"{snp}_copy"],
                chrom=variant_id_parts[0],
                pos=hl.int(variant_id_parts[1]),
                ref=variant_id_parts[2],
                alt=variant_id_parts[3],
                exome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_ex"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_ex"],
                        ac=ds[f"AC_{snp}_ex"],
                        an=hl.int(ds[f"AN_{snp}_ex"]),
                    ),
                ),
                genome=hl.or_missing(
                    hl.is_defined(ds[f"AN_{snp}_gen"]),
                    hl.struct(
                        filters=ds[f"filter_{snp}_gen"],
                        ac=ds[f"AC_{snp}_gen"],
                        an=hl.int(ds[f"AN_{snp}_gen"]),
                    ),
                ),
            ),
            ds[f"{snp}_copy"].split("-"),
        ) for snp in ["snp1", "snp2"]
    ])

    ds = ds.annotate(constituent_snv_ids=[ds.snp1, ds.snp2])

    ds = ds.annotate(
        mnv_in_exome=ds.constituent_snvs.all(lambda s: hl.is_defined(s.exome)),
        mnv_in_genome=ds.constituent_snvs.all(
            lambda s: hl.is_defined(s.genome)),
    )

    ds = ds.transmute(
        n_individuals=ds.n_indv,
        ac=ds.AC_mnv,
        ac_hom=ds.n_homhom,
        exome=hl.or_missing(
            ds.mnv_in_exome,
            hl.struct(n_individuals=ds.n_indv_ex,
                      ac=ds.AC_mnv_ex,
                      ac_hom=ds.n_homhom_ex),
        ),
        genome=hl.or_missing(
            ds.mnv_in_genome,
            hl.struct(n_individuals=ds.n_indv_gen,
                      ac=ds.AC_mnv_gen,
                      ac_hom=ds.n_homhom_gen),
        ),
    )

    ds = ds.drop("AC_snp1", "AC_snp2")

    ds = ds.transmute(consequence=hl.struct(
        category=ds.categ,
        gene_id=ds.gene_id,
        gene_name=ds.gene_name,
        transcript_id=ds.transcript_id,
        consequence=ds.mnv_consequence,
        codons=ds.mnv_codons,
        amino_acids=ds.mnv_amino_acids,
        lof=ds.mnv_lof,
        snv_consequences=[
            hl.struct(
                variant_id=ds[f"{snp}"],
                amino_acids=ds[f"{snp}_amino_acids"],
                codons=ds[f"{snp}_codons"],
                consequence=ds[f"{snp}_consequence"],
                lof=ds[f"{snp}_lof"],
            ) for snp in ["snp1", "snp2"]
        ],
    ))

    # Collapse table to one row per MNV, with all consequences for the MNV collected into an array
    consequences = ds.group_by(
        ds.mnv).aggregate(consequences=hl.agg.collect(ds.consequence))
    ds = ds.drop("consequence")
    ds = ds.distinct()
    ds = ds.join(consequences)

    # Sort consequences by severity
    ds = ds.annotate(consequences=hl.sorted(
        ds.consequences,
        key=lambda c: consequence_term_rank(c.consequence),
    ))

    ds = ds.annotate(changes_amino_acids_for_snvs=hl.literal([0, 1]).filter(
        lambda idx: ds.consequences.any(lambda csq: csq.snv_consequences[
            idx].amino_acids.lower() != csq.amino_acids.lower())).map(
                lambda idx: ds.constituent_snv_ids[idx]))

    return ds
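
import_mnv_file sorts consequences with consequence_term_rank, defined elsewhere in the pipeline. A minimal sketch, assuming the usual approach of ranking VEP consequence terms by a severity-ordered list (abbreviated here):

import hail as hl

# VEP consequence terms in decreasing order of severity (abbreviated)
CONSEQUENCE_TERMS = [
    "transcript_ablation",
    "splice_acceptor_variant",
    "splice_donor_variant",
    "stop_gained",
    "frameshift_variant",
    # ... remaining terms, down to ...
    "intergenic_variant",
]

CONSEQUENCE_TERM_RANK = hl.dict({term: rank for rank, term in enumerate(CONSEQUENCE_TERMS)})

def consequence_term_rank(consequence_term):
    # Lower rank means more severe; missing for unrecognized terms
    return CONSEQUENCE_TERM_RANK.get(consequence_term)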
Example #5
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--results", required=True)
    parser.add_argument("--annotations", required=True)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    variants = hl.read_table(args.annotations)
    variants = variants.annotate(
        variant_id=variant_id(variants.locus, variants.alleles),
        chrom=variants.locus.contig,
        pos=variants.locus.position,
        xpos=x_position(variants.locus),
        alt=variants.alleles[1],
        ref=variants.alleles[0],
    )

    variants = variants.transmute(
        transcript_id=hl.delimit(variants.transcript_id, ","),
        hgvsc=hl.delimit(
            variants.hgvsc.keys().map(lambda k: k + ":" + variants.hgvsc[k]),
            ","),
        hgvsp=hl.delimit(
            variants.hgvsp.keys().map(lambda k: k + ":" + variants.hgvsp[k]),
            ","),
    )

    variants = variants.annotate(
        csq_canonical=hl.case()
        .when((variants.csq_canonical == "mis") & (variants.mpc >= 3), "mis3")
        .when((variants.csq_canonical == "mis") & (variants.mpc >= 2), "mis2")
        .default(variants.csq_canonical))

    variants = variants.annotate(flags="PASS")
    variants = variants.drop("v")

    results = hl.read_table(args.results)
    results = results.annotate(
        analysis_group=results.analysis_group.lower().replace(
            "[^a-z0-9]+", "_").replace("_+$", ""))
    results = results.drop("v")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))
    results = results.annotate(
        af_case=hl.cond(results.an_case == 0, 0, results.ac_case /
                        results.an_case))

    variants = variants.filter(hl.is_defined(results[variants.key]))

    analysis_groups = results.aggregate(
        hl.agg.collect_as_set(results.analysis_group))

    variants = variants.annotate(groups=hl.struct())
    for group in analysis_groups:
        group_results = results.filter(
            results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.key]}))

    # The latest (2019/04/15) SCHEMA dataset moved the source and in_analysis fields from the variant level to the group level.
    # in_analysis is the same for all groups within a variant, but source is not.
    variants = variants.annotate(in_analysis=variants.groups.meta.in_analysis,
                                 source=variants.groups.meta.source)

    variants.write(args.output)
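
The analysis-group normalization above relies on Hail's regex-based StringExpression.replace. A quick illustrative check of the idea (the group name is made up):

import hail as hl

expr = hl.literal("Epileptic Encephalopathy (EE)")
normalized = expr.lower().replace("[^a-z0-9]+", "_").replace("_+$", "")
assert hl.eval(normalized) == "epileptic_encephalopathy_ee"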
Example #6
def format_clinvar_variants(ds):
    # There are some variants with only one entry in alleles; ignore them for now.
    # TODO: These could be displayed in the ClinVar track even though they will never match a gnomAD variant.
    ds = ds.filter(hl.len(ds.alleles) == 2)

    # When a cluster is started with hailctl dataproc start cluster_name --vep, the init script for the
    # selected version of VEP links the appropriate configuration file to /vep_data/vep-gcloud.json
    ds = hl.vep(ds, "file:///vep_data/vep-gcloud.json", name="vep", block_size=1000)
    ds = ds.annotate(sorted_transcript_consequences=sorted_transcript_consequences_v3(ds.vep))
    ds = ds.drop("vep")

    ds = ds.select(
        clinical_significance=hl.sorted(ds.info.CLNSIG, key=lambda s: s.replace("^_", "z")).map(
            lambda s: s.replace("^_", "")
        ),
        clinvar_variation_id=ds.rsid,
        gold_stars=get_gold_stars(ds.info.CLNREVSTAT),
        review_status=hl.sorted(ds.info.CLNREVSTAT, key=lambda s: s.replace("^_", "z")).map(
            lambda s: s.replace("^_", "")
        ),
        sorted_transcript_consequences=ds.sorted_transcript_consequences,
    )

    ds = ds.annotate(
        chrom=normalized_contig(ds.locus), variant_id=variant_id(ds.locus, ds.alleles), xpos=x_position(ds.locus)
    )

    return ds
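
get_gold_stars is defined elsewhere in the pipeline. A hedged sketch, assuming the standard ClinVar review-status-to-gold-stars mapping; CLNREVSTAT arrives as an array of comma-split tokens, so it is joined back into one status string using the same underscore-aware sort seen above:

import hail as hl

def get_gold_stars(review_status):
    # Rejoin the comma-split CLNREVSTAT tokens into a single status string
    status = hl.delimit(hl.sorted(review_status, key=lambda s: s.replace("^_", "z")), ",")
    return (hl.case()
            .when(status == "practice_guideline", 4)
            .when(status == "reviewed_by_expert_panel", 3)
            .when(status == "criteria_provided,_multiple_submitters,_no_conflicts", 2)
            .when((status == "criteria_provided,_conflicting_interpretations")
                  | (status == "criteria_provided,_single_submitter"), 1)
            .default(0))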
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--input-url",
        help="URL of ExAC sites VCF",
        default="gs://gnomad-public/legacy/exac_browser/ExAC.r1.sites.vep.vcf.gz")
    parser.add_argument("--output-url",
                        help="URL to write Hail table to",
                        required=True)
    parser.add_argument("--subset",
                        help="Filter variants to this chrom:start-end range")
    args = parser.parse_args()

    hl.init(log="/tmp/hail.log")

    print("\n=== Importing VCF ===")

    ds = hl.import_vcf(args.input_url,
                       force_bgz=True,
                       min_partitions=2000,
                       skip_invalid_loci=True).rows()

    if args.subset:
        print(f"\n=== Filtering to interval {args.subset} ===")
        subset_interval = hl.parse_locus_interval(args.subset)
        ds = ds.filter(subset_interval.contains(ds.locus))

    print("\n=== Splitting multiallelic variants ===")

    ds = hl.split_multi(ds)

    ds = ds.repartition(2000, shuffle=True)

    # Get value corresponding to the split variant
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.or_missing(hl.is_defined(ds.info[field]), ds.info[field][
                ds.a_index - 1])
            for field in PER_ALLELE_FIELDS
        }))

    # For DP_HIST and GQ_HIST, the first value in the array contains the histogram for all individuals,
    # which is identical for every row split from the same multiallelic site.
    ds = ds.annotate(info=ds.info.annotate(
        DP_HIST=hl.struct(all=ds.info.DP_HIST[0],
                          alt=ds.info.DP_HIST[ds.a_index]),
        GQ_HIST=hl.struct(all=ds.info.GQ_HIST[0],
                          alt=ds.info.GQ_HIST[ds.a_index]),
    ))

    ds = ds.cache()

    print("\n=== Munging data ===")

    # Convert "NA" and empty strings into null values
    # Convert fields in chunks to avoid "Method code too large" errors
    for i in range(0, len(SELECT_INFO_FIELDS), 10):
        ds = ds.annotate(info=ds.info.annotate(
            **{
                field: hl.or_missing(
                    hl.is_defined(ds.info[field]),
                    hl.bind(
                        lambda value: hl.cond(
                            (value == "") | (value == "NA"),
                            hl.null(ds.info[field].dtype), ds.info[field]),
                        hl.str(ds.info[field]),
                    ),
                )
                for field in SELECT_INFO_FIELDS[i:i + 10]
            }))

    # Convert field types
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tint),
                           hl.int(ds.info[field]))
            for field in CONVERT_TO_INT_FIELDS
        }))
    ds = ds.annotate(info=ds.info.annotate(
        **{
            field: hl.cond(ds.info[field] == "", hl.null(hl.tfloat),
                           hl.float(ds.info[field]))
            for field in CONVERT_TO_FLOAT_FIELDS
        }))

    # Format VEP annotations to mimic the output of hail.vep
    ds = ds.annotate(info=ds.info.annotate(CSQ=ds.info.CSQ.map(
        lambda s: s.replace("%3A", ":").replace("%3B", ";").replace(
            "%3D", "=").replace("%25", "%").replace("%2C", ","))))
    ds = ds.annotate(vep=hl.struct(
        transcript_consequences=ds.info.CSQ.map(lambda csq_str: hl.bind(
            lambda csq_values: hl.struct(
                **{
                    field: hl.cond(csq_values[index] == "", hl.null(hl.tstr),
                                   csq_values[index])
                    for index, field in enumerate(VEP_FIELDS)
                }),
            csq_str.split("\\|"),
        )).filter(lambda annotation: annotation.Feature.startswith("ENST")).
        filter(lambda annotation: hl.int(annotation.ALLELE_NUM) == ds.a_index).
        map(lambda annotation: annotation.select(
            amino_acids=annotation.Amino_acids,
            biotype=annotation.BIOTYPE,
            canonical=annotation.CANONICAL == "YES",
            # cDNA_position may contain either "start-end" or, when start == end, "start"
            cdna_start=split_position_start(annotation.cDNA_position),
            cdna_end=split_position_end(annotation.cDNA_position),
            codons=annotation.Codons,
            consequence_terms=annotation.Consequence.split("&"),
            distance=hl.int(annotation.DISTANCE),
            domains=hl.or_missing(
                hl.is_defined(annotation.DOMAINS),
                annotation.DOMAINS.split("&").map(lambda d: hl.struct(
                    db=d.split(":")[0], name=d.split(":")[1])),
            ),
            exon=annotation.EXON,
            gene_id=annotation.Gene,
            gene_symbol=annotation.SYMBOL,
            gene_symbol_source=annotation.SYMBOL_SOURCE,
            hgnc_id=annotation.HGNC_ID,
            hgvsc=annotation.HGVSc,
            hgvsp=annotation.HGVSp,
            lof=annotation.LoF,
            lof_filter=annotation.LoF_filter,
            lof_flags=annotation.LoF_flags,
            lof_info=annotation.LoF_info,
            # PolyPhen field contains "polyphen_prediction(polyphen_score)"
            polyphen_prediction=hl.or_missing(
                hl.is_defined(annotation.PolyPhen),
                annotation.PolyPhen.split("\\(")[0]),
            protein_id=annotation.ENSP,
            # Protein_position may contain either "start-end" or, when start == end, "start"
            protein_start=split_position_start(annotation.Protein_position),
            protein_end=split_position_end(annotation.Protein_position),
            # SIFT field contains "sift_prediction(sift_score)"
            sift_prediction=hl.or_missing(hl.is_defined(annotation.SIFT),
                                          annotation.SIFT.split("\\(")[0]),
            transcript_id=annotation.Feature,
        ))))

    ds = ds.annotate(vep=ds.vep.annotate(most_severe_consequence=hl.bind(
        lambda all_consequence_terms: hl.or_missing(
            all_consequence_terms.size() != 0,
            hl.sorted(all_consequence_terms, key=consequence_term_rank)[0]),
        ds.vep.transcript_consequences.flatmap(lambda c: c.consequence_terms),
    )))

    ds = ds.cache()

    print("\n=== Adding derived fields ===")

    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(
            ds.vep))

    ds = ds.select(
        "filters",
        "qual",
        "rsid",
        "sorted_transcript_consequences",
        AC=ds.info.AC,
        AC_Adj=ds.info.AC_Adj,
        AC_Hemi=ds.info.AC_Hemi,
        AC_Hom=ds.info.AC_Hom,
        AF=ds.info.AF,
        AN=ds.info.AN,
        AN_Adj=ds.info.AN_Adj,
        BaseQRankSum=ds.info.BaseQRankSum,
        CCC=ds.info.CCC,
        ClippingRankSum=ds.info.ClippingRankSum,
        DB=ds.info.DB,
        DP=ds.info.DP,
        DS=ds.info.DS,
        END=ds.info.END,
        FS=ds.info.FS,
        GQ_MEAN=ds.info.GQ_MEAN,
        GQ_STDDEV=ds.info.GQ_STDDEV,
        HWP=ds.info.HWP,
        HaplotypeScore=ds.info.HaplotypeScore,
        InbreedingCoeff=ds.info.InbreedingCoeff,
        MLEAC=ds.info.MLEAC,
        MLEAF=ds.info.MLEAF,
        MQ=ds.info.MQ,
        MQ0=ds.info.MQ0,
        MQRankSum=ds.info.MQRankSum,
        NCC=ds.info.NCC,
        NEGATIVE_TRAIN_SITE=ds.info.NEGATIVE_TRAIN_SITE,
        POSITIVE_TRAIN_SITE=ds.info.POSITIVE_TRAIN_SITE,
        QD=ds.info.QD,
        ReadPosRankSum=ds.info.ReadPosRankSum,
        VQSLOD=ds.info.VQSLOD,
        culprit=ds.info.culprit,
        DP_HIST=ds.info.DP_HIST,
        GQ_HIST=ds.info.GQ_HIST,
        DOUBLETON_DIST=ds.info.DOUBLETON_DIST,
        AC_CONSANGUINEOUS=ds.info.AC_CONSANGUINEOUS,
        AN_CONSANGUINEOUS=ds.info.AN_CONSANGUINEOUS,
        Hom_CONSANGUINEOUS=ds.info.Hom_CONSANGUINEOUS,
        AGE_HISTOGRAM_HET=ds.info.AGE_HISTOGRAM_HET,
        AGE_HISTOGRAM_HOM=ds.info.AGE_HISTOGRAM_HOM,
        AC_POPMAX=ds.info.AC_POPMAX,
        AN_POPMAX=ds.info.AN_POPMAX,
        POPMAX=ds.info.POPMAX,
        K1_RUN=ds.info.K1_RUN,
        K2_RUN=ds.info.K2_RUN,
        K3_RUN=ds.info.K3_RUN,
        ESP_AF_POPMAX=ds.info.ESP_AF_POPMAX,
        ESP_AF_GLOBAL=ds.info.ESP_AF_GLOBAL,
        ESP_AC=ds.info.ESP_AC,
        KG_AF_POPMAX=ds.info.KG_AF_POPMAX,
        KG_AF_GLOBAL=ds.info.KG_AF_GLOBAL,
        KG_AC=ds.info.KG_AC,
        AC_FEMALE=ds.info.AC_FEMALE,
        AN_FEMALE=ds.info.AN_FEMALE,
        AC_MALE=ds.info.AC_MALE,
        AN_MALE=ds.info.AN_MALE,
        populations=hl.struct(
            **{
                pop_id: hl.struct(
                    AC=ds.info[f"AC_{pop_id}"],
                    AN=ds.info[f"AN_{pop_id}"],
                    hemi=ds.info[f"Hemi_{pop_id}"],
                    hom=ds.info[f"Hom_{pop_id}"],
                )
                for pop_id in
                ["AFR", "AMR", "EAS", "FIN", "NFE", "OTH", "SAS"]
            }),
        colocated_variants=hl.bind(
            lambda this_variant_id: variant_ids(ds.old_locus, ds.old_alleles).
            filter(lambda v_id: v_id != this_variant_id),
            variant_id(ds.locus, ds.alleles),
        ),
        variant_id=variant_id(ds.locus, ds.alleles),
        xpos=x_position(ds.locus),
    )

    print("\n=== Writing table ===")

    ds.write(args.output_url)
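
The select above uses variant_id and variant_ids; the latter reconstructs IDs for every alt allele of the original multiallelic site so colocated variants can be listed. Plausible sketches (the real helpers may additionally normalize alleles with hl.min_rep):

import hail as hl

def variant_id(locus, alleles):
    # "chrom-pos-ref-alt", with any "chr" prefix stripped from the contig
    return (locus.contig.replace("^chr", "")
            + "-" + hl.str(locus.position)
            + "-" + alleles[0]
            + "-" + alleles[1])

def variant_ids(locus, alleles):
    # One ID per alt allele of a (possibly multiallelic) site
    return hl.range(1, hl.len(alleles)).map(
        lambda i: variant_id(locus, [alleles[0], alleles[i]]))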
Example #8
def prepare_variant_results(results_url, annotations_url):
    variant_annotations = hl.import_table(
        annotations_url,
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        find_replace=(r"^([\dXY]+):(\d+):([ACTG]+):([ACTG]+)", "$1-$2-$3-$4"),
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "CADD": hl.tfloat,
            "Comment": hl.tstr,
            "Consequence (canonical)": hl.tstr,
            "Consequence (for analysis)": hl.tstr,
            "Consequence (worst)": hl.tstr,
            "Flags": hl.tstr,
            "Gene ID": hl.tstr,
            "Gene name": hl.tstr,
            "HGVSc (canonical)": hl.tstr,
            "HGVSc": hl.tstr,
            "HGVSp (canonical)": hl.tstr,
            "HGVSp": hl.tstr,
            "In analysis": hl.tbool,
            "MPC": hl.tfloat,
            "Polyphen": hl.tstr,
            "Source": hl.tstr,
            "Transcript ID (canonical)": hl.tstr,
            "Transcript ID(s)": hl.tstr,
        },
    )

    variant_annotations = variant_annotations.rename({
        "Variant ID": "variant_id",
        "CADD": "cadd",
        "Comment": "comment",
        "Consequence (canonical)": "csq_canonical",
        "Consequence (for analysis)": "csq_analysis",
        "Consequence (worst)": "csq_worst",
        "Flags": "flags",
        "Gene ID": "gene_id",
        "Gene name": "gene_name",
        "HGVSc (canonical)": "hgvsc_canonical",
        "HGVSc": "hgvsc",
        "HGVSp (canonical)": "hgvsp_canonical",
        "HGVSp": "hgvsp",
        "In analysis": "in_analysis",
        "MPC": "mpc",
        "Polyphen": "polyphen",
        "Source": "source",
        "Transcript ID (canonical)": "canonical_transcript_id",
        "Transcript ID(s)": "transcript_id",
    })

    variant_results = hl.import_table(
        results_url,
        force_bgz=True,
        min_partitions=100,
        key="Variant ID",
        find_replace=(r"^([\dXY]+):(\d+):([ACTG]+):([ACTG]+)", "$1-$2-$3-$4"),
        missing="NA",
        types={
            "Variant ID": hl.tstr,
            "AC case": hl.tint,
            "AC control": hl.tint,
            "AF case": hl.tfloat,
            "AF control": hl.tfloat,
            "AN case": hl.tint,
            "AN control": hl.tint,
            "Analysis group": hl.tstr,
            "Estimate": hl.tfloat,
            "I2": hl.tfloat,
            "N denovos": hl.tint,
            "P-value": hl.tfloat,
            "Qp": hl.tfloat,
            "SE": hl.tfloat,
        },
    )

    variant_results = variant_results.rename(
        {
            "Variant ID": "variant_id",
            "AC case": "ac_case",
            "AC control": "ac_ctrl",
            "AF case": "af_case",
            "AF control": "af_ctrl",
            "AN case": "an_case",
            "AN control": "an_ctrl",
            "Analysis group": "analysis_group",
            "Estimate": "est",
            "I2": "i2",
            "N denovos": "n_denovos",
            "P-value": "p",
            "Qp": "qp",
            "SE": "se",
        })

    # Rename "EE" analysis group to "DEE"
    variant_results = variant_results.annotate(
        analysis_group=hl.cond(variant_results.analysis_group == "EE", "DEE",
                               variant_results.analysis_group))

    variants = variant_annotations.annotate(groups=hl.struct())
    analysis_groups = variant_results.aggregate(
        hl.agg.collect_as_set(variant_results.analysis_group))
    for group in analysis_groups:
        group_results = variant_results.filter(
            variant_results.analysis_group == group).drop("analysis_group")
        variants = variants.annotate(groups=variants.groups.annotate(
            **{group: group_results[variants.variant_id]}))

    variants = variants.annotate(
        chrom=variants.variant_id.split("-")[0],
        pos=hl.int(variants.variant_id.split("-")[1]),
    )
    variants = variants.annotate(
        xpos=x_position(hl.locus(variants.chrom, variants.pos)))

    return variants
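
The find_replace argument in both imports above rewrites colon-delimited variant IDs into dash-delimited ones as lines are parsed. An illustrative check of the pattern in plain Python (Hail, following Java, writes backreferences as $1 where Python's re uses \1):

import re

line = "1:55516888:G:GA\tEE\t..."  # made-up input line
fixed = re.sub(r"^([\dXY]+):(\d+):([ACTG]+):([ACTG]+)", r"\1-\2-\3-\4", line)
assert fixed.startswith("1-55516888-G-GA")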
Example #9
def prepare_variant_results(table_urls):
    annotations = None
    analysis_groups = []

    for annotations_table_url, results_table_url in table_urls:
        group_annotations = hl.import_table(
            annotations_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "in_analysis": hl.tbool,
                "gene_id": hl.tstr,
                "gene_name": hl.tstr,
                "transcript_id": hl.tstr,
                "hgvsc": hl.tstr,
                "hgvsp": hl.tstr,
                "csq_analysis": hl.tstr,
                "csq_worst": hl.tstr,
                "mpc": hl.tfloat,
                "polyphen": hl.tstr,
            },
        )

        group_results = hl.import_table(
            results_table_url,
            force=True,
            key="v",
            missing="NA",
            types={
                "v": hl.tstr,
                "analysis_group": hl.tstr,
                "ac_case": hl.tint,
                "an_case": hl.tstr,
                "af_case": hl.tstr,
                "ac_ctrl": hl.tint,
                "an_ctrl": hl.tstr,
                "af_ctrl": hl.tstr,
            },
        )

        groups_in_table = group_results.aggregate(
            hl.agg.collect_as_set(group_results.analysis_group))
        assert len(groups_in_table) == 1, groups_in_table
        group_name = groups_in_table.pop()
        analysis_groups.append(group_name)

        group_results = group_results.annotate(
            an_case=hl.int(group_results.an_case),
            af_case=hl.float(group_results.af_case),
            an_ctrl=hl.int(group_results.an_ctrl),
            af_ctrl=hl.float(group_results.af_ctrl),
            in_analysis=group_annotations[group_results.v].in_analysis,
        )

        group_results.drop("analysis_group").write(f"temp_{group_name}.ht")

        group_annotations = group_annotations.drop("in_analysis")

        if annotations is None:
            annotations = group_annotations
        else:
            annotations = annotations.union(group_annotations)

    annotations = annotations.distinct()

    annotations = annotations.annotate(
        filters="PASS",
        csq_analysis=hl.sorted(annotations.csq_analysis.split(","),
                               lambda c: consequence_term_rank(c))[0],
        csq_worst=hl.sorted(annotations.csq_worst.split(","),
                            lambda c: consequence_term_rank(c))[0],
        canonical_transcript_id=annotations.transcript_id,
        hgvsc_canonical=annotations.hgvsc,
        hgvsp_canonical=annotations.hgvsp,
    )

    annotations = annotations.annotate(
        locus=hl.locus(
            annotations.v.split(":")[0], hl.int(annotations.v.split(":")[1])),
        alleles=annotations.v.split(":")[2:4],
    )

    annotations = annotations.annotate(
        variant_id=variant_id(annotations.locus, annotations.alleles),
        chrom=annotations.locus.contig,
        pos=annotations.locus.position,
        xpos=x_position(annotations.locus),
        alt=annotations.alleles[1],
        ref=annotations.alleles[0],
    )

    annotations = annotations.drop("locus", "alleles")

    annotations = annotations.annotate(groups=hl.struct())
    for group_name in analysis_groups:
        results = hl.read_table(f"temp_{group_name}.ht")
        annotations = annotations.annotate(groups=annotations.groups.annotate(
            **{group_name: results[annotations.key]}))

    annotations = annotations.key_by().drop("v")

    return annotations
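
A sketch of how this function might be driven: one (annotations, results) URL pair per analysis group, with the helpers sketched earlier in scope. All paths are placeholders; force=True in the imports suggests plain-gzipped inputs:

import hail as hl

hl.init(log="/tmp/hail.log")

table_urls = [
    ("gs://my-bucket/group_a_annotations.tsv.gz", "gs://my-bucket/group_a_results.tsv.gz"),  # hypothetical
    ("gs://my-bucket/group_b_annotations.tsv.gz", "gs://my-bucket/group_b_results.tsv.gz"),  # hypothetical
]

variants = prepare_variant_results(table_urls)
variants.write("gs://my-bucket/variants.ht", overwrite=True)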
Example #10
def format_variants_table(ds):

    g = hl.eval(ds.globals)

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles),
                     xpos=x_position(ds.locus))

    ds = ds.annotate(multiallelic_variants=variant_ids(
        ds.old_locus, ds.old_alleles).filter(lambda vid: vid != ds.variant_id))

    ###############
    # Frequencies #
    ###############

    freq_index_tree = get_freq_index_tree(g.freq_meta)
    ds = ds.annotate(freq=array_to_tree(ds.freq, freq_index_tree))

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_index_tree = get_faf_index_tree(g.faf_index_dict)
    ds = ds.annotate(faf=array_to_tree(
        ds.faf, faf_index_tree, lambda faf: faf.select("faf95", "faf99")))

    ##############
    # Histograms #
    ##############

    # Convert lists of numbers in histograms into pipe-delimited strings
    ds = ds.annotate(
        **{
            field: ds[field].annotate(
                bin_freq=hl.delimit(ds[field].bin_freq, "|"),
                bin_edges=hl.delimit(ds[field].bin_edges, "|"))
            for field in [
                "ab_hist_alt", "dp_hist_all", "dp_hist_alt", "gq_hist_all",
                "gq_hist_alt"
            ]
        })

    ###########################
    # Quality metrics / flags #
    ###########################

    # These fields are nested under `info`
    #
    # AS_VQSLOD
    # culprit
    # DP
    # FS
    # InbreedingCoeff
    # MQ
    # MQ_DP
    # MQRankSum
    # NEGATIVE_TRAIN_SITE
    # POSITIVE_TRAIN_SITE
    # QD
    # QUALapprox
    # RAW_MQ
    # ReadPosRankSum
    # SB
    # SOR
    # VarDP

    # Remove NaN values
    ds = ds.annotate(info=ds.info.annotate(FS=nullify_nan(ds.info.FS),
                                           InbreedingCoeff=nullify_nan(
                                               ds.info.InbreedingCoeff),
                                           MQ=nullify_nan(ds.info.MQ)))

    ###################
    # VEP annotations #
    ###################

    ds = ds.annotate(
        sorted_transcript_consequences=sorted_transcript_consequences_v3(
            ds.vep))

    ds = ds.drop("vep")

    ################
    # Other fields #
    ################

    # These fields are left unaltered at the top level
    #
    # decoy
    # filters
    # info
    # lcr
    # nonpar
    # popmax
    # qual
    # rsid

    # Drop fields created by splitting multi-allelic variants
    # This information is captured in the multiallelic_variants derived field
    ds = ds.drop("a_index", "old_locus", "old_alleles", "was_split")

    # Internal only
    # TODO: Remove line, this field won't be in the final table
    if "project_max" in ds.row_value.dtype.fields:
        ds = ds.drop("project_max")

    return ds
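
This last example assumes nullify_nan and array_to_tree. Plausible sketches: nullify_nan turns NaN into a missing value, and array_to_tree recursively regroups a flat indexed array into nested structs following an index tree such as the one returned by get_freq_index_tree:

import hail as hl

def nullify_nan(value):
    # Replace NaN with a missing value of the same type
    return hl.cond(hl.is_nan(value), hl.null(value.dtype), value)

def array_to_tree(array, index_tree, f=lambda x: x):
    # index_tree maps names to either array indices or nested index trees
    return hl.struct(**{
        name: array_to_tree(array, subtree, f) if isinstance(subtree, dict) else f(array[subtree])
        for name, subtree in index_tree.items()
    })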