Ejemplo n.º 1
0
            "gq_hist_all",
            "gq_hist_alt",
            "gnomad_age_hist_het",
            "gnomad_age_hist_hom",
        ]
    }
)

# Add flat, top-level fields for the variant's alleles, contig, position and IDs.
# The get_expr_for_* helpers are defined outside this section.
ds = ds.annotate(
    **{
        "alt": get_expr_for_alt_allele(ds),
        "chrom": get_expr_for_contig(ds.locus),
        "pos": ds.locus.position,
        "ref": get_expr_for_ref_allele(ds),
        "variant_id": get_expr_for_variant_id(ds),
        "xpos": get_expr_for_xpos(ds.locus),
    }
)

###########
# Subsets #
###########

# Every subset name that may be present in the input.
all_subsets = [
    "gnomad",
    "controls",
    "non_cancer",
    "non_neuro",
    "non_topmed",
]

# Keep only the subsets for which this dataset has a "<subset>_AC_adj" row field.
# Genome data has no separate non-cancer subset: all genome samples are non-cancer.
subsets = list(
    filter(
        lambda subset: f"{subset}_AC_adj" in ds.row_value.dtype.fields,
        all_subsets,
    )
)

fields_per_subpopulation = [
    "AC_adj",
    "AF_adj",
    "AN_adj",
    "nhomalt_adj",
]

populations = [
    "afr",
    "amr",
    "asj",
    "eas",
    "fin",
    "nfe",
    "oth",
    "sas",
]
Ejemplo n.º 2
0
            "gq_hist_all",
            "gq_hist_alt",
            "gnomad_age_hist_het",
            "gnomad_age_hist_hom",
        ]
    }
)

# Add flat, top-level fields for the variant's alleles, contig, position and IDs.
# The get_expr_for_* helpers are defined outside this section.
ds = ds.annotate(
    **{
        "alt": get_expr_for_alt_allele(ds),
        "chrom": get_expr_for_contig(ds),
        "pos": get_expr_for_start_pos(ds),
        "ref": get_expr_for_ref_allele(ds),
        "variant_id": get_expr_for_variant_id(ds),
        "xpos": get_expr_for_xpos(ds),
    }
)

###########
# Subsets #
###########

# Every subset name that may be present in the input.
all_subsets = [
    "gnomad",
    "controls",
    "non_cancer",
    "non_neuro",
    "non_topmed",
]

# Keep only the subsets for which this dataset has a "<subset>_AC_adj" row field.
# Genome data has no separate non-cancer subset: all genome samples are non-cancer.
subsets = list(
    filter(
        lambda subset: f"{subset}_AC_adj" in ds.row_value.dtype.fields,
        all_subsets,
    )
)

fields_per_subpopulation = [
    "AC_adj",
    "AF_adj",
    "AN_adj",
    "nhomalt_adj",
]

populations = [
    "afr",
    "amr",
    "asj",
    "eas",
    "fin",
    "nfe",
    "oth",
    "sas",
]
    **{
        f"main_transcript_{field}": mt.main_transcript[field]
        for field in mt.main_transcript.dtype.fields
    },
    pos=get_expr_for_start_pos(mt),
    ref=get_expr_for_ref_allele(mt),
    review_status=review_status_str,
    transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    transcript_ids=get_expr_for_vep_transcript_ids_set(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    transcript_id_to_consequence_json=
    get_expr_for_vep_transcript_id_to_consequence_map(
        vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
    variant_id=get_expr_for_variant_id(mt),
    xpos=get_expr_for_xpos(mt.locus),
)

# Print a variant summary of the annotated dataset.
print("\n=== Summary ===")
hl.summarize_variants(mt)

# Prepare the rows for export: order them by variant ID, then remove the
# locus/alleles key columns.
rows = mt.rows()
rows = rows.order_by(rows.variant_id)
rows = rows.drop("locus", "alleles")

print("\n=== Exporting to Elasticsearch ===")
es = ElasticsearchClient(args.host, args.port)
es.export_table_to_elasticsearch(
    rows,
    index_name=index_name,
    index_type_name=args.index_type,
Ejemplo n.º 4
0
def _snv_site_counts(ds, snp, suffix):
    """Build the filters/AC/AN struct of one constituent SNV for one data type.

    ``suffix`` selects the column family ("ex" or "gen"). The result is
    missing when the SNV's ``AN_<snp>_<suffix>`` column is not defined.
    """
    allele_number = ds[f"AN_{snp}_{suffix}"]
    return hl.or_missing(
        hl.is_defined(allele_number),
        hl.struct(
            filters=ds[f"filter_{snp}_{suffix}"],
            ac=ds[f"AC_{snp}_{suffix}"],
            an=hl.int(allele_number),
        ),
    )


def _constituent_snv(ds, snp):
    """Build the struct describing one constituent SNV ("snp1" or "snp2").

    The SNV ID is read from the ``<snp>_copy`` column and split on "-" into
    chrom, pos, ref and alt.
    """
    snv_id = ds[f"{snp}_copy"]
    return hl.bind(
        lambda id_parts: hl.struct(
            variant_id=snv_id,
            chrom=id_parts[0],
            pos=hl.int(id_parts[1]),
            ref=id_parts[2],
            alt=id_parts[3],
            exome=_snv_site_counts(ds, snp, "ex"),
            genome=_snv_site_counts(ds, snp, "gen"),
        ),
        snv_id.split("-"),
    )


def _mnv_counts(ds, is_present, suffix):
    """Build the per-data-type MNV counts struct, missing unless ``is_present``."""
    return hl.or_missing(
        is_present,
        hl.struct(
            n_individuals=ds[f"n_indv_{suffix}"],
            ac=ds[f"AC_mnv_{suffix}"],
            ac_hom=ds[f"n_homhom_{suffix}"],
        ),
    )


def _snv_consequence(ds, snp):
    """Build the consequence struct of one constituent SNV ("snp1" or "snp2")."""
    return hl.struct(
        variant_id=ds[snp],
        amino_acids=ds[f"{snp}_amino_acids"],
        codons=ds[f"{snp}_codons"],
        consequence=ds[f"{snp}_consequence"],
        lof=ds[f"{snp}_lof"],
    )


def import_mnv_file(path, **kwargs):
    """Import an MNV text table and reshape it to one row per MNV.

    Args:
        path: Location of the table; passed to ``hl.import_table``.
        **kwargs: Extra keyword arguments forwarded to ``hl.import_table``.

    Returns:
        The table keyed by ``mnv``. Each row carries position fields, a
        ``constituent_snvs`` array, combined and per-data-type counts, a
        ``consequences`` array sorted via ``CONSEQUENCE_TERM_RANK_LOOKUP``,
        and ``changes_amino_acids_for_snvs``.
    """
    int_columns = [
        "AC_mnv_ex",
        "AC_mnv_gen",
        "AC_mnv",
        "AC_snp1_ex",
        "AC_snp1_gen",
        "AC_snp1",
        "AC_snp2_ex",
        "AC_snp2_gen",
        "AC_snp2",
        "locus.position",
        "n_homhom_ex",
        "n_homhom_gen",
        "n_homhom",
        "n_indv_ex",
        "n_indv_gen",
        "n_indv",
    ]
    float_columns = [
        "AN_snp1_ex",
        "AN_snp1_gen",
        "AN_snp2_ex",
        "AN_snp2_gen",
    ]
    string_array_columns = [
        "filter_snp1_ex",
        "filter_snp1_gen",
        "filter_snp2_ex",
        "filter_snp2_gen",
    ]
    string_columns = [
        "categ",
        "gene_id",
        "gene_name",
        "locus.contig",
        "mnv_amino_acids",
        "mnv_codons",
        "mnv_consequence",
        "mnv_lof",
        "mnv",
        "snp1_amino_acids",
        "snp1_codons",
        "snp1_consequence",
        "snp1_lof",
        "snp1",
        "snp2_amino_acids",
        "snp2_codons",
        "snp2_consequence",
        "snp2_lof",
        "snp2",
        "transcript_id",
    ]
    column_types = {
        **dict.fromkeys(int_columns, hl.tint),
        **dict.fromkeys(float_columns, hl.tfloat),
        **dict.fromkeys(string_array_columns, hl.tarray(hl.tstr)),
        **dict.fromkeys(string_columns, hl.tstr),
    }
    snv_columns = ("snp1", "snp2")

    table = hl.import_table(
        path, key="mnv", missing="", types=column_types, **kwargs
    )

    # Fold the two flat locus columns into one locus value, then replace that
    # locus with contig/pos/xpos fields.
    table = table.transmute(
        locus=hl.locus(table["locus.contig"], table["locus.position"])
    )
    locus = table.locus
    table = table.transmute(
        contig=get_expr_for_contig(locus),
        pos=locus.position,
        xpos=get_expr_for_xpos(locus),
    )

    # The MNV ID is "-"-separated; parts 2 and 3 are used as ref and alt.
    mnv_id_parts = table.mnv.split("-")
    table = table.annotate(
        ref=mnv_id_parts[2],
        alt=mnv_id_parts[3],
        variant_id=table.mnv,
    )

    # The transmute below reads the *_copy columns rather than snp1/snp2
    # themselves, so snp1/snp2 are still available to the later steps.
    table = table.annotate(snp1_copy=table.snp1, snp2_copy=table.snp2)
    table = table.transmute(
        constituent_snvs=[_constituent_snv(table, snp) for snp in snv_columns]
    )

    table = table.annotate(constituent_snv_ids=[table.snp1, table.snp2])

    # The MNV counts as present in a data type only when every constituent
    # SNV has counts for that data type.
    table = table.annotate(
        mnv_in_exome=table.constituent_snvs.all(
            lambda snv: hl.is_defined(snv.exome)
        ),
        mnv_in_genome=table.constituent_snvs.all(
            lambda snv: hl.is_defined(snv.genome)
        ),
    )

    table = table.transmute(
        n_individuals=table.n_indv,
        ac=table.AC_mnv,
        ac_hom=table.n_homhom,
        exome=_mnv_counts(table, table.mnv_in_exome, "ex"),
        genome=_mnv_counts(table, table.mnv_in_genome, "gen"),
    )

    table = table.drop("AC_snp1", "AC_snp2")

    table = table.transmute(
        consequence=hl.struct(
            category=table.categ,
            gene_id=table.gene_id,
            gene_name=table.gene_name,
            transcript_id=table.transcript_id,
            consequence=table.mnv_consequence,
            codons=table.mnv_codons,
            amino_acids=table.mnv_amino_acids,
            lof=table.mnv_lof,
            snv_consequences=[
                _snv_consequence(table, snp) for snp in snv_columns
            ],
        )
    )

    # Collapse to one row per MNV: gather every consequence of an MNV into an
    # array, de-duplicate the remaining columns, and join the arrays back on.
    consequences_by_mnv = table.group_by(table.mnv).aggregate(
        consequences=hl.agg.collect(table.consequence)
    )
    table = table.drop("consequence").distinct().join(consequences_by_mnv)

    # Order each MNV's consequences by severity rank.
    table = table.annotate(
        consequences=hl.sorted(
            table.consequences,
            key=lambda csq: CONSEQUENCE_TERM_RANK_LOOKUP.get(csq.consequence),
        )
    )

    # IDs of the constituent SNVs whose own amino acids differ (ignoring case)
    # from the MNV's amino acids in at least one consequence.
    differing_snv_indices = hl.literal([0, 1]).filter(
        lambda idx: table.consequences.any(
            lambda csq: csq.snv_consequences[idx].amino_acids.lower()
            != csq.amino_acids.lower()
        )
    )
    table = table.annotate(
        changes_amino_acids_for_snvs=differing_snv_indices.map(
            lambda idx: table.constituent_snv_ids[idx]
        )
    )

    return table
    "region_name": hl.tstr,
}

# Load the input table; empty cells are treated as missing values.
ds = hl.import_table(args.input_url, missing="", types=column_types)

###########
# Prepare #
###########

# Taking min/max guarantees start <= stop whichever order the two genomic
# coordinates appear in.
ds = ds.annotate(
    start=hl.min(ds.genomic_start, ds.genomic_end),
    stop=hl.max(ds.genomic_start, ds.genomic_end),
)

# xpos values for both ends of the region.
ds = ds.annotate(
    xstart=get_expr_for_xpos(hl.locus(ds.chr, ds.start)),
    xstop=get_expr_for_xpos(hl.locus(ds.chr, ds.stop)),
)

ds = ds.drop("genomic_start", "genomic_end")

# Rename chr/gene and keep only the part of the transcript ID before the
# first ".". The delimiter is written as a raw string: "\." in a normal
# string literal is an invalid escape sequence (a SyntaxWarning on recent
# Python versions), while r"\." yields the same two characters with no warning.
ds = ds.transmute(
    chrom=ds.chr, gene_name=ds.gene, transcript_id=ds.transcript.split(r"\.")[0]
)

ds = ds.drop("region_name")

#########
# Write #
#########
Ejemplo n.º 6
0
    ),
    alt=get_expr_for_alt_allele(mt),
    chrom=get_expr_for_contig(mt),
    pos=get_expr_for_start_pos(mt),
    ref=get_expr_for_ref_allele(mt),
    original_alt_alleles=get_expr_for_variant_ids(mt.old_locus, mt.old_alleles),
    sortedTranscriptConsequences=hl.bind(
        lambda genes_with_lc_lof_flag, genes_with_loftee_flag_flag: mt.sortedTranscriptConsequences.map(
            lambda csq: csq.annotate(
                flags=hl.struct(
                    lc_lof=get_expr_for_consequence_lc_lof_flag(csq),
                    lc_lof_in_gene=genes_with_lc_lof_flag.contains(csq.gene_id),
                    lof_flag=get_expr_for_consequence_loftee_flag_flag(csq),
                    lof_flag_in_gene=genes_with_loftee_flag_flag.contains(csq.gene_id),
                    nc_transcript=(csq.category == "lof") & (csq.lof == ""),
                )
            )
        ),
        get_expr_for_genes_with_lc_lof_flag(mt.sortedTranscriptConsequences),
        get_expr_for_genes_with_loftee_flag_flag(mt.sortedTranscriptConsequences),
    ),
    variant_id=get_expr_for_variant_id(mt),
    xpos=get_expr_for_xpos(mt),
)

# Prepare the rows for export: order them by variant ID, then remove the
# locus/alleles key columns.
rows = mt.rows()
rows = rows.order_by(rows.variant_id)
rows = rows.drop("locus", "alleles")

rows.write(args.output_url)
Ejemplo n.º 7
0
)

# Command-line interface: the two input URLs and the output URL are all required.
p = argparse.ArgumentParser()
p.add_argument("--variant-results-url", required=True)
p.add_argument("--variant-annotations-url", required=True)
p.add_argument("--output-url", required=True)
args = p.parse_args()

# Initialize Hail, with its log written to /tmp/hail.log.
hl.init(log="/tmp/hail.log")

variants = hl.read_table(args.variant_annotations_url)

# Flat variant ID, coordinate and allele fields derived from each row.
variants = variants.annotate(
    **{
        "variant_id": get_expr_for_variant_id(variants),
        "chrom": get_expr_for_contig(variants.locus),
        "pos": variants.locus.position,
        "xpos": get_expr_for_xpos(variants.locus),
        "alt": get_expr_for_alt_allele(variants),
        "ref": get_expr_for_ref_allele(variants),
    }
)

# Flatten the collection-valued fields into comma-delimited strings.
# Each hgvsc/hgvsp entry is rendered as "key:value".
variants = variants.transmute(
    transcript_id=hl.delimit(variants.transcript_id, ","),
    **{
        field: hl.delimit(
            variants[field]
            .keys()
            .map(lambda key: key + ":" + variants[field][key]),
            ",",
        )
        for field in ("hgvsc", "hgvsp")
    },
)

# Every row gets the flags value "PASS"; the "v" field is removed.
variants = variants.annotate(flags="PASS").drop("v")
results = hl.read_table(args.variant_results_url)
results = results.annotate(analysis_group=results.analysis_group.lower().