Example #1
def add_filters_expr(
    filters: Dict[str, hl.expr.BooleanExpression],
    current_filters: hl.expr.SetExpression = None,
) -> hl.expr.SetExpression:
    """
    Create an expression that builds or extends a set of filter names.

    For each entry in the `filters` dictionary, if the value evaluates to `True`,
    then the key is added as a filter name.

    Existing filters are kept if provided via `current_filters`.

    :param filters: The filters and their expressions
    :param current_filters: The set of current filters
    :return: An expression that can be used to annotate the filters
    """
    if current_filters is None:
        current_filters = hl.empty_set(hl.tstr)

    return hl.fold(
        lambda x, y: x.union(y),
        current_filters,
        [
            hl.cond(filter_condition, hl.set([filter_name]),
                    hl.empty_set(hl.tstr))
            for filter_name, filter_condition in filters.items()
        ],
    )
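
A minimal usage sketch, assuming `add_filters_expr` above is in scope; the table and field names below are hypothetical, chosen only to illustrate the call.

import hail as hl

# Hypothetical table with per-variant metrics; field names are illustrative.
ht = hl.utils.range_table(10)
ht = ht.annotate(AF=hl.rand_unif(0, 1), call_rate=hl.rand_unif(0, 1))

# Build a set-valued `filters` annotation from named boolean conditions.
ht = ht.annotate(
    filters=add_filters_expr(filters={
        "low_af": ht.AF < 0.001,
        "low_call_rate": ht.call_rate < 0.9,
    })
)
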
Example #2
def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(
        hl.struct(
            x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
            y=hl.array([3, 4, 5]),
            z=hl.set([3, 4, 5, 3])))
Example #3
def pop_max_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    pops_to_exclude: Optional[Set[str]] = None,
) -> hl.expr.StructExpression:
    """

    Create an expression containing the frequency information about the population that has the highest AF in `freq_meta`.

    Populations specified in `pops_to_exclude` are excluded and only frequencies from adj populations are considered.

    The resulting struct contains the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - pop: str

    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
    :param pops_to_exclude: Set of populations to skip for popmax calculation

    :return: Popmax struct
    """
    _pops_to_exclude = (
        hl.literal(pops_to_exclude)
        if pops_to_exclude is not None
        else hl.empty_set(hl.tstr)
    )

    # pylint: disable=invalid-unary-operand-type
    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (hl.set(freq_meta[i].keys()) == {"group", "pop"})
        & (freq_meta[i]["group"] == "adj")
        & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))
    freq_filtered = popmax_freq_indices.map(
        lambda i: freq[i].annotate(pop=freq_meta[i]["pop"])
    ).filter(lambda f: f.AC > 0)

    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])
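
A hedged usage sketch: `ht` is a hypothetical table where `freq` is a row annotation and `freq_meta` a matching global annotation in the format described in the docstring; the excluded population labels are purely illustrative.

ht = ht.annotate(
    popmax=pop_max_expr(
        ht.freq,
        ht.freq_meta,
        pops_to_exclude={"asj", "fin", "oth"},
    )
)
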
Example #4
def prepare_genes(gencode_path, hgnc_path, reference_genome):
    genes = import_gencode(gencode_path, reference_genome)

    hgnc = import_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    # If a symbol was not present in HGNC data, use the symbol from GENCODE
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_symbol))

    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=(
            hl.empty_set(hl.tstr)
            .add(genes.symbol)
            .add(genes.gencode_symbol)
            .union(genes.previous_symbols)
            .union(genes.alias_symbols)
            .map(lambda s: s.upper())),
    )

    genes = genes.annotate(
        reference_genome=reference_genome,
        transcripts=genes.transcripts.map(
            lambda transcript: transcript.annotate(
                reference_genome=reference_genome)),
    )

    return genes
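
A hedged call sketch: the paths below are placeholders, and `import_gencode` / `import_hgnc` are assumed to be defined elsewhere in the same module.

genes = prepare_genes(
    gencode_path="gs://example-bucket/gencode.gtf.gz",  # placeholder path
    hgnc_path="gs://example-bucket/hgnc.tsv",           # placeholder path
    reference_genome="GRCh38",
)
genes.describe()
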
Example #5
def faf_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    locus: hl.expr.LocusExpression,
    pops_to_exclude: Optional[Set[str]] = None,
    faf_thresholds: List[float] = [0.95, 0.99],
) -> Tuple[hl.expr.ArrayExpression, List[Dict[str, str]]]:
    """
    Calculate the filtering allele frequency (FAF) for each threshold specified in `faf_thresholds`.

    See http://cardiodb.org/allelefrequencyapp/ for more information.

    The FAF is computed for each of the following population stratifications, if found in `freq_meta`:

        - All samples, with adj criteria
        - For each population, with adj criteria
        - For each sex and sex/population combination, on the non-PAR regions of sex chromosomes (missing on autosomes and PAR regions of sex chromosomes)

    Each FAF entry is a struct with one float64 field per threshold specified in `faf_thresholds`.

    This returns a tuple with two expressions:

        1. An array of FAF expressions as described above
        2. An array of dicts containing the metadata for each of the array elements, in the same format as that produced by `annotate_freq`.

    :param freq: ArrayExpression of call stats structs (typically generated by hl.agg.call_stats)
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (typically generated using annotate_freq)
    :param locus: LocusExpression, used to restrict the sex-stratified FAF entries to non-PAR regions of sex chromosomes
    :param pops_to_exclude: Set of populations to exclude from faf calculation (typically bottlenecked or consanguineous populations)
    :param faf_thresholds: List of FAF thresholds to compute
    :return: (FAF expression, FAF metadata)
    """
    _pops_to_exclude = (
        hl.literal(pops_to_exclude)
        if pops_to_exclude is not None
        else hl.empty_set(hl.tstr)
    )

    # pylint: disable=invalid-unary-operand-type
    faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & ((freq_meta[i].size() == 1)
           | ((hl.set(freq_meta[i].keys()) == {"pop", "group"})
              & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))))
    sex_faf_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (freq_meta[i].get("group") == "adj")
        & (freq_meta[i].contains("sex"))
        & ((freq_meta[i].size() == 2)
           | ((hl.set(freq_meta[i].keys()) == {"pop", "group", "sex"})
              & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))))

    faf_expr = faf_freq_indices.map(
        lambda i: hl.struct(**{
            f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
                freq[i].AC, freq[i].AN, threshold)
            for threshold in faf_thresholds
        }))

    faf_expr = faf_expr.extend(
        sex_faf_freq_indices.map(
            lambda i: hl.or_missing(
                ~locus.in_autosome_or_par(),
                hl.struct(**{
                    f"faf{str(threshold)[2:]}": hl.experimental.filtering_allele_frequency(
                        freq[i].AC, freq[i].AN, threshold)
                    for threshold in faf_thresholds
                }),
            )))

    faf_meta = faf_freq_indices.extend(sex_faf_freq_indices).map(
        lambda i: freq_meta[i])
    return faf_expr, hl.eval(faf_meta)
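
A hedged usage sketch, again assuming a hypothetical table with `freq` row and `freq_meta` global annotations; the returned metadata is an evaluated Python list, typically stored as a global.

faf, faf_meta = faf_expr(
    ht.freq, ht.freq_meta, ht.locus, pops_to_exclude={"asj", "fin", "oth"}
)
ht = ht.annotate(faf=faf)
ht = ht.annotate_globals(faf_meta=faf_meta)
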
Example #6
def get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root,
        include_coding_annotations=True,
        omit_consequences=OMIT_CONSEQUENCE_TERMS):
    """Sort transcripts by 3 properties:

        1. coding > non-coding
        2. transcript consequence severity
        3. canonical > non-canonical

    so that the 1st array entry will be for the coding, most-severe, canonical transcript (assuming
    one exists).

    Also, for each transcript in the array, computes these additional fields:
        domains: converts Array[Struct] to string of comma-separated domain names
        hgvs: set to hgvsp if it exists, otherwise hgvsc. Formats hgvsp for synonymous variants.
        major_consequence: set to most severe consequence for that transcript (
            VEP sometimes provides multiple consequences for a single transcript)
        major_consequence_rank: major_consequence rank based on VEP SO ontology (most severe = 1)
            (see http://www.ensembl.org/info/genome/variation/predicted_data.html)
        category: set to one of: "lof", "missense", "synonymous", "other" based on the value of major_consequence.

    Args:
        vep_root (StructExpression): root path of the VEP struct in the MT
        include_coding_annotations (bool): if True, fields relevant to protein-coding variants will be included
        omit_consequences (list): consequence terms to filter out of each transcript's consequence_terms
    """

    selected_annotations = [
        "biotype",
        "canonical",
        "cdna_start",
        "cdna_end",
        "codons",
        "gene_id",
        "gene_symbol",
        "hgvsc",
        "hgvsp",
        "transcript_id",
    ]

    if include_coding_annotations:
        selected_annotations.extend([
            "amino_acids",
            "lof",
            "lof_filter",
            "lof_flags",
            "lof_info",
            "polyphen_prediction",
            "protein_id",
            "protein_start",
            "sift_prediction",
        ])

    omit_consequence_terms = (
        hl.set(omit_consequences) if omit_consequences else hl.empty_set(hl.tstr))

    result = hl.sorted(
        vep_root.transcript_consequences.map(lambda c: c.select(
            *selected_annotations,
            consequence_terms=c.consequence_terms.filter(
                lambda t: ~omit_consequence_terms.contains(t)),
            domains=c.domains.map(
                lambda domain: domain.db + ":" + domain.name),
            major_consequence=hl.cond(
                c.consequence_terms.size() > 0,
                hl.sorted(
                    c.consequence_terms,
                    key=lambda t: CONSEQUENCE_TERM_RANK_LOOKUP.get(t))[0],
                hl.null(hl.tstr),
            ))).filter(lambda c: c.consequence_terms.size() > 0).map(
            lambda c: c.annotate(
            category=(hl.case().when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("frameshift_variant"),
                "lof",
            ).when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("missense_variant"),
                "missense",
            ).when(
                CONSEQUENCE_TERM_RANK_LOOKUP.get(c.major_consequence) <=
                CONSEQUENCE_TERM_RANK_LOOKUP.get("synonymous_variant"),
                "synonymous",
            ).default("other")),
            hgvs=get_expr_for_formatted_hgvs(c),
            major_consequence_rank=CONSEQUENCE_TERM_RANK_LOOKUP.get(
                c.major_consequence),
        )),
        lambda c: (hl.bind(
            lambda is_coding, is_most_severe, is_canonical: (hl.cond(
                is_coding,
                hl.cond(is_most_severe, hl.cond(is_canonical, 1, 2),
                        hl.cond(is_canonical, 3, 4)),
                hl.cond(is_most_severe, hl.cond(is_canonical, 5, 6),
                        hl.cond(is_canonical, 7, 8)),
            )),
            hl.or_else(c.biotype, "") == "protein_coding",
            hl.set(c.consequence_terms).contains(
                vep_root.most_severe_consequence),
            hl.or_else(c.canonical, 0) == 1,
        )),
    )

    if not include_coding_annotations:
        # for non-coding variants, drop fields here that are hard to exclude in the above code
        result = result.map(lambda c: c.drop("domains", "hgvsp"))

    return hl.zip_with_index(result).map(
        lambda csq_with_index: csq_with_index[1].annotate(
            transcript_rank=csq_with_index[0]))
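
A hedged usage sketch: `mt` is a hypothetical matrix table annotated with a `vep` row struct (e.g. via hl.vep), and the constants the function references (OMIT_CONSEQUENCE_TERMS, CONSEQUENCE_TERM_RANK_LOOKUP, get_expr_for_formatted_hgvs) are assumed to be importable from the surrounding module.

mt = mt.annotate_rows(
    sortedTranscriptConsequences=get_expr_for_vep_sorted_transcript_consequences_array(
        vep_root=mt.vep,
        include_coding_annotations=True,
    )
)
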
Example #7
 gnomad_svs_AF=hl.null('float'),
 pos=10000,
 filters=['LOW_CALL_RATE'],
 xpos=1000010000,
 cpx_intervals=NULL_INTERVALS,
 xstart=1000010000,
 xstop=1000017000,
 svType='DUP',
 transcriptConsequenceTerms=['DUP'],
 sv_type_detail=hl.null('str'),
 sortedTranscriptConsequences=[
     hl.struct(gene_symbol='OR4F5',
               gene_id='ENSG00000284662',
               predicted_consequence='NEAREST_TSS')
 ],
 geneIds=hl.empty_set(hl.dtype('str')),
 samples_no_call=EMPTY_STR_ARRAY,
 samples_num_alt_1=['SAMPLE-1', 'SAMPLE-2', 'SAMPLE-3'],
 samples_num_alt_2=EMPTY_STR_ARRAY,
 genotypes=[
     hl.struct(sample_id='SAMPLE-1',
               gq=999,
               num_alt=1,
               cn=3),
     hl.struct(sample_id='SAMPLE-2',
               gq=52,
               num_alt=1,
               cn=3),
     hl.struct(sample_id='SAMPLE-3',
               gq=19,
               num_alt=1,