Example #1
    def test_agg_cols_explode(self):
        t = hl.utils.range_matrix_table(1, 10)

        tests = [(agg.explode(
            lambda elt: agg.collect(elt + 1).append(0),
            hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                    hl.empty_array(hl.tint32))), [9, 10, 10, 11, 0]),
                 (agg.explode(
                     lambda elt: agg.explode(
                         lambda elt2: agg.collect(elt2 + 1).append(0),
                         [elt, elt + 1]),
                     hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                             hl.empty_array(hl.tint32))),
                  [9, 10, 10, 11, 10, 11, 11, 12, 0]),
                 (agg.explode(
                     lambda elt: agg.filter(elt > 8,
                                            agg.collect(elt + 1).append(0)),
                     hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                             hl.empty_array(hl.tint32))), [10, 10, 11, 0]),
                 (agg.explode(
                     lambda elt: agg.group_by(elt % 3,
                                              agg.collect(elt + 1).append(0)),
                     hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                             hl.empty_array(hl.tint32))), {
                                 0: [10, 10, 0],
                                 1: [11, 0],
                                 2: [9, 0]
                             })]
        for aggregation, expected in tests:
            self.assertEqual(
                t.select_rows(result=aggregation).result.collect()[0],
                expected)
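
# A minimal, self-contained sketch of the pattern above, assuming a local
# Hail session: agg.explode flattens an array expression before aggregating
# over columns, and hl.empty_array supplies a typed empty array for columns
# that contribute nothing. hl.if_else is the current name for hl.cond.
import hail as hl

t = hl.utils.range_matrix_table(1, 10)
result = t.select_rows(result=hl.agg.explode(
    lambda elt: hl.agg.collect(elt + 1),
    hl.if_else(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
               hl.empty_array(hl.tint32)))).result.collect()[0]
print(result)  # [9, 10, 10, 11]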
Example #2
 def test_agg_cols_group_by(self):
     t = hl.utils.range_matrix_table(1, 10)
     tests = [
         (agg.group_by(
             t.col_idx % 2,
             hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)), {
                 0: [1, 3, 5, 7, 9, 0],
                 1: [2, 4, 6, 8, 10, 0]
             }),
         (agg.group_by(
             t.col_idx % 3,
             agg.filter(
                 t.col_idx > 7,
                 hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))), {
                     0: [10, 0],
                     1: [0],
                     2: [9, 0]
                 }),
         (agg.group_by(
             t.col_idx % 3,
             agg.explode(
                 lambda elt: agg.collect(elt + 1).append(0),
                 hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1],
                         hl.empty_array(hl.tint32)))), {
                             0: [10, 11, 0],
                             1: [0],
                             2: [9, 10, 0]
                         }),
     ]
     for aggregation, expected in tests:
         self.assertEqual(
             t.select_rows(result=aggregation).result.collect()[0],
             expected)
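
# The grouping logic above in isolation, assuming a Hail session: agg.group_by
# keys one aggregation by another expression and yields a dict per row.
import hail as hl

t = hl.utils.range_matrix_table(1, 10)
grouped = t.select_rows(
    result=hl.agg.group_by(t.col_idx % 2, hl.agg.count())).result.collect()[0]
print(grouped)  # {0: 5, 1: 5}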
Example #3
def add_strand_flip_annotation(reference_ref, reference_alt, ds_a1, ds_a2):
    """ Document me here :)
    """
    is_strand_ambig = hl.is_strand_ambiguous(ds_a1, ds_a2)
    ds_a1_flipped = flip_strand(ds_a1)
    ds_a2_flipped = flip_strand(ds_a2)
    is_snp = hl.is_snp(ds_a1, ds_a2)
    null = hl.null(hl.tbool)

    return (hl.case().when(
        (ds_a1 == reference_alt) & (ds_a2 == reference_ref),
        hl.cond(is_strand_ambig, [
            hl.struct(swap=True, flip=True),
            hl.struct(swap=False, flip=False)
        ], [hl.struct(swap=False, flip=False)])).when(
            (ds_a1 == reference_ref) & (ds_a2 == reference_alt),
            hl.cond(is_strand_ambig, [
                hl.struct(swap=True, flip=False),
                hl.struct(swap=False, flip=True)
            ], [hl.struct(swap=True, flip=False)])).when(
                (ds_a1_flipped == reference_alt) &
                (ds_a2_flipped == reference_ref) & is_snp,
                [hl.struct(swap=False, flip=True)]).when(
                    (ds_a1_flipped == reference_ref) &
                    (ds_a2_flipped == reference_alt) & is_snp,
                    [hl.struct(swap=True, flip=True)]).default(
                        hl.empty_array(hl.tstruct(swap=hl.tbool,
                                                  flip=hl.tbool))))
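
# The function above assumes a flip_strand helper defined elsewhere; a
# plausible stand-in complements each base. The call below is hypothetical and
# exercises the first branch (A1 matches the reference ALT, and G/A is not
# strand-ambiguous), so a single swap/flip combination comes back.
import hail as hl

_COMPLEMENT = hl.dict({"A": "T", "T": "A", "C": "G", "G": "C"})

def flip_strand(allele):
    return _COMPLEMENT.get(allele)

print(hl.eval(add_strand_flip_annotation("A", "G", "G", "A")))
# [Struct(swap=False, flip=False)]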
Example #4
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-level-pext",
        help="Path to Hail table with base-level data",
        default=
        "gs://gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.ht",
    )
    parser.add_argument(
        "--low-max-pext-genes",
        help="Path to table containing list of genes with low max pext",
        default=
        "gs://gnomad-public/papers/2019-tx-annotation/data/GRCH37_hg19/max_pext_low_genes.021520.tsv",
    )
    parser.add_argument(
        "output_path", help="Path to output Hail table with region-level data")
    args = parser.parse_args()

    ds = prepare_pext_data(args.base_level_pext)

    low_max_pext_genes = hl.import_table(args.low_max_pext_genes)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.cond(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    ds.write(args.output_path)
Example #5
    def test_explode_cols(self):
        mt = hl.utils.range_matrix_table(4, 4)
        mt = mt.annotate_entries(e=mt.row_idx * 10 + mt.col_idx)

        self.assertTrue(mt.annotate_cols(x=[1]).explode_cols('x').drop('x')._same(mt))

        self.assertEqual(mt.annotate_cols(x=hl.empty_array('int')).explode_cols('x').count_cols(), 0)
        self.assertEqual(mt.annotate_cols(x=hl.null('array<int>')).explode_cols('x').count_cols(), 0)
        self.assertEqual(mt.annotate_cols(x=hl.range(0, mt.col_idx)).explode_cols('x').count_cols(), 6)
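
# Why the last assertion yields 6 columns, assuming a Hail session:
# hl.range(0, col_idx) has length col_idx, and exploding the four columns
# gives 0 + 1 + 2 + 3 = 6 entries.
import hail as hl

print(hl.eval(hl.sum(hl.range(0, 4))))  # 6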
Example #7
 def test_agg_explode(self):
     t = hl.Table.parallelize([
         hl.struct(a=[1, 2]),
         hl.struct(a=hl.empty_array(hl.tint32)),
         hl.struct(a=hl.null(hl.tarray(hl.tint32))),
         hl.struct(a=[3]),
         hl.struct(a=[hl.null(hl.tint32)])
     ])
     self.assertCountEqual(t.aggregate(hl.agg.collect(hl.agg.explode(t.a))),
                           [1, 2, None, 3])
Example #8
 def test_agg_explode(self):
     t = hl.Table.parallelize([
         hl.struct(a=[1, 2]),
         hl.struct(a=hl.empty_array(hl.tint32)),
         hl.struct(a=hl.null(hl.tarray(hl.tint32))),
         hl.struct(a=[3]),
         hl.struct(a=[hl.null(hl.tint32)])
     ])
     self.assertCountEqual(t.aggregate(hl.agg.explode(lambda elt: hl.agg.collect(elt), t.a)),
                           [1, 2, None, 3])
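
# Examples 7 and 8 show two spellings of the same aggregation: the older
# expression form hl.agg.explode(t.a) and the current lambda form
# hl.agg.explode(lambda elt: hl.agg.collect(elt), t.a). A minimal check of the
# shared semantics, assuming a Hail session:
import hail as hl

t = hl.Table.parallelize([hl.struct(a=[1, 2]), hl.struct(a=[3])])
print(sorted(t.aggregate(hl.agg.explode(lambda elt: hl.agg.collect(elt), t.a))))
# [1, 2, 3]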
Example #9
def prepare_gene_models():
    genes_grch37 = prepare_gene_models_helper("GRCh37")
    genes_grch38 = prepare_gene_models_helper("GRCh38")

    genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value)
    genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value)

    genes = genes_grch37.join(genes_grch38, how="outer")

    # Annotate genes with information from HGNC
    hgnc_path = pipeline_config.get("reference_data", "hgnc_path")
    hgnc = load_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol)),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr)))
            .append(genes.GRCh38.gencode_gene_symbol)
            .append(genes.GRCh37.gencode_gene_symbol)
            .filter(hl.is_defined)
            .map(lambda s: s.upper())
        ),
    )

    gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path")
    gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path)
    genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id])

    exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path")
    exac_constraint = prepare_exac_constraint(exac_constraint_path)
    genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id])

    staging_path = pipeline_config.get("output", "staging_path")

    genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
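
# The search-term pattern above in isolation, assuming a Hail session: start
# from a typed empty array, add candidate symbols (some possibly missing),
# drop missing values, and de-duplicate with hl.set.
import hail as hl

terms = hl.set(
    hl.empty_array(hl.tstr)
    .append(hl.str("Brca1"))
    .extend(hl.or_else(hl.null(hl.tarray(hl.tstr)), hl.empty_array(hl.tstr)))
    .append(hl.null(hl.tstr))
    .filter(hl.is_defined)
    .map(lambda s: s.upper()))
print(hl.eval(terms))  # {'BRCA1'}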
Example #10
def prepare_pext_data(base_level_pext_path, low_max_pext_genes_path):
    ds = prepare_base_level_pext(base_level_pext_path)

    low_max_pext_genes = hl.import_table(low_max_pext_genes_path)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.if_else(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    return ds
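
# A quick check of the flag pattern, with hypothetical gene IDs: set
# membership selects between a one-flag literal array and a typed empty array.
import hail as hl

low_genes = hl.set({"ENSG00000001", "ENSG00000002"})  # hypothetical IDs
flags = hl.if_else(low_genes.contains("ENSG00000001"),
                   hl.literal(["low_max_pext"]),
                   hl.empty_array(hl.tstr))
print(hl.eval(flags))  # ['low_max_pext']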
Example #11
    def test_agg_cols_explode(self):
        t = hl.utils.range_matrix_table(1, 10)

        tests = [(agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                              hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
                  [9, 10, 10, 11, 0]),
                 (agg.explode(lambda elt: agg.explode(lambda elt2: agg.collect(elt2 + 1).append(0),
                                                      [elt, elt + 1]),
                              hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
                  [9, 10, 10, 11, 10, 11, 11, 12, 0]),
                 (agg.explode(lambda elt: agg.filter(elt > 8,
                                                     agg.collect(elt + 1).append(0)),
                              hl.cond(t.col_idx > 7, [t.col_idx, t.col_idx + 1], hl.empty_array(hl.tint32))),
                  [10, 10, 11, 0]),
                 (agg.explode(lambda elt: agg.group_by(elt % 3,
                                                       agg.collect(elt + 1).append(0)),
                                           hl.cond(t.col_idx > 7,
                                                   [t.col_idx, t.col_idx + 1],
                                                   hl.empty_array(hl.tint32))),
                  {0: [10, 10, 0], 1: [11, 0], 2: [9, 0]})
                 ]
        for aggregation, expected in tests:
            self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
Example #12
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("gencode")
    parser.add_argument("canonical_transcripts")
    parser.add_argument("hgnc")
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Load genes from GTF file
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(args.canonical_transcripts, min_partitions=args.min_partitions)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.drop("transcripts")

    # Annotate genes with information from HGNC
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(genes.previous_symbols)
            .extend(genes.alias_symbols)
            .append(genes.gencode_gene_symbol)
            .map(lambda s: s.upper())
        ),
    )

    genes.describe()

    genes.write(args.output, overwrite=True)
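
# The symbol fallback above in one line, assuming a Hail session: hl.or_else
# keeps the HGNC symbol when defined and otherwise uses the Gencode symbol.
import hail as hl

print(hl.eval(hl.or_else(hl.null(hl.tstr), "OR4F5")))  # 'OR4F5'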
Example #13
def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(
        hl.struct(
            x=hl.dict({3: 'a', 4: 'b', 5: 'c'}),
            y=hl.array([3, 4, 5]),
            z=hl.set([3, 4, 5, 3])))
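
# A plausible definition of the helper under test (an assumption; the real
# assert_round_trip lives in the test suite): evaluating an expression and
# re-wrapping the result as a literal of the same type should be lossless.
import hail as hl

def assert_round_trip(expr):
    value = hl.eval(expr)
    assert hl.eval(hl.literal(value, dtype=expr.dtype)) == value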
Example #14
 def test_agg_cols_group_by(self):
     t = hl.utils.range_matrix_table(1, 10)
     tests = [(agg.group_by(t.col_idx % 2,
                            hl.array(agg.collect_as_set(t.col_idx + 1)).append(0)),
               {0: [1, 3, 5, 7, 9, 0], 1: [2, 4, 6, 8, 10, 0]}),
              (agg.group_by(t.col_idx % 3,
                            agg.filter(t.col_idx > 7,
                                       hl.array(agg.collect_as_set(t.col_idx + 1)).append(0))),
               {0: [10, 0], 1: [0], 2: [9, 0]}),
              (agg.group_by(t.col_idx % 3,
                            agg.explode(lambda elt: agg.collect(elt + 1).append(0),
                                        hl.cond(t.col_idx > 7,
                                                [t.col_idx, t.col_idx + 1],
                                                hl.empty_array(hl.tint32)))),
               {0: [10, 11, 0], 1: [0], 2: [9, 10, 0]}),
              ]
     for aggregation, expected in tests:
          self.assertEqual(t.select_rows(result=aggregation).result.collect()[0], expected)
Example #15
mnvs = mnvs.annotate(
    related_mnvs=mnvs.related_mnvs.map(
        lambda related_mnv: related_mnv.select(
            "combined_variant_id",
            "n_individuals",
            "other_constituent_snvs",
            changes_amino_acids=hl.bind(
                lambda mnv_consequences, related_mnv_consequences: mnv_consequences.key_set()
                .union(related_mnv_consequences.key_set())
                .any(lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
                hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
            ),
        )
    )
)

mnvs_3bp = mnvs_3bp.annotate(
    related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))

mnvs = mnvs.union(mnvs_3bp)

mnvs = mnvs.repartition(8, shuffle=True)

mnvs = mnvs.key_by()

mnvs.write(args.output_url)
Example #16
def combine_datasets(dataset_ids):
    gene_models_path = f"{pipeline_config.get('output', 'staging_path')}/gene_models.ht"
    ds = hl.read_table(gene_models_path)

    ds = ds.annotate(gene_results=hl.struct(), variants=hl.struct())
    ds = ds.annotate_globals(
        meta=hl.struct(variant_fields=VARIANT_FIELDS, datasets=hl.struct()))

    for dataset_id in dataset_ids:
        dataset_path = os.path.join(
            pipeline_config.get("output", "staging_path"), dataset_id.lower())
        gene_results = hl.read_table(
            os.path.join(dataset_path, "gene_results.ht"))

        gene_group_result_field_names = gene_results.group_results.dtype.value_type.fields
        gene_group_result_field_types = [
            str(typ).rstrip("3264")
            for typ in gene_results.group_results.dtype.value_type.types
        ]
        gene_result_analysis_groups = list(
            gene_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set,
                               gene_results.group_results.keys())))

        gene_results = gene_results.annotate(group_results=hl.array([
            hl.tuple([
                gene_results.group_results.get(group)[field]
                for field in gene_group_result_field_names
            ]) for group in gene_result_analysis_groups
        ]))

        ds = ds.annotate(gene_results=ds.gene_results.annotate(
            **{dataset_id: gene_results[ds.gene_id]}))

        variant_results = hl.read_table(
            os.path.join(dataset_path, "variant_results.ht"))

        reference_genome = variant_results.locus.dtype.reference_genome.name
        variant_info_field_names = variant_results.info.dtype.fields
        variant_info_field_types = [
            str(typ).rstrip("3264") for typ in variant_results.info.dtype.types
        ]
        variant_group_result_field_names = variant_results.group_results.dtype.value_type.fields
        variant_group_result_field_types = [
            str(typ).rstrip("3264")
            for typ in variant_results.group_results.dtype.value_type.types
        ]
        variant_result_analysis_groups = list(
            variant_results.aggregate(
                hl.agg.explode(hl.agg.collect_as_set,
                               variant_results.group_results.keys())))

        variant_results = variant_results.annotate(
            info=hl.tuple([
                variant_results.info[field]
                for field in variant_info_field_names
            ]),
            group_results=hl.array([
                hl.rbind(
                    variant_results.group_results.get(group),
                    lambda group_result: hl.or_missing(
                        hl.is_defined(group_result),
                        hl.tuple([
                            group_result[field]
                            for field in variant_group_result_field_names
                        ]),
                    ),
                ) for group in variant_result_analysis_groups
            ]),
        )

        variant_results = variant_results.annotate(
            variant_id=variant_results.locus.contig.replace("^chr", "") + "-" +
            hl.str(variant_results.locus.position) + "-" +
            variant_results.alleles[0] + "-" + variant_results.alleles[1],
            pos=variant_results.locus.position,
        )

        variant_results = variant_results.annotate(variant=hl.tuple(
            [variant_results[field] for field in VARIANT_FIELDS]))
        variant_results = variant_results.group_by("gene_id").aggregate(
            variants=hl.agg.collect(variant_results.variant))
        ds = ds.annotate(variants=ds.variants.annotate(
            **{
                dataset_id:
                hl.or_else(
                    variant_results[ds.gene_id].variants,
                    hl.empty_array(
                        variant_results.variants.dtype.element_type),
                )
            }))

        ds = ds.annotate_globals(meta=ds.globals.meta.annotate(
            datasets=ds.globals.meta.datasets.annotate(
                **{
                    dataset_id: hl.struct(
                        reference_genome=reference_genome,
                        gene_result_analysis_groups=gene_result_analysis_groups or hl.empty_array(hl.tstr),
                        gene_group_result_field_names=gene_group_result_field_names or hl.empty_array(hl.tstr),
                        gene_group_result_field_types=gene_group_result_field_types or hl.empty_array(hl.tstr),
                        variant_info_field_names=variant_info_field_names or hl.empty_array(hl.tstr),
                        variant_info_field_types=variant_info_field_types or hl.empty_array(hl.tstr),
                        variant_result_analysis_groups=variant_result_analysis_groups or hl.empty_array(hl.tstr),
                        variant_group_result_field_names=variant_group_result_field_names or hl.empty_array(hl.tstr),
                        variant_group_result_field_types=variant_group_result_field_types or hl.empty_array(hl.tstr),
                    ),
                })))

    return ds
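
# The str(typ).rstrip("3264") calls in combine_datasets strip the bit width
# from Hail type names, e.g. "int32" -> "int" and "float64" -> "float":
import hail as hl

print(str(hl.tint32).rstrip("3264"))    # 'int'
print(str(hl.tfloat64).rstrip("3264"))  # 'float'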
Example #17
def prepare_gnomad_v2_variants_helper(path, exome_or_genome):
    ds = hl.read_table(path)

    ###############
    # Frequencies #
    ###############

    g = hl.eval(ds.globals)

    subsets = ["gnomad", "controls", "non_neuro", "non_topmed"] + (["non_cancer"] if exome_or_genome == "exome" else [])

    ds = ds.select_globals()

    ds = ds.annotate(
        freq=hl.struct(
            **{
                subset: hl.struct(
                    ac=ds.freq[g.freq_index_dict[subset]].AC,
                    ac_raw=ds.freq[g.freq_index_dict[f"{subset}_raw"]].AC,
                    an=ds.freq[g.freq_index_dict[subset]].AN,
                    hemizygote_count=hl.if_else(ds.nonpar, ds.freq[g.freq_index_dict[f"{subset}_male"]].AC, 0),
                    homozygote_count=ds.freq[g.freq_index_dict[subset]].homozygote_count,
                    populations=population_frequencies_expression(ds, g.freq_index_dict, subset),
                )
                for subset in subsets
            }
        )
    )

    # If a variant is not present in a subset, do not store population frequencies for that subset
    ds = ds.annotate(
        freq=ds.freq.annotate(
            **{
                subset: ds.freq[subset].annotate(
                    populations=hl.if_else(
                        ds.freq[subset].ac_raw == 0,
                        hl.empty_array(ds.freq[subset].populations.dtype.element_type),
                        ds.freq[subset].populations,
                    )
                )
                for subset in subsets
            }
        )
    )

    ###########################################
    # Subsets in which the variant is present #
    ###########################################

    ds = ds.annotate(
        subsets=hl.set(
            hl.array([(subset, ds.freq[subset].ac_raw > 0) for subset in subsets])
            .filter(lambda t: t[1])
            .map(lambda t: t[0])
        )
    )

    if exome_or_genome == "genome":
        ds = ds.annotate(subsets=ds.subsets.add("non_cancer"))

    ##############################
    # Filtering allele frequency #
    ##############################

    ds = ds.annotate(
        freq=ds.freq.annotate(
            **{
                subset: ds.freq[subset].annotate(
                    faf95=hl.rbind(
                        hl.sorted(
                            hl.array(
                                [
                                    hl.struct(
                                        faf=ds.faf[g.faf_index_dict[f"{subset}_{pop_id}"]].faf95, population=pop_id,
                                    )
                                    for pop_id in (
                                        ["afr", "amr", "eas", "nfe"] + (["sas"] if exome_or_genome == "exome" else [])
                                    )
                                ]
                            ).filter(lambda f: f.faf > 0),
                            key=lambda f: (-f.faf, f.population),
                        ),
                        lambda fafs: hl.if_else(
                            hl.len(fafs) > 0,
                            hl.struct(popmax=fafs[0].faf, popmax_population=fafs[0].population,),
                            hl.struct(popmax=hl.null(hl.tfloat), popmax_population=hl.null(hl.tstr),),
                        ),
                    ),
                    faf99=hl.rbind(
                        hl.sorted(
                            hl.array(
                                [
                                    hl.struct(
                                        faf=ds.faf[g.faf_index_dict[f"{subset}_{pop_id}"]].faf99, population=pop_id,
                                    )
                                    for pop_id in (
                                        ["afr", "amr", "eas", "nfe"] + (["sas"] if exome_or_genome == "exome" else [])
                                    )
                                ]
                            ).filter(lambda f: f.faf > 0),
                            key=lambda f: (-f.faf, f.population),
                        ),
                        lambda fafs: hl.if_else(
                            hl.len(fafs) > 0,
                            hl.struct(popmax=fafs[0].faf, popmax_population=fafs[0].population,),
                            hl.struct(popmax=hl.null(hl.tfloat), popmax_population=hl.null(hl.tstr),),
                        ),
                    ),
                )
                for subset in subsets
            }
        ),
    )

    ds = ds.drop("faf")

    ####################
    # Age distribution #
    ####################

    # Format age distributions
    ds = ds.transmute(
        age_distribution=hl.struct(
            **{
                subset: hl.struct(het=ds.age_hist_het[index], hom=ds.age_hist_hom[index],)
                for subset, index in g.age_index_dict.items()
            },
        )
    )

    ###################
    # Quality metrics #
    ###################

    ds = ds.transmute(
        quality_metrics=hl.struct(
            allele_balance=hl.struct(
                alt_raw=ds.ab_hist_alt.annotate(
                    bin_edges=ds.ab_hist_alt.bin_edges.map(lambda n: hl.float(hl.format("%.3f", n)))
                )
            ),
            genotype_depth=hl.struct(all_raw=ds.dp_hist_all, alt_raw=ds.dp_hist_alt),
            genotype_quality=hl.struct(all_raw=ds.gq_hist_all, alt_raw=ds.gq_hist_alt),
            # Use the same fields as the VCFs
            # Based https://github.com/macarthur-lab/gnomad_qc/blob/25a81bc2166fbe4ccbb2f7a87d36aba661150413/variant_qc/prepare_data_release.py#L128-L159
            site_quality_metrics=[
                hl.struct(metric="BaseQRankSum", value=ds.allele_info.BaseQRankSum),
                hl.struct(metric="ClippingRankSum", value=ds.allele_info.ClippingRankSum),
                hl.struct(metric="DP", value=hl.float(ds.allele_info.DP)),
                hl.struct(metric="FS", value=ds.info_FS),
                hl.struct(metric="InbreedingCoeff", value=ds.info_InbreedingCoeff),
                hl.struct(metric="MQ", value=ds.info_MQ),
                hl.struct(metric="MQRankSum", value=ds.info_MQRankSum),
                hl.struct(metric="pab_max", value=ds.pab_max),
                hl.struct(metric="QD", value=ds.info_QD),
                hl.struct(metric="ReadPosRankSum", value=ds.info_ReadPosRankSum),
                hl.struct(metric="RF", value=ds.rf_probability),
                hl.struct(metric="SiteQuality", value=ds.qual),
                hl.struct(metric="SOR", value=ds.info_SOR),
                hl.struct(metric="VQSLOD", value=ds.allele_info.VQSLOD),
                hl.struct(metric="VQSR_NEGATIVE_TRAIN_SITE", value=hl.float(ds.info_NEGATIVE_TRAIN_SITE)),
                hl.struct(metric="VQSR_POSITIVE_TRAIN_SITE", value=hl.float(ds.info_POSITIVE_TRAIN_SITE)),
            ],
        )
    )

    #################
    # Unused fields #
    #################

    ds = ds.drop(
        "adj_biallelic_rank",
        "adj_biallelic_singleton_rank",
        "adj_rank",
        "adj_singleton_rank",
        "allele_type",
        "biallelic_rank",
        "biallelic_singleton_rank",
        "has_star",
        "info_DP",
        "mills",
        "n_alt_alleles",
        "n_nonref",
        "omni",
        "popmax",
        "qd",
        "rank",
        "score",
        "singleton_rank",
        "singleton",
        "transmitted_singleton",
        "variant_type",
        "was_mixed",
        "was_split",
    )

    # These two fields appear only in the genomes table
    if "_score" in ds.row_value.dtype.fields:
        ds = ds.drop("_score", "_singleton")

    ds = ds.select(**{exome_or_genome: ds.row_value})

    return ds
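
# The popmax pattern above in isolation, assuming a Hail session: drop zero
# FAFs, sort descending by value (population breaks ties), and take the head.
import hail as hl

fafs = hl.sorted(
    hl.array([hl.struct(faf=0.01, population="nfe"),
              hl.struct(faf=0.03, population="afr")]).filter(lambda f: f.faf > 0),
    key=lambda f: (-f.faf, f.population))
print(hl.eval(fafs[0].population))  # 'afr'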
Example #18
@pytest.mark.parametrize(
    "input_regions,expected_output_regions",
    [
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=7, stop=12),
                hl.utils.Struct(start=10, stop=11),
            ]),
            [hl.utils.Struct(start=5, stop=12)],
        ),
        (
            hl.literal([
                hl.utils.Struct(start=5, stop=10),
                hl.utils.Struct(start=11, stop=14),
                hl.utils.Struct(start=17, stop=22),
                hl.utils.Struct(start=22, stop=24),
            ]),
            [
                hl.utils.Struct(start=5, stop=14),
                hl.utils.Struct(start=17, stop=24),
            ],
        ),
        (hl.empty_array(hl.tstruct(start=hl.tint, stop=hl.tint)), []),
    ],
)
def test_merge_overlapping_regions(input_regions, expected_output_regions):
    assert hl.eval(
        merge_overlapping_regions(input_regions)) == expected_output_regions
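
# A plausible implementation of the function under test (an assumption; the
# real merge_overlapping_regions lives in the module being tested). It folds
# over regions sorted by start, merging a region into the previous one when
# they overlap or touch.
import hail as hl

def merge_overlapping_regions(regions):
    return hl.fold(
        lambda acc, region: hl.if_else(
            hl.len(acc) == 0,
            acc.append(region),
            hl.if_else(
                region.start <= acc[-1].stop + 1,
                acc[:-1].append(acc[-1].annotate(stop=hl.max(acc[-1].stop, region.stop))),
                acc.append(region))),
        hl.empty_array(regions.dtype.element_type),
        regions)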
Example #19
def get_reference_ht(
    ref: hl.ReferenceGenome,
    contigs: Optional[List[str]] = None,
    excluded_intervals: Optional[List[hl.Interval]] = None,
    add_all_substitutions: bool = False,
    filter_n: bool = True,
) -> hl.Table:
    """
    Creates a reference Table with locus and alleles (containing only the reference allele by default) from the given reference genome.
    .. note::
        If the `contigs` argument is not provided, all contigs (including obscure ones) will be added to the table.
        This can be slow as contigs are added one by one.
    :param ref: Input reference genome
    :param contigs: An optional list of contigs that the Table should include
    :param excluded_intervals: An optional list of intervals to exclude
    :param add_all_substitutions: If set, then all possible substitutions are added in the alleles array
    :param filter_n: If set, bases where the reference is unknown (n) are filtered.
    :return:
    """
    if not ref.has_sequence():
        add_reference_sequence(ref)

    if not contigs:
        contigs = ref.contigs

    if add_all_substitutions:
        SUBSTITUTIONS_TABLE = hl.literal(
            {
                "a": ["c", "g", "t"],
                "c": ["a", "g", "t"],
                "g": ["a", "c", "t"],
                "t": ["a", "c", "g"],
            }
        )

    context = []
    for contig in contigs:
        n_partitions = max(1, int(ref.contig_length(contig) / 5000000))
        logger.info(
            f"Creating reference contig {contig} with {n_partitions} partitions."
        )
        _context = hl.utils.range_table(
            ref.contig_length(contig), n_partitions=n_partitions
        )

        locus_expr = hl.locus(contig=contig, pos=_context.idx + 1, reference_genome=ref)
        ref_allele_expr = locus_expr.sequence_context().lower()
        if add_all_substitutions:
            alleles_expr = hl.array([ref_allele_expr]).extend(
                SUBSTITUTIONS_TABLE.get(ref_allele_expr, hl.empty_array(hl.tstr))
            )
        else:
            alleles_expr = [ref_allele_expr]

        _context = (
            _context.select(locus=locus_expr, alleles=alleles_expr)
            .key_by("locus", "alleles")
            .drop("idx")
        )

        if excluded_intervals is not None:
            _context = hl.filter_intervals(_context, excluded_intervals, keep=False)

        if filter_n:
            _context = _context.filter(_context.alleles[0] == "n", keep=False)

        context.append(_context)

    return context.pop().union(*context)
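
# The alleles logic above in isolation, assuming a Hail session: look up the
# reference base in the substitution table and fall back to a typed empty
# array for bases with no entry (such as "n").
import hail as hl

subs = hl.literal({"a": ["c", "g", "t"]})
print(hl.eval(hl.array(["a"]).extend(subs.get("a", hl.empty_array(hl.tstr)))))
# ['a', 'c', 'g', 't']
print(hl.eval(hl.array(["n"]).extend(subs.get("n", hl.empty_array(hl.tstr)))))
# ['n']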
Example #20
 def field_to_array(ds, field):
     return hl.cond(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))
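
# Usage sketch for the helper above, assuming a Hail session and a
# hypothetical indicator field: nonzero values become a one-element array
# holding the field name, zeros become a typed empty array.
import hail as hl

t = hl.utils.range_table(3)
t = t.annotate(myflag=t.idx % 2)
t = t.annotate(flags=field_to_array(t, "myflag"))
print(t.flags.collect())  # [[], ['myflag'], []]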
Example #21
def prepare_gnomad_v2_mnvs(mnvs_path, three_bp_mnvs_path):
    mnvs = import_mnv_file(mnvs_path, quote="'")
    mnvs_3bp = import_three_bp_mnv_file(three_bp_mnvs_path, quote="'")

    snp12_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref, snv1.alt + snv2.alt,], "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[2].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp23_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv2, snv3: hl.delimit(
                [snv2.chrom, hl.str(snv2.pos), snv2.ref + snv3.ref, snv2.alt + snv3.alt,], "-",
            ),
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[0].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    snp13_components = mnvs_3bp.select(
        component_mnv=hl.bind(
            lambda snv1, snv2, snv3: hl.delimit(
                [snv1.chrom, hl.str(snv1.pos), snv1.ref + snv2.ref + snv3.ref, snv1.alt + snv2.ref + snv3.alt,], "-",
            ),
            mnvs_3bp.constituent_snvs[0],
            mnvs_3bp.constituent_snvs[1],
            mnvs_3bp.constituent_snvs[2],
        ),
        related_mnv=hl.struct(
            combined_variant_id=mnvs_3bp.variant_id,
            n_individuals=mnvs_3bp.n_individuals,
            other_constituent_snvs=[mnvs_3bp.constituent_snvs[1].variant_id],
            consequences=mnvs_3bp.consequences,
        ),
    )
    component_2bp_mnvs = snp12_components.union(snp13_components).union(snp23_components)
    component_2bp_mnvs = component_2bp_mnvs.group_by(component_2bp_mnvs.component_mnv).aggregate(
        related_mnvs=hl.agg.collect(component_2bp_mnvs.related_mnv)
    )

    mnvs = mnvs.annotate(related_mnvs=component_2bp_mnvs[mnvs.variant_id].related_mnvs)
    mnvs = mnvs.annotate(
        related_mnvs=hl.or_else(mnvs.related_mnvs, hl.empty_array(mnvs.related_mnvs.dtype.element_type))
    )
    mnvs = mnvs.annotate(
        related_mnvs=mnvs.related_mnvs.map(
            lambda related_mnv: related_mnv.select(
                "combined_variant_id",
                "n_individuals",
                "other_constituent_snvs",
                changes_amino_acids=hl.bind(
                    lambda mnv_consequences, related_mnv_consequences: mnv_consequences.key_set()
                    .union(related_mnv_consequences.key_set())
                    .any(lambda gene_id: mnv_consequences.get(gene_id) != related_mnv_consequences.get(gene_id)),
                    hl.dict(mnvs.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                    hl.dict(related_mnv.consequences.map(lambda c: (c.gene_id, c.amino_acids.lower()))),
                ),
            )
        )
    )

    mnvs_3bp = mnvs_3bp.annotate(related_mnvs=hl.empty_array(mnvs.related_mnvs.dtype.element_type))

    mnvs = mnvs.union(mnvs_3bp)

    return mnvs
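
# The component-MNV ID construction in isolation, assuming a Hail session:
# hl.bind names the constituent SNV structs and hl.delimit joins the ID parts
# with "-". The coordinates below are made up.
import hail as hl

snv1 = hl.struct(chrom="chr1", pos=10000, ref="A", alt="G")
snv2 = hl.struct(chrom="chr1", pos=10001, ref="C", alt="T")
variant_id = hl.bind(
    lambda a, b: hl.delimit(
        [a.chrom, hl.str(a.pos), a.ref + b.ref, a.alt + b.alt], "-"),
    snv1, snv2)
print(hl.eval(variant_id))  # 'chr1-10000-AC-GT'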
Example #22
    '#CHROM	POS	ID	REF	ALT	QUAL	FILTER	INFO	FORMAT	SAMPLE-1	SAMPLE-2	SAMPLE-3	SAMPLE-4	SAMPLE-5',
    'chr1	10000	DUP_chr1_1	N	<DUP>	999	LOW_CALL_RATE	END=17000;SVTYPE=DUP;CHR2=chr1;SVLEN=7000;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;NONCODING_BREAKPOINT=DNase;AN=1428;AC=370;AF=0.259104;N_BI_GENOS=714;N_HOMREF=415;N_HET=228;N_HOMALT=71;FREQ_HOMREF=0.581232;FREQ_HET=0.319328;FREQ_HOMALT=0.0994398;MALE_AN=772;MALE_AC=214;MALE_AF=0.277202;MALE_N_BI_GENOS=386;MALE_N_HOMREF=216;MALE_N_HET=126;MALE_N_HOMALT=44;MALE_FREQ_HOMREF=0.559586;MALE_FREQ_HET=0.326425;MALE_FREQ_HOMALT=0.11399;FEMALE_AN=656;FEMALE_AC=156;FEMALE_AF=0.237805;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=199;FEMALE_N_HET=102;FEMALE_N_HOMALT=27;FEMALE_FREQ_HOMREF=0.606707;FEMALE_FREQ_HET=0.310976;FEMALE_FREQ_HOMALT=0.0823171	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/1:999:3:999:.:.:.:.:RD	0/1:52:3:52:.:.:.:.:RD	0/1:19:3:19:.:.:.:.:RD	0/0:1:2:1:.:.:.:.:RD	0/0:31:2:31:.:.:.:.:RD',
    'chr1	10000	DUP_chr1_2	N	<DUP>	999	LOW_CALL_RATE;UNRESOLVED	END=53500;SVTYPE=DUP;CHR2=chr1;SVLEN=43500;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;LINCRNA__COPY_GAIN=FAM138A,MIR1302-2HG;NONCODING_SPAN=DNase;NONCODING_BREAKPOINT=DNase;AN=1428;AC=70;AF=0.04902;N_BI_GENOS=714;N_HOMREF=649;N_HET=60;N_HOMALT=5;FREQ_HOMREF=0.908964;FREQ_HET=0.0840336;FREQ_HOMALT=0.0070028;MALE_AN=772;MALE_AC=46;MALE_AF=0.059585;MALE_N_BI_GENOS=386;MALE_N_HOMREF=344;MALE_N_HET=38;MALE_N_HOMALT=4;MALE_FREQ_HOMREF=0.891192;MALE_FREQ_HET=0.0984456;MALE_FREQ_HOMALT=0.0103627;FEMALE_AN=656;FEMALE_AC=24;FEMALE_AF=0.036585;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=305;FEMALE_N_HET=22;FEMALE_N_HOMALT=1;FEMALE_FREQ_HOMREF=0.929878;FEMALE_FREQ_HET=0.0670732;FEMALE_FREQ_HOMALT=0.00304878	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:1:2:1:.:.:.:.:RD	0/1:119:3:119:.:.:.:.:RD	0/1:119:3:119:.:.:.:.:RD	0/0:999:2:999:.:.:.:.:RD	0/0:133:2:133:.:.:.:.:RD',
    'chr1	10602	BND_chr1_1	N	<BND>	461	UNRESOLVED;UNSTABLE_AF_PCRMINUS	END=10602;SVTYPE=BND;CHR2=chr12;STRANDS=+-;SVLEN=-1;ALGORITHMS=manta;EVIDENCE=SR;UNRESOLVED_TYPE=SINGLE_ENDER_+-;END2=10546;AN=1428;AC=88;AF=0.061625;N_BI_GENOS=714;N_HOMREF=626;N_HET=88;N_HOMALT=0;FREQ_HOMREF=0.876751;FREQ_HET=0.123249;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=51;MALE_AF=0.066062;MALE_N_BI_GENOS=386;MALE_N_HOMREF=335;MALE_N_HET=51;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.867876;MALE_FREQ_HET=0.132124;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=37;FEMALE_AF=0.056402;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=291;FEMALE_N_HET=37;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.887195;FEMALE_FREQ_HET=0.112805;FEMALE_FREQ_HOMALT=0;gnomAD_V2_SVID=gnomAD-SV_v2.1_BND_1_1;gnomAD_V2_AF=0.00678599998354912	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	.:999:.:.:0:23:0:999:PE,SR	0/0:999:.:.:0:23:0:999:PE,SR	.:999:.:.:0:1:0:999:PE,SR	0/0:999:.:.:0:3:0:999:PE,SR	.:999:.:.:0:23:0:999:PE,SR',
    'chr1	41950	DUP_chr1_3	N	<DUP>	999	LOW_CALL_RATE	END=52000;SVTYPE=DUP;CHR2=chr1;SVLEN=10050;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;AN=1428;AC=28;AF=0.019608;N_BI_GENOS=714;N_HOMREF=687;N_HET=26;N_HOMALT=1;FREQ_HOMREF=0.962185;FREQ_HET=0.0364146;FREQ_HOMALT=0.00140056;MALE_AN=772;MALE_AC=15;MALE_AF=0.01943;MALE_N_BI_GENOS=386;MALE_N_HOMREF=371;MALE_N_HET=15;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.96114;MALE_FREQ_HET=0.0388601;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=13;FEMALE_AF=0.019817;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=316;FEMALE_N_HET=11;FEMALE_N_HOMALT=1;FEMALE_FREQ_HOMREF=0.963415;FEMALE_FREQ_HET=0.0335366;FEMALE_FREQ_HOMALT=0.00304878;gnomAD_V2_SVID=gnomAD-SV_v2.1_DUP_1_1;gnomAD_V2_AF=0.068962998688221	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:31:2:31:.:.:.:.:RD	0/0:58:2:58:.:.:.:.:RD	0/0:1:2:1:.:.:.:.:RD	0/0:112:2:112:.:.:.:.:RD	0/0:999:2:999:.:.:.:.:RD',
    'chr1	44000	DUP_chr1_4	N	<DUP>	999	UNSTABLE_AF_PCRMINUS;LOW_CALL_RATE	END=66000;SVTYPE=DUP;CHR2=chr1;SVLEN=22000;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__DUP_PARTIAL=OR4F5;NONCODING_SPAN=DNase;AN=1428;AC=96;AF=0.067227;N_BI_GENOS=714;N_HOMREF=641;N_HET=50;N_HOMALT=23;FREQ_HOMREF=0.897759;FREQ_HET=0.070028;FREQ_HOMALT=0.0322129;MALE_AN=772;MALE_AC=54;MALE_AF=0.069948;MALE_N_BI_GENOS=386;MALE_N_HOMREF=345;MALE_N_HET=28;MALE_N_HOMALT=13;MALE_FREQ_HOMREF=0.893782;MALE_FREQ_HET=0.0725389;MALE_FREQ_HOMALT=0.0336788;FEMALE_AN=656;FEMALE_AC=42;FEMALE_AF=0.064024;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=296;FEMALE_N_HET=22;FEMALE_N_HOMALT=10;FEMALE_FREQ_HOMREF=0.902439;FEMALE_FREQ_HET=0.0670732;FEMALE_FREQ_HOMALT=0.0304878	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:125:1:125:.:.:.:.:RD	0/0:72:2:72:.:.:.:.:RD	0/0:130:2:130:.:.:.:.:RD	0/0:1:2:1:.:.:.:.:RD	0/0:1:2:1:.:.:.:.:RD',
    'chr1	44250	DUP_chr1_5	N	<DUP>	999	LOW_CALL_RATE	END=116000;SVTYPE=DUP;CHR2=chr1;SVLEN=71750;ALGORITHMS=depth;EVIDENCE=BAF,RD;PROTEIN_CODING__COPY_GAIN=OR4F5;LINCRNA__COPY_GAIN=AL627309.3;LINCRNA__DUP_PARTIAL=AL627309.1;NONCODING_SPAN=DNase;AN=1428;AC=82;AF=0.057423;N_BI_GENOS=714;N_HOMREF=646;N_HET=54;N_HOMALT=14;FREQ_HOMREF=0.904762;FREQ_HET=0.0756303;FREQ_HOMALT=0.0196078;MALE_AN=772;MALE_AC=43;MALE_AF=0.055699;MALE_N_BI_GENOS=386;MALE_N_HOMREF=351;MALE_N_HET=27;MALE_N_HOMALT=8;MALE_FREQ_HOMREF=0.909326;MALE_FREQ_HET=0.0699482;MALE_FREQ_HOMALT=0.0207254;FEMALE_AN=656;FEMALE_AC=39;FEMALE_AF=0.059451;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=295;FEMALE_N_HET=27;FEMALE_N_HOMALT=6;FEMALE_FREQ_HOMREF=0.89939;FEMALE_FREQ_HET=0.0823171;FEMALE_FREQ_HOMALT=0.0182927	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:1:1:1:.:.:.:.:RD	0/0:36:2:36:.:.:.:.:RD	0/0:94:2:94:.:.:.:.:RD	0/0:130:1:130:.:.:.:.:RD	0/0:999:1:999:.:.:.:.:RD',
    'chr1	51400	DEL_chr1_1	N	<DEL>	999	UNSTABLE_AF_PCRMINUS	END=64000;SVTYPE=DEL;CHR2=chr1;SVLEN=12600;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;AN=1428;AC=306;AF=0.214286;N_BI_GENOS=714;N_HOMREF=443;N_HET=236;N_HOMALT=35;FREQ_HOMREF=0.620448;FREQ_HET=0.330532;FREQ_HOMALT=0.0490196;MALE_AN=772;MALE_AC=156;MALE_AF=0.202073;MALE_N_BI_GENOS=386;MALE_N_HOMREF=246;MALE_N_HET=124;MALE_N_HOMALT=16;MALE_FREQ_HOMREF=0.637306;MALE_FREQ_HET=0.321244;MALE_FREQ_HOMALT=0.0414508;FEMALE_AN=656;FEMALE_AC=150;FEMALE_AF=0.228659;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=197;FEMALE_N_HET=112;FEMALE_N_HOMALT=19;FEMALE_FREQ_HOMREF=0.60061;FEMALE_FREQ_HET=0.341463;FEMALE_FREQ_HOMALT=0.0579268	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/1:125:1:125:.:.:.:.:RD	0/0:72:2:72:.:.:.:.:RD	0/0:112:2:112:.:.:.:.:RD	0/0:1:2:1:.:.:.:.:RD	0/0:8:2:8:.:.:.:.:RD',
    'chr1	52600	CNV_chr1_1	N	<CNV>	999	FAIL_minGQ	END=58000;SVTYPE=CNV;CHR2=chr1;SVLEN=5400;ALGORITHMS=depth;EVIDENCE=RD;PROTEIN_CODING__NEAREST_TSS=OR4F5;PROTEIN_CODING__INTERGENIC;NONCODING_SPAN=DNase;AN=0;AC=0;AF=0;MALE_AN=0;MALE_AC=0;MALE_AF=0;FEMALE_AN=0;FEMALE_AC=0;FEMALE_AF=0	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV:CN:CNQ	.:.:1:125:.:.:.:.:RD:1:125	.:.:2:130:.:.:.:.:RD:2:130	.:.:2:23:.:.:.:.:RD:2:23	.:.:2:1:.:.:.:.:RD:2:1	.:.:2:1:.:.:.:.:RD:2:1',
    'chr1	66234	BND_chr1_2	N	<BND>	807	UNRESOLVED	END=66234;SVTYPE=BND;CHR2=chr19;STRANDS=-+;SVLEN=-1;ALGORITHMS=manta;EVIDENCE=PE;UNRESOLVED_TYPE=SINGLE_ENDER_-+;END2=108051;AN=1428;AC=236;AF=0.165266;N_BI_GENOS=714;N_HOMREF=514;N_HET=164;N_HOMALT=36;FREQ_HOMREF=0.719888;FREQ_HET=0.229692;FREQ_HOMALT=0.0504202;MALE_AN=772;MALE_AC=131;MALE_AF=0.169689;MALE_N_BI_GENOS=386;MALE_N_HOMREF=275;MALE_N_HET=91;MALE_N_HOMALT=20;MALE_FREQ_HOMREF=0.712435;MALE_FREQ_HET=0.235751;MALE_FREQ_HOMALT=0.0518135;FEMALE_AN=656;FEMALE_AC=105;FEMALE_AF=0.160061;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=239;FEMALE_N_HET=73;FEMALE_N_HOMALT=16;FEMALE_FREQ_HOMREF=0.728659;FEMALE_FREQ_HET=0.222561;FEMALE_FREQ_HOMALT=0.0487805	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:999:.:.:0:23:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR',
    'chr1	1495464	CPX_chr1_1	N	<CPX>	999	PASS	END=1495554;SVTYPE=CPX;CHR2=chr1;SVLEN=184;ALGORITHMS=manta;EVIDENCE=PE;CPX_TYPE=dDUP;SOURCE=DUP_chr1:1533874-1534058;CPX_INTERVALS=DUP_chr1:1533874-1534058;PROTEIN_CODING__DUP_PARTIAL=ATAD3A;PROTEIN_CODING__INTRONIC=ATAD3A;AN=1428;AC=7;AF=0.004902;N_BI_GENOS=714;N_HOMREF=707;N_HET=7;N_HOMALT=0;FREQ_HOMREF=0.990196;FREQ_HET=0.00980392;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=4;MALE_AF=0.005181;MALE_N_BI_GENOS=386;MALE_N_HOMREF=382;MALE_N_HET=4;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.989637;MALE_FREQ_HET=0.0103627;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=3;FEMALE_AF=0.004573;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=325;FEMALE_N_HET=3;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.990854;FEMALE_FREQ_HET=0.00914634;FEMALE_FREQ_HOMALT=0	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/1:782:.:.:1:782:1:1:PE,SR',
    'chr1	1643228	INS_chr1_10	N	<INS:ME:SVA>	250	PASS	END=1643309;SVTYPE=INS;CHR2=chr1;SVLEN=169;ALGORITHMS=melt;EVIDENCE=SR;PROTEIN_CODING__INTRONIC=CDK11B;AN=1428;AC=11;AF=0.007703;N_BI_GENOS=714;N_HOMREF=703;N_HET=11;N_HOMALT=0;FREQ_HOMREF=0.984594;FREQ_HET=0.0154062;FREQ_HOMALT=0;MALE_AN=772;MALE_AC=5;MALE_AF=0.006477;MALE_N_BI_GENOS=386;MALE_N_HOMREF=381;MALE_N_HET=5;MALE_N_HOMALT=0;MALE_FREQ_HOMREF=0.987047;MALE_FREQ_HET=0.0129534;MALE_FREQ_HOMALT=0;FEMALE_AN=656;FEMALE_AC=6;FEMALE_AF=0.009146;FEMALE_N_BI_GENOS=328;FEMALE_N_HOMREF=322;FEMALE_N_HET=6;FEMALE_N_HOMALT=0;FEMALE_FREQ_HOMREF=0.981707;FEMALE_FREQ_HET=0.0182927;FEMALE_FREQ_HOMALT=0;gnomAD_V2_SVID=gnomAD-SV_v2.1_INS_1_47;gnomAD_V2_AF=0.00130899995565414	GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/0:999:.:.:0:999:0:24:PE,SR	0/0:999:.:.:0:999:0:999:PE,SR	0/1:1:.:.:0:999:1:1:SR',
]

NULL_STR_ARRAY = hl.null(hl.dtype('array<str>'))
EMPTY_STR_ARRAY = hl.empty_array(hl.dtype('str'))
NULL_INTERVALS = hl.null(
    hl.dtype('array<struct{type: str, chrom: str, start: int32, end: int32}>'))
VARIANT_CPX = hl.struct(variantId='CPX_chr1_1',
                        contig='1',
                        sc=7,
                        sf=0.004902,
                        sn=1428,
                        start=1495464,
                        end=1495554,
                        sv_callset_Het=7,
                        sv_callset_Hom=0,
                        gnomad_svs_ID=hl.null('str'),
                        gnomad_svs_AF=hl.null('float'),
                        pos=1495464,
                        filters=NULL_STR_ARRAY,
Example #23
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    parser.add_argument("--hgnc")
    parser.add_argument("--mane-select-transcripts")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    genes = None

    all_gencode_versions = [
        gencode_version for gencode_version, _, _ in args.gencode
    ]

    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(
            gtf_path, min_partitions=args.min_partitions)

        # Canonical transcripts file is a TSV with two columns: gene ID and transcript ID and no header row
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path,
            key="gene_id",
            min_partitions=args.min_partitions)
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[
                gencode_genes.gene_id].transcript_id)

        gencode_genes = gencode_genes.select(
            **{f"v{gencode_version}": gencode_genes.row_value})

        if not genes:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc, missing="")

    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        alias_symbols=hgnc["Alias symbols"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"],
                           hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")
    hgnc = hgnc.annotate(
        previous_symbols=hl.cond(
            hgnc.previous_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
        ),
        alias_symbols=hl.cond(
            hgnc.alias_symbols == "", hl.empty_array(hl.tstr),
            hgnc.alias_symbols.split(",").map(lambda s: s.strip())),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol),
                                                 "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            symbol=hl.or_else(
                genes.symbol,
                genes.gencode[f"v{gencode_version}"].gene_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(
                    genes.gencode[f"v{gencode_version}"].gene_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.empty_array(hl.tstr).append(genes.symbol).extend(
            genes.previous_symbols).extend(genes.alias_symbols),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(search_terms=hl.rbind(
            genes.gencode[f"v{gencode_version}"].gene_symbol,
            lambda symbol_in_gencode: hl.cond(
                hl.is_defined(symbol_in_gencode),
                genes.search_terms.append(symbol_in_gencode),
                genes.search_terms),
        ))

    genes = genes.annotate(
        search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    if args.mane_select_transcripts:
        mane_select_transcripts = hl.import_table(args.mane_select_transcripts,
                                                  force=True)
        mane_select_transcripts = mane_select_transcripts.select(
            gene_id=mane_select_transcripts.Ensembl_Gene.split("\\.")[0],
            matched_gene_version=mane_select_transcripts.Ensembl_Gene.split("\\.")[1],
            ensembl_id=mane_select_transcripts.Ensembl_nuc.split("\\.")[0],
            ensembl_version=mane_select_transcripts.Ensembl_nuc.split("\\.")[1],
            refseq_id=mane_select_transcripts.RefSeq_nuc.split("\\.")[0],
            refseq_version=mane_select_transcripts.RefSeq_nuc.split("\\.")[1],
        )
        mane_select_transcripts = mane_select_transcripts.key_by("gene_id")

        # For GRCh38 (Gencode >= 20) transcripts, use the MANE Select transcripts to annotate transcripts
        # with their matching RefSeq transcript.
        ensembl_to_refseq_map = {}
        for transcript in mane_select_transcripts.collect():
            ensembl_to_refseq_map[transcript.ensembl_id] = {
                transcript.ensembl_version:
                hl.Struct(refseq_id=transcript.refseq_id,
                          refseq_version=transcript.refseq_version)
            }

        ensembl_to_refseq_map = hl.literal(ensembl_to_refseq_map)

        for gencode_version in ["19", "29"]:
            if int(gencode_version) >= 20:
                transcript_annotation = lambda transcript: transcript.annotate(
                    **ensembl_to_refseq_map.get(
                        transcript.transcript_id,
                        hl.empty_dict(
                            hl.tstr,
                            hl.tstruct(refseq_id=hl.tstr,
                                       refseq_version=hl.tstr)),
                    ).get(
                        transcript.transcript_version,
                        hl.struct(refseq_id=hl.null(hl.tstr),
                                  refseq_version=hl.null(hl.tstr)),
                    ))
            else:
                transcript_annotation = lambda transcript: transcript.annotate(
                    refseq_id=hl.null(hl.tstr),
                    refseq_version=hl.null(hl.tstr))

            genes = genes.annotate(gencode=genes.gencode.annotate(
                **{
                    f"v{gencode_version}":
                    genes.gencode[f"v{gencode_version}"].annotate(
                        transcripts=genes.gencode[f"v{gencode_version}"]
                        .transcripts.map(transcript_annotation))
                }))

        # Annotate genes with their MANE Select transcript
        genes = genes.annotate(
            mane_select_transcript=mane_select_transcripts[genes.gene_id])

    genes.describe()

    genes.write(args.output, overwrite=True)
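
# The two-level default lookup used for the MANE Select annotation above, in
# isolation; the IDs are made up. dict.get with typed defaults degrades
# gracefully to missing RefSeq fields.
import hail as hl

m = hl.literal({"ENST0001": {"1": hl.Struct(refseq_id="NM_0001", refseq_version="2")}})
empty = hl.empty_dict(hl.tstr, hl.tstruct(refseq_id=hl.tstr, refseq_version=hl.tstr))
missing = hl.struct(refseq_id=hl.null(hl.tstr), refseq_version=hl.null(hl.tstr))
print(hl.eval(m.get("ENST0001", empty).get("1", missing).refseq_id))  # 'NM_0001'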
Example #24
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--gencode",
        action="append",
        default=[],
        metavar=("version", "gtf_path", "canonical_transcripts_path"),
        nargs=3,
        required=True,
    )
    parser.add_argument("--hgnc")
    parser.add_argument("--min-partitions", type=int, default=32)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    genes = None

    all_gencode_versions = [gencode_version for gencode_version, _, _ in args.gencode]

    for gencode_version, gtf_path, canonical_transcripts_path in args.gencode:
        gencode_genes = load_gencode_gene_models(gtf_path, min_partitions=args.min_partitions)

        # Canonical transcripts file is a TSV with two columns: gene ID and transcript ID and no header row
        canonical_transcripts = hl.import_table(
            canonical_transcripts_path, key="gene_id", min_partitions=args.min_partitions
        )
        gencode_genes = gencode_genes.annotate(
            canonical_transcript_id=canonical_transcripts[gencode_genes.gene_id].transcript_id
        )

        gencode_genes = gencode_genes.select(**{f"v{gencode_version}": gencode_genes.row_value})

        if not genes:
            genes = gencode_genes
        else:
            genes = genes.join(gencode_genes, "outer")

    genes = genes.select(gencode=genes.row_value)

    hgnc = hl.import_table(args.hgnc)

    # Fix for alternative HGNC column name
    if "Alias symbols" in hgnc.row.dtype.fields:
        hgnc = hgnc.rename({"Alias symbols": "Synonyms"})

    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"],
        synonyms=hgnc["Synonyms"],
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hgnc["Ensembl ID(supplied by Ensembl)"],
    )
    hgnc = hgnc.key_by("gene_id")
    hgnc = hgnc.annotate(
        previous_symbols=hl.cond(
            hgnc.previous_symbols == "",
            hl.empty_array(hl.tstr),
            hgnc.previous_symbols.split(",").map(lambda s: s.strip()),
        ),
        synonyms=hl.cond(
            hgnc.synonyms == "", hl.empty_array(hl.tstr), hgnc.synonyms.split(",").map(lambda s: s.strip())
        ),
    )

    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))

    # If an HGNC gene symbol was not present, use the symbol from Gencode
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            symbol=hl.or_else(genes.symbol, genes.gencode[f"v{gencode_version}"].gene_symbol),
            symbol_source=hl.cond(
                hl.is_missing(genes.symbol) & hl.is_defined(genes.gencode[f"v{gencode_version}"].gene_symbol),
                f"gencode (v{gencode_version})",
                genes.symbol_source,
            ),
        )

    # Collect all fields that can be used to search by gene name
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.empty_array(hl.tstr).append(genes.symbol).extend(genes.synonyms).extend(genes.previous_symbols),
    )
    for gencode_version in all_gencode_versions:
        genes = genes.annotate(
            search_terms=hl.rbind(
                genes.gencode[f"v{gencode_version}"].gene_symbol,
                lambda symbol_in_gencode: hl.cond(
                    hl.is_defined(symbol_in_gencode), genes.search_terms.append(symbol_in_gencode), genes.search_terms
                ),
            )
        )

    genes = genes.annotate(search_terms=hl.set(genes.search_terms.map(lambda s: s.upper())))

    genes.describe()

    genes.write(args.output, overwrite=True)
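
For reference, a hypothetical invocation of this script (script and file names are placeholders, not from the source):

# python prepare_gene_models.py \
#     --gencode 19 gencode.v19.gtf.bgz canonical_transcripts_v19.tsv \
#     --gencode 29 gencode.v29.gtf.bgz canonical_transcripts_v29.tsv \
#     --hgnc hgnc.tsv \
#     --output genes.ht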
Example No. 25

# Merging the female PAR region with the male PAR region
mt_x_par = femaleX_par.union_cols(maleX_par)

# Annotating the matrix tables with variant QC data
mt_x_list = [hl.variant_qc(mt, name='variant_qc') for mt in mt_x_list]

intervals = [
    hl.parse_locus_interval(x)
    for x in ['X:60001-2699520', 'X:154931044-155260560']
]
mt_x_list = [hl.filter_intervals(mt, intervals, keep=True) for mt in mt_x_list]

# Creating lists for variant call rate, hwe pval and maf in joined mt
mt_x_par = mt_x_par.annotate_rows(var_call_rate=hl.empty_array('float64'))
mt_x_par = mt_x_par.annotate_rows(hwe_pval=hl.empty_array('float64'))
mt_x_par = mt_x_par.annotate_rows(maf=hl.empty_array('float64'))

# Annotating var_call_rate list with variant call rates
for mt_next in mt_x_list:
    mt_x_par = mt_x_par.annotate_rows(
        var_call_rate=mt_x_par.var_call_rate.append(
            mt_next.index_rows(mt_x_par.row_key).variant_qc.call_rate))

# Annotating hwe_pval list with hwe pvals
for mt_next in mt_x_list:
    mt_x_par = mt_x_par.annotate_rows(hwe_pval=mt_x_par.hwe_pval.append((
        mt_next.index_rows(mt_x_par.row_key).variant_qc.p_value_hwe)))

# Annotating maf list with mafs
Example No. 26
# Iteratively combining per-site matrix tables into one joined matrix table
for i in range(len(mt_list) - 1):
    if i == 0:
        mt = mt_list[i].union_cols(mt_list[i + 1])
    else:
        mt = mt.union_cols(mt_list[i + 1])

# Creating a list of site IDs to annotate globals - for the purpose of keeping track of the order of variant data in variant arrays
siteIDs = hl.array(['ID', 'ID', 'ID'])

mt = mt.annotate_globals(location=siteIDs)

# Creating lists for variant call rate, hwe pval and maf in joined mt
mt = mt.annotate_rows(var_call_rate=hl.empty_array('float64'))
mt = mt.annotate_rows(hwe_pval=hl.empty_array('float64'))
mt = mt.annotate_rows(maf=hl.empty_array('float64'))

# Annotating var_call_rate list with variant call rates for each location
for mt_next in mt_list:
    mt = mt.annotate_rows(var_call_rate=mt.var_call_rate.append(
        mt_next.index_rows(mt.row_key).variant_qc.call_rate))

# Annotating hwe_pval list with hwe pvals for each location
for mt_next in mt_list:
    mt = mt.annotate_rows(hwe_pval=mt.hwe_pval.append(
        mt_next.index_rows(mt.row_key).variant_qc.p_value_hwe))
Example No. 27
def infer_families(
        kin_ht: hl.Table,  # the kinship hail table
        sex: Dict[str, bool],  # the dictionary of sexes
        i_col: str = 'i',  # the remaining arguments have defaults that can be overridden if needed
        j_col: str = 'j',
        pi_hat_col: str = 'pi_hat',
        ibd2_col: str = 'ibd2',
        ibd1_col: str = 'ibd1',
        ibd0_col: str = 'ibd0',
        first_degree_threshold: Tuple[float, float] = (0.4, 0.75),
        second_degree_threshold: Tuple[float, float] = (0.195, 0.3),
        ibd1_second_degree_threshold: float = 0.40,
        ibd2_parent_offspring_threshold: float = 0.30,
        ibd1_parent_offspring_threshold: float = 0.70,
        ibd0_parent_offspring_threshold: float = 0.15
) -> Tuple[hl.Pedigree, List[List[str]], Dict[str, Union[str, Tuple[str, str], None]]]:
    """
    Infers familial relationships from the results of pc_relate and sex information.
    Note that both kinship and ibd2 are needed in the pc_relate output.
    This function returns a pedigree containing trios inferred from the data. Family ID can be the same for multiple
    trios if one or more members of the trios are related (e.g. sibs, multi-generational family). Trios are ordered by family ID.
    Note that this function only returns complete trios defined as:
    one child, one father and one mother (sex is required for both parents)
    :param Table kin_ht: pc_relate output table
    :param dict of str -> bool sex: A dict containing the sex for each sample. True = female, False = male, None = unknown
    :param str i_col: Column containing the 1st sample id in the ibd table
    :param str j_col: Column containing the 2nd sample id in the ibd table
    #:param str kin_col: Column containing the kinship in the ibd table
    :param str pi_hat_col: Column containing the pi_hat in the ibd table
    :param str ibd2_col: Column containing ibd2 in the pc_relate table
    :param (float, float) first_degree_threshold: Lower/upper bounds for kin for 1st degree relatives
    :param (float, float) second_degree_threshold: Lower/upper bounds for kin for 2nd degree relatives
    :param float ibd2_parent_offspring_threshold: Upper bound on ibd2 for a parent/offspring
    :return: Pedigree containing all trios in the data
    :rtype: Pedigree
    """
    def get_fam_samples(
        sample: str,
        fam: Set[str],
        samples_rel: Dict[str, Set[str]],
    ) -> Set[str]:
        """
        Given a sample, its known family members and a dict linking samples to their relatives,
        recursively collects the set of samples that constitute this sample's family.
        :param str sample: sample
        :param set of str fam: the sample's known family members (usually starts out empty)
        :param dict of str -> set of str samples_rel: dict mapping each sample to its relatives
        :return: Family including the sample
        :rtype: set of str
        """
        fam.add(sample)
        for s2 in samples_rel[sample]:  # iterate through the sample's relatives
            if s2 not in fam:
                # recurse to pick up relatives of relatives that are not yet in the family
                fam = get_fam_samples(s2, fam, samples_rel)
        return fam

    def get_indexed_ibd(
            pc_relate_rows: List[hl.Struct]
    ) -> Tuple[Dict[Tuple[str, str], float], Dict[Tuple[str, str], float], Dict[Tuple[str, str], float]]:
        """
        Given rows from a pc_relate table, creates three dicts mapping each pair of individuals
        (lexically ordered) to their ibd2, ibd1 and ibd0 values, respectively.
        :param list of hl.Struct pc_relate_rows: Rows from a pc_relate table
        :return: Dicts of lexically ordered pairs of individuals -> ibd2, ibd1 and ibd0
        :rtype: (dict, dict, dict)
        """
        ibd2 = dict()
        ibd1 = dict()
        ibd0 = dict()
        for row in pc_relate_rows:
            pair = tuple(sorted((row[i_col], row[j_col])))
            ibd2[pair] = row[ibd2_col]
            ibd1[pair] = row[ibd1_col]
            ibd0[pair] = row[ibd0_col]

        return ibd2, ibd1, ibd0

    def get_parents(possible_parents: List[str],
                    relative_pairs: List[Tuple[str, str]],
                    sex: Dict[str, bool]) -> Union[Tuple[str, str], None]:
        """
        Given a list of possible parents for a sample (first degree relatives with low ibd2),
        looks for a single pair of samples that are unrelated to each other and have different sexes.
        If a single such pair is found, returns it as (father, mother).
        :param list of str possible_parents: Possible parents
        :param list of (str, str) relative_pairs: Pairs of relatives, used to check that parents aren't related to each other
        :param dict of str -> bool sex: Dict mapping samples to their sex (True = female, False = male, None or missing = unknown)
        :return: (father, mother) if found, `None` otherwise
        :rtype: (str, str) or None
        """
        parents = []
        logging.info(f"Found {len(possible_parents)} possible parent(s)")
        while len(possible_parents) > 1:  # test every pair of possible parents
            p1 = possible_parents.pop()
            for p2 in possible_parents:
                logging.info(str(tuple(sorted((p1, p2)))) + '\n')
                if tuple(sorted((p1, p2))) not in relative_pairs:
                    # the candidates are unrelated to each other; check that their sexes differ
                    logging.info("The potential parents don't appear to be related to each other\n")
                    logging.info("SEX p1: " + str(sex.get(p1)) + '\n')
                    logging.info("SEX p2: " + str(sex.get(p2)) + '\n')
                    if sex.get(p1) is False and sex.get(p2):
                        parents.append((p1, p2))
                        logging.info("found in order 1\n")
                    elif sex.get(p1) and sex.get(p2) is False:
                        parents.append((p2, p1))
                        logging.info("found in order 2\n")
                else:
                    logging.info("The potential parents are related to each other\n\n")

        if len(parents) == 1:
            logging.info("Found parents!\n")
            return parents[0]

        return None

    # Duplicated samples to remove (if not provided, this function won't work as it assumes that each child has exactly two parents)
    duplicated_samples = set()
    # An explicit dtype lets hl.literal handle the (typically empty) set
    dups = hl.literal(duplicated_samples, dtype=hl.tset(hl.tstr))

    first_degree_pairs = kin_ht.filter(
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])
        & ~dups.contains(kin_ht[i_col]) &
        ~dups.contains(kin_ht[j_col])  # so not including any duplicate samples
    ).collect()

    first_degree_relatives = defaultdict(set)
    for row in first_degree_pairs:
        # For each sample, build the set of samples related to it in the first degree
        first_degree_relatives[row[i_col]].add(row[j_col])
        first_degree_relatives[row[j_col]].add(row[i_col])

    # Add second degree relatives for those samples
    # This is needed to distinguish grandparent - child - parent from child - mother, father down the line
    first_degree_samples = hl.literal(set(first_degree_relatives.keys()))

    second_degree_samples = kin_ht.filter((
        (kin_ht[pi_hat_col] >= first_degree_threshold[0])
        & (kin_ht[pi_hat_col] <= first_degree_threshold[1])) | (
            (kin_ht[pi_hat_col] >= second_degree_threshold[0])
            & (kin_ht[ibd1_col] >= ibd1_second_degree_threshold)
            & (kin_ht[pi_hat_col] < second_degree_threshold[1]))).collect()

    # Index the ibd2/ibd1/ibd0 values for every sample pair
    ibd2, ibd1, ibd0 = get_indexed_ibd(second_degree_samples)

    fam_id = 1
    trios = []
    duos = []
    decisions = {}
    while len(first_degree_relatives) > 0:
        # Feed in the entire dict; get_fam_samples keys into it with the given sample
        s_fam = get_fam_samples(
            list(first_degree_relatives)[0], set(), first_degree_relatives)
        for s in s_fam:
            logging.info(f"Processing sample: {s}")
            # Popping here means the [0] lookup above moves on to the next remaining family
            s_rel = first_degree_relatives.pop(s)
            possible_parents = []
            for rel in s_rel:  # s_rel is the set of samples related to s in the first degree
                # A parent/offspring pair has low ibd2, high ibd1 and low ibd0
                if (ibd2[tuple(sorted((s, rel)))] <= ibd2_parent_offspring_threshold) \
                        and (ibd1[tuple(sorted((s, rel)))] >= ibd1_parent_offspring_threshold) \
                        and (ibd0[tuple(sorted((s, rel)))] <= ibd0_parent_offspring_threshold):
                    possible_parents.append(rel)

            # A single candidate parent yields a parent-offspring duo
            if len(possible_parents) == 1:
                duos.append(sorted((s, possible_parents[0])))
                decisions[s] = possible_parents[0]
            else:
                parents = get_parents(possible_parents, list(ibd2.keys()), sex)

                decisions[s] = parents

                if parents is not None:  # just formatting the trio output here
                    trios.append(
                        hl.Trio(s=s,
                                fam_id=str(fam_id),
                                pat_id=parents[0],
                                mat_id=parents[1],
                                is_female=sex.get(s)))

        fam_id += 1

    return hl.Pedigree(trios), duos, decisions
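
A minimal usage sketch (not from the source): it assumes `mt` is a QC'd biallelic MatrixTable with column key `s` and `sample_sex` maps sample id to is_female; pi_hat is derived from the pc_relate IBD estimates in the PLINK style.

rel = hl.pc_relate(mt.GT, min_individual_maf=0.01, k=10, statistics='all')
# pc_relate keys pairs by structs; flatten to plain sample ids for infer_families
rel = rel.key_by()
rel = rel.transmute(i=rel.i.s, j=rel.j.s)
# PLINK-style pi_hat: P(IBD=2) + 0.5 * P(IBD=1)
rel = rel.annotate(pi_hat=rel.ibd2 + rel.ibd1 / 2)
pedigree, duos, decisions = infer_families(rel, sample_sex)
pedigree.write('inferred_trios.fam')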
Example No. 28
def field_to_array(ds, field):
    return hl.if_else(ds[field] != 0, hl.array([field]), hl.empty_array(hl.tstr))
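
A short usage sketch for this helper (the 0/1 indicator fields `lcr` and `segdup` are hypothetical, not from the source):

# Combine per-field indicator arrays into a single set of flags
ds = ds.annotate(flags=hl.set(
    field_to_array(ds, "lcr").extend(field_to_array(ds, "segdup"))))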
Example No. 29
def samples_qc(mt, mt_to_annotate, args):
    """
    Performs samples QC on a matrix table, removing samples on chimera and contamination %, as well as being +/- 4
    standard deviations from mean on TiTv, het/homvar, insertion/deletion ratios and n_singletons for a specific
    batch or cohort

    :param mt: matrix table, low-pass failing variants and genotypes filtered out
    :param mt_to_annotate: matrix table to annotate with failing samples information after calculating on filtered mt
    :param args:
    :return: returns annotated, unfiltered matrix table
    """
    datestr = time.strftime("%Y.%m.%d")

    # Run sample QC to get up-to-date sample QC metrics
    mt = hl.sample_qc(mt)

    # Pull data to cols and checkpoint
    mt_cols = mt.cols()
    mt_cols = mt_cols.checkpoint("samples_qc_cols_tmp.ht", overwrite=True)

    # Instantiate empty array for failing samples QC tags
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.empty_array(hl.tstr))

    ############################################################
    # Find samples failing on chimeras or contamination values #
    ############################################################
    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.chimeras_col] > args.chimeras_max)
        & hl.is_defined(mt_cols[args.chimeras_col]),
        mt_cols.failing_samples_qc.append(
            "failing_chimeras"), mt_cols.failing_samples_qc))

    mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
        (mt_cols[args.contamination_col] > args.contamination_max)
        & hl.is_defined(mt_cols[args.contamination_col]),
        mt_cols.failing_samples_qc.append(
            "failing_contamination"), mt_cols.failing_samples_qc))

    failing_chim = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_chimeras")))
    miss_chim = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.chimeras_col]))))
    failing_contam = mt_cols.aggregate(
        hl.agg.count_where(
            mt_cols.failing_samples_qc.contains("failing_contamination")))
    miss_contam = mt_cols.aggregate(
        hl.agg.count_where(~(hl.is_defined(mt_cols[args.contamination_col]))))

    logging.info(
        f"Number of samples failing on chimeras % > {args.chimeras_max}: {failing_chim}"
    )
    logging.info(f"Number of samples missing chimeras %: {miss_chim}")
    logging.info(
        f"Number of samples failing on contamination % > {args.contamination_max}: {failing_contam}"
    )
    logging.info(f"Number of samples missing contamination %: {miss_contam}")

    chim_stats = mt_cols.aggregate(hl.agg.stats(mt_cols[args.chimeras_col]))
    cont_stats = mt_cols.aggregate(
        hl.agg.stats(mt_cols[args.contamination_col]))
    logging.info(f"Chimeras statistics: {chim_stats}")
    logging.info(f"Contamination statistics: {cont_stats}")

    ###############################################
    # Find samples failing on sex-aware call rate #
    ###############################################
    if args.sample_call_rate is not None:
        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            (mt_cols.sexaware_sample_call_rate < args.sample_call_rate)
            & hl.is_defined(mt_cols.sexaware_sample_call_rate),
            mt_cols.failing_samples_qc.append(
                "failing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sexaware_sample_call_rate)),
            mt_cols.failing_samples_qc.append(
                "missing_sexaware_sample_call_rate"),
            mt_cols.failing_samples_qc))

        failing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(
                    "failing_sexaware_sample_call_rate")))
        missing_cr = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(
                    "missing_sexaware_sample_call_rate")))

        logging.info(
            f"Number of samples failing on sex-aware call rate > {args.sample_call_rate}: {failing_cr}"
        )
        logging.info(
            f"Number of samples missing sex-aware call rate : {missing_cr}")

        cr_stats = mt_cols.aggregate(
            hl.agg.stats(mt_cols.sexaware_sample_call_rate))

        logging.info(f"Sex-aware call rate statistics: {cr_stats}")

    ######################################################################################
    # Find samples failing per-cohort on titv, het_homvar ratio, indel, and # singletons #
    ######################################################################################
    if args.batch_col_name is not None:
        batch_none = mt_cols.aggregate(
            hl.agg.count_where(~(hl.is_defined(mt_cols[args.batch_col_name]))))
        mt_cols = mt_cols.annotate(
            **{
                args.batch_col_name:
                hl.or_else(mt_cols[args.batch_col_name], "no_batch_info")
            })

        if batch_none > 0:
            logging.info(
                f"Warning- {batch_none} samples have batch undefined. These samples will be grouped in one "
                f"batch for sample QC (named no_batch_info).")
            mt_cols.filter(mt_cols[args.batch_col_name] ==
                           "no_batch_info").s.show(batch_none + 1)

        batch_set = mt_cols.aggregate(
            hl.agg.collect_as_set(mt_cols[args.batch_col_name]))
    else:
        args.batch_col_name = "mock_batch_col"
        mt_cols = mt_cols.annotate(mock_batch_col="all")
        batch_set = ["all"]

    # Convert batch strings to numeric values, create label for plotting
    batch_set_numeric = list(range(len(batch_set)))
    batch_key = list(zip(batch_set, batch_set_numeric))

    mt_cols = mt_cols.annotate(plot_batch=0)
    for batch in batch_key:
        mt_cols = mt_cols.annotate(
            plot_batch=hl.cond(mt_cols[args.batch_col_name] == batch[0],
                               batch[1], mt_cols.plot_batch))
    # Jitter is added once, after all batches have been mapped to numeric values
    mt_cols = mt_cols.annotate(plot_batch_jitter=mt_cols.plot_batch +
                               hl.rand_unif(-0.3, 0.3))

    batch_thresholds = {}
    batch_statistics = {}
    for measure in [
            'r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton'
    ]:
        logging.info(f"Performing sample QC for measure {measure}")

        # Instantiate/reset box plot label
        mt_cols = mt_cols.annotate(boxplot_label=mt_cols[args.batch_col_name])

        batch_thresholds[measure] = {}
        batch_statistics[measure] = {}

        mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
            ~(hl.is_defined(mt_cols.sample_qc[measure])),
            mt_cols.failing_samples_qc.append(f"missing_{measure}"),
            mt_cols.failing_samples_qc))

        for batch in batch_set:
            # Check whether any values are defined at all for this measure
            defined_values = mt_cols.aggregate(
                hl.agg.count_where(hl.is_defined(mt_cols.sample_qc[measure])))

            if defined_values > 0:
                # Get mean and standard deviation for each measure, for each batch's samples
                stats = mt_cols.aggregate(
                    hl.agg.filter(mt_cols[args.batch_col_name] == batch,
                                  hl.agg.stats(mt_cols.sample_qc[measure])))

                # Get cutoffs for each measure
                cutoff_upper = stats.mean + (args.sampleqc_sd_threshold *
                                             stats.stdev)
                cutoff_lower = stats.mean - (args.sampleqc_sd_threshold *
                                             stats.stdev)

                if measure == "n_singleton":
                    logging.info(
                        f"Max number of singletons for batch {batch}: {stats.max}"
                    )

                mt_cols = mt_cols.annotate(failing_samples_qc=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch),
                    mt_cols.failing_samples_qc.append(
                        f"failing_{measure}"), mt_cols.failing_samples_qc))

                mt_cols = mt_cols.annotate(boxplot_label=hl.cond(
                    ((mt_cols.sample_qc[measure] > cutoff_upper)
                     | (mt_cols.sample_qc[measure] < cutoff_lower))
                    & hl.is_defined(mt_cols.sample_qc[measure])
                    & (mt_cols[args.batch_col_name] == batch), "outlier",
                    mt_cols.boxplot_label))

                # Collect thresholds and statistics for each batch
                batch_thresholds[measure][batch] = {
                    'min_thresh': cutoff_lower,
                    'max_thresh': cutoff_upper
                }
                batch_statistics[measure][batch] = stats

            else:
                logging.error(
                    f"Error- no defined values for measure {measure}. NAs can be introduced by division by "
                    f"zero. Samples not filtered on {measure}!")

        # Create plot for measure for each batch
        output_file(f"{datestr}_samples_qc_plots_{measure}.html")
        p = hl.plot.scatter(mt_cols.plot_batch_jitter,
                            mt_cols.sample_qc[measure],
                            label=mt_cols.boxplot_label,
                            title=f"{measure} values split by batch.")
        save(p)

    ##########################
    # Report failing samples #
    ##########################
    for measure in [
            'r_ti_tv', 'r_het_hom_var', 'r_insertion_deletion', 'n_singleton'
    ]:
        failing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"failing_{measure}")))
        missing_count = mt_cols.aggregate(
            hl.agg.count_where(
                mt_cols.failing_samples_qc.contains(f"missing_{measure}")))
        logging.info(
            f"Number of samples failing on {measure}: {failing_count}")
        logging.info(f"Number of samples missing {measure}: {missing_count}")

    failing_any = mt_cols.aggregate(
        hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0))
    logging.info(
        f"Number of samples failing samples QC on any measure: {failing_any}")

    if args.pheno_col is not None:
        cases_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == True,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        controls_failing = mt_cols.aggregate(
            hl.agg.filter(
                mt_cols[args.pheno_col] == False,
                hl.agg.count_where(hl.len(mt_cols.failing_samples_qc) != 0)))
        logging.info(f"Cases failing QC: {cases_failing}")
        logging.info(f"Controls failing QC: {controls_failing}")

    #######################################################################################################
    # Annotate original (unfiltered) matrix table with failing samples QC information + sample QC measure #
    #######################################################################################################
    mt_to_annotate = mt_to_annotate.annotate_cols(
        sample_qc=mt_cols[mt_to_annotate.s].sample_qc)
    mt_to_annotate = mt_to_annotate.annotate_cols(
        failing_samples_qc=mt_cols[mt_to_annotate.s].failing_samples_qc)

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_batches=batch_statistics)
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_stats_chim_cont={
            'chimeras': chim_stats,
            'contamination': cont_stats
        })
    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_thresholds={
            'chimeras_max': str(args.chimeras_max),
            'contamination_max': str(args.contamination_max),
            'deviation_multiplier_threshold': str(args.sampleqc_sd_threshold),
            'batches': str(batch_set),
            'batch_cohort_name': str(args.batch_col_name)
        })

    mt_to_annotate = mt_to_annotate.annotate_globals(
        samples_qc_batch_thresholds=batch_thresholds)

    return mt_to_annotate
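
A minimal sketch of driving this function (not from the source): the attribute names mirror the args.* accesses above and would normally come from argparse, and bokeh's output_file/save are assumed to be imported for the plotting calls.

from argparse import Namespace

qc_args = Namespace(
    chimeras_col="pct_chimeras", chimeras_max=0.05,  # hypothetical column names and cutoffs
    contamination_col="freemix", contamination_max=0.05,
    sample_call_rate=None,   # skip sex-aware call rate filtering in this sketch
    batch_col_name=None,     # all samples treated as a single mock batch
    sampleqc_sd_threshold=4,
    pheno_col=None)

annotated_mt = samples_qc(filtered_mt, raw_mt, qc_args)
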
ds = ds.annotate(filters=ds.filters.difference(hl.set(["MULTIALLELIC"])))

# Group gene lists for all consequences in a struct
ds = ds.annotate(
    consequences=hl.struct(
        **{
            csq.lower(): ds.info[f"PROTEIN_CODING__{csq}"]
            for csq in protein_coding_consequences
            if csq != "INTERGENIC" and csq != "NEAREST_TSS"
        }
    )
)
ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)

# Collect set of all genes for which a variant has a consequence
all_genes = hl.empty_array(hl.tstr)
for csq in ds.consequences.dtype.fields:
    all_genes = all_genes.extend(
        hl.or_else(ds.consequences[csq.lower()], hl.empty_array(hl.tstr))
    )
ds = ds.annotate(genes=hl.set(all_genes))

# Group per-population values in a struct for each field
def expr_for_per_population_field(row, field):
    return hl.struct(
        **dict(
            ((pop.lower(), row.info[f"{pop}_{field}"]) for pop in populations),
            total=row.info[field],
        )
    )
Example No. 31
def import_structural_variants(vcf_path):
    ds = hl.import_vcf(vcf_path, force_bgz=True, min_partitions=32).rows()

    ds = ds.annotate(
        **{field.lower(): ds.info[field]
           for field in TOP_LEVEL_INFO_FIELDS})

    ds = ds.annotate(
        variant_id=ds.rsid.replace("^gnomAD-SV_v2.1_", ""),
        reference_genome="GRCh37",
        # Start
        chrom=ds.locus.contig,
        pos=ds.locus.position,
        xpos=x_position(ds.locus.contig, ds.locus.position),
        # End
        end=ds.info.END,
        xend=x_position(ds.locus.contig, ds.info.END),
        # Start 2
        chrom2=ds.info.CHR2,
        pos2=ds.info.POS2,
        xpos2=x_position(ds.info.CHR2, ds.info.POS2),
        # End 2
        end2=ds.info.END2,
        xend2=x_position(ds.info.CHR2, ds.info.END2),
        # Other
        length=ds.info.SVLEN,
        type=ds.info.SVTYPE,
        alts=ds.alleles[1:],
    )

    # MULTIALLELIC should not be used as a quality filter in the browser
    ds = ds.annotate(filters=ds.filters.difference(hl.set(["MULTIALLELIC"])))

    # Group gene lists for all consequences in one field
    ds = ds.annotate(consequences=hl.array([
        hl.struct(
            consequence=csq.lower(),
            genes=hl.or_else(ds.info[f"PROTEIN_CODING__{csq}"],
                             hl.empty_array(hl.tstr)),
        ) for csq in RANKED_CONSEQUENCES
        if csq not in ("INTERGENIC", "NEAREST_TSS")
    ]).filter(lambda csq: hl.len(csq.genes) > 0))
    ds = ds.annotate(intergenic=ds.info.PROTEIN_CODING__INTERGENIC)

    ds = ds.annotate(major_consequence=hl.rbind(
        ds.consequences.find(lambda csq: hl.len(csq.genes) > 0),
        lambda csq: hl.or_else(csq.consequence,
                               hl.or_missing(ds.intergenic, "intergenic")),
    ))

    # Collect set of all genes for which a variant has a consequence
    ds = ds.annotate(genes=hl.set(ds.consequences.flatmap(lambda c: c.genes)))

    # Group per-population frequency values
    ds = ds.annotate(freq=hl.struct(
        **{field.lower(): ds.info[field]
           for field in FREQ_FIELDS},
        populations=[
            hl.struct(id=pop,
                      **{
                          field.lower(): ds.info[f"{pop}_{field}"]
                          for field in FREQ_FIELDS
                      }) for pop in DIVISIONS
        ],
    ))

    # For MCNVs, store per-copy number allele counts
    ds = ds.annotate(freq=ds.freq.annotate(copy_numbers=hl.or_missing(
        ds.type == "MCNV",
        hl.zip_with_index(ds.alts).map(lambda pair: hl.rbind(
            pair[0],
            pair[1],
            lambda index, alt: hl.struct(
                # Extract the copy number, e.g. 2 from "<CN=2>"
                copy_number=hl.int(alt[4:-1]),
                ac=ds.freq.ac[index],
            ),
        )),
    )))

    # For MCNVs, sum AC/AF for all alt alleles except CN=2
    ds = ds.annotate(freq=ds.freq.annotate(
        ac=hl.if_else(ds.type == "MCNV", sum_mcnv_ac_or_af(
            ds.alts, ds.freq.ac), ds.freq.ac[0]),
        af=hl.if_else(ds.type == "MCNV", sum_mcnv_ac_or_af(
            ds.alts, ds.freq.af), ds.freq.af[0]),
        populations=hl.if_else(
            ds.type == "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                ac=sum_mcnv_ac_or_af(ds.alts, pop.ac),
                af=sum_mcnv_ac_or_af(ds.alts, pop.af),
            )),
            ds.freq.populations.map(
                lambda pop: pop.annotate(ac=pop.ac[0], af=pop.af[0])),
        ),
    ))

    # Add hemizygous frequencies
    ds = ds.annotate(hemizygote_count=hl.dict(
        [(
            pop_id,
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y"))
                       & ~ds.par, ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS] +
        [(f"{pop_id}_FEMALE", 0) for pop_id in POPULATIONS] + [(
            f"{pop_id}_MALE",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y"))
                       & ~ds.par, ds.info[f"{pop_id}_MALE_N_HEMIALT"], 0),
        ) for pop_id in POPULATIONS] + [("FEMALE", 0)] +
        [("MALE",
          hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y"))
                     & ~ds.par, ds.info.MALE_N_HEMIALT, 0))]))

    ds = ds.annotate(freq=ds.freq.annotate(
        hemizygote_count=hl.or_missing(
            ds.type != "MCNV",
            hl.if_else(((ds.chrom == "X") | (ds.chrom == "Y"))
                       & ~ds.par, ds.info.MALE_N_HEMIALT, 0),
        ),
        populations=hl.if_else(
            ds.type != "MCNV",
            ds.freq.populations.map(lambda pop: pop.annotate(
                hemizygote_count=ds.hemizygote_count[pop.id])),
            ds.freq.populations.map(
                lambda pop: pop.annotate(hemizygote_count=hl.null(hl.tint))),
        ),
    ))

    ds = ds.drop("hemizygote_count")

    # Rename n_homalt
    ds = ds.annotate(freq=ds.freq.annotate(
        homozygote_count=ds.freq.n_homalt,
        populations=ds.freq.populations.map(lambda pop: pop.annotate(
            homozygote_count=pop.n_homalt).drop("n_homalt")),
    ).drop("n_homalt"))

    # Re-key
    ds = ds.key_by("variant_id")

    ds = ds.drop("locus", "alleles", "info", "rsid")

    return ds
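
sum_mcnv_ac_or_af is defined elsewhere in this codebase; a plausible sketch of it (an assumption, not the source implementation), summing values over all alt alleles except the copy-neutral CN=2 allele:

def sum_mcnv_ac_or_af(alts, values):
    # Pair each value with its alt allele's copy number, then sum the
    # values whose copy number is not 2
    return hl.bind(
        lambda copy_numbers: hl.sum(
            hl.zip_with_index(values)
            .filter(lambda pair: copy_numbers[pair[0]] != 2)
            .map(lambda pair: pair[1])),
        alts.map(lambda alt: hl.int(alt[4:-1])))
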
def import_gnomad_v2_lof_curation_results(curation_result_paths, genes_path):
    all_flags = set()

    with hl.hadoop_open("/tmp/import_temp.tsv", "w") as temp_output_file:
        writer = csv.writer(temp_output_file, delimiter="\t", quotechar='"')
        writer.writerow(["chrom", "position", "ref", "alt", "genes", "verdict", "flags", "project_index"])

        for project_index, path in enumerate(curation_result_paths):
            with hl.hadoop_open(path, "r") as input_file:
                reader = csv.DictReader(input_file)

                raw_dataset_flags = [f[len("Flag "):] for f in reader.fieldnames if f.startswith("Flag ")]

                dataset_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags]

                all_flags = all_flags.union(set(dataset_flags))

                for row in reader:
                    [chrom, pos, ref, alt] = row["Variant ID"].split("-")

                    variant_flags = [FLAG_MAPPING.get(f, f) for f in raw_dataset_flags if row[f"Flag {f}"] == "TRUE"]

                    genes = [gene_id for (gene_id, gene_symbol) in (gene.split(":") for gene in row["Gene"].split(";"))]

                    verdict = row["Verdict"]

                    if verdict == "inufficient_evidence":
                        verdict = "insufficient_evidence"

                    verdict = VERDICT_MAPPING[verdict]

                    output_row = [
                        chrom,
                        pos,
                        ref,
                        alt,
                        ",".join(genes),
                        verdict,
                        ",".join(variant_flags),
                        project_index,
                    ]

                    writer.writerow(output_row)

    ds = hl.import_table("/tmp/import_temp.tsv")

    ds = ds.transmute(locus=hl.locus(ds.chrom, hl.int(ds.position)), alleles=[ds.ref, ds.alt],)

    ds = ds.annotate(
        genes=ds.genes.split(","),
        flags=hl.set(hl.if_else(ds.flags == "", hl.empty_array(hl.tstr), ds.flags.split(","))),
    )

    ds = ds.explode(ds.genes, name="gene_id")

    genes = hl.read_table(genes_path)
    ds = ds.annotate(gene_symbol=genes[ds.gene_id].symbol, gene_version=genes[ds.gene_id].gene_version)

    ds = ds.group_by(ds.locus, ds.alleles, ds.gene_id).aggregate(
        result=hl.agg.take(ds.row.drop("locus", "alleles", "gene_id"), 1, ds.project_index)
    )

    ds = ds.annotate(**ds.result[0]).drop("result", "project_index")

    ds = ds.group_by("locus", "alleles").aggregate(lof_curations=hl.agg.collect(ds.row.drop("locus", "alleles")))

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))

    for flag in sorted(list(all_flags)):
        print(flag)

    return ds
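
A hypothetical call (paths are placeholders, not from the source):

lof_curations = import_gnomad_v2_lof_curation_results(
    ["gs://example/lof_curation_results.csv"], "gs://example/genes.ht")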
Example No. 33
omim = omim.select(
    gene_id=omim["Ensembl Gene ID"],
    omim_accession=omim["MIM Gene Accession"],
    omim_description=omim["MIM Gene Description"],
)
omim = omim.key_by("gene_id")
genes = genes.annotate(**omim[genes.gene_id])

# Full names
dbnsfp = hl.import_table(args.dbnsfp_file, missing=".")
dbnsfp = dbnsfp.select(
    gene_id=dbnsfp["Ensembl_gene"],
    full_gene_name=dbnsfp["Gene_full_name"],
    other_names=hl.or_else(
        dbnsfp["Gene_old_names"].upper().split(";"),
        hl.empty_array(hl.tstr)).extend(
            hl.or_else(dbnsfp["Gene_other_names"].upper().split(";"),
                       hl.empty_array(hl.tstr))),
)
dbnsfp = dbnsfp.key_by("gene_id")
genes = genes.annotate(**dbnsfp[genes.gene_id])

genes.key_by().write(os.path.join(args.output_directory, "genes.ht"))

###############################################
# Transcripts                                 #
###############################################

transcripts = gencode.filter(gencode.feature == "transcript")
transcripts = transcripts.select(
    transcript_id=transcripts.transcript_id.split("\\.")[0],
Example No. 34
def vep_struct_to_csq(
        vep_expr: hl.expr.StructExpression,
        csq_fields: str = VEP_CSQ_FIELDS) -> hl.expr.ArrayExpression:
    """
    Given a VEP Struct, returns an array of VEP VCF CSQ strings (one per consequence in the struct).

    The fields and their order will correspond to those passed in `csq_fields`, which corresponds to the
    VCF header that is required to interpret the VCF CSQ INFO field.

    Note that the order is flexible and that all fields that are in the default value are supported.
    These fields will be formatted in the same way that their VEP CSQ counterparts are.

    Other fields can be added if their names match fields in the struct; their values will be the result of calling
    hl.str(), so they may differ from their usual VEP CSQ representation.

    :param vep_expr: The input VEP Struct
    :param csq_fields: The | delimited list of fields to include in the CSQ (in that order)
    :return: The corresponding CSQ strings
    """

    _csq_fields = [f.lower() for f in csq_fields.split("|")]

    def get_csq_from_struct(element: hl.expr.StructExpression,
                            feature_type: str) -> hl.expr.StringExpression:
        # Most fields are 1-1, just lowercase
        fields = dict(element)

        # Add general exceptions
        fields.update({
            "allele":
            element.variant_allele,
            "consequence":
            hl.delimit(element.consequence_terms, delimiter="&"),
            "feature_type":
            feature_type,
            "feature":
            (element.transcript_id if "transcript_id" in element else
             element.regulatory_feature_id if "regulatory_feature_id"
             in element else element.motif_feature_id
             if "motif_feature_id" in element else ""),
            "variant_class":
            vep_expr.variant_class,
        })

        # Add exception for transcripts
        if feature_type == "Transcript":
            fields.update({
                "canonical":
                hl.cond(element.canonical == 1, "YES", ""),
                "ensp":
                element.protein_id,
                "gene":
                element.gene_id,
                "symbol":
                element.gene_symbol,
                "symbol_source":
                element.gene_symbol_source,
                "cdna_position":
                hl.str(element.cdna_start) + hl.cond(
                    element.cdna_start == element.cdna_end,
                    "",
                    "-" + hl.str(element.cdna_end),
                ),
                "cds_position":
                hl.str(element.cds_start) + hl.cond(
                    element.cds_start == element.cds_end,
                    "",
                    "-" + hl.str(element.cds_end),
                ),
                "protein_position":
                hl.str(element.protein_start) + hl.cond(
                    element.protein_start == element.protein_end,
                    "",
                    "-" + hl.str(element.protein_end),
                ),
                "sift":
                element.sift_prediction + "(" +
                hl.format("%.3f", element.sift_score) + ")",
                "polyphen":
                element.polyphen_prediction + "(" +
                hl.format("%.3f", element.polyphen_score) + ")",
                "domains":
                hl.delimit(element.domains.map(lambda d: d.db + ":" + d.name),
                           "&"),
            })
        elif feature_type == "MotifFeature":
            fields["motif_score_change"] = hl.format(
                "%.3f", element.motif_score_change)

        return hl.delimit(
            [hl.or_else(hl.str(fields.get(f, "")), "") for f in _csq_fields],
            "|")

    csq = hl.empty_array(hl.tstr)
    for feature_field, feature_type in [
        ("transcript_consequences", "Transcript"),
        ("regulatory_feature_consequences", "RegulatoryFeature"),
        ("motif_feature_consequences", "MotifFeature"),
        ("intergenic_consequences", "Intergenic"),
    ]:
        csq = csq.extend(
            hl.or_else(
                vep_expr[feature_field].map(lambda x: get_csq_from_struct(
                    x, feature_type=feature_type)),
                hl.empty_array(hl.tstr),
            ))

    return hl.or_missing(hl.len(csq) > 0, csq)
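
A usage sketch (not from the source), assuming `ht` is a VEP-annotated table keyed by locus and alleles: attach the CSQ strings to the info field and describe the field order in the exported VCF header.

ht = ht.annotate(info=hl.struct(CSQ=vep_struct_to_csq(ht.vep)))
hl.export_vcf(
    ht, "annotated.vcf.bgz",
    metadata={"info": {"CSQ": {
        "Description": "Consequence annotations from Ensembl VEP. Format: " + VEP_CSQ_FIELDS}}})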
Example No. 35
def prepare_gnomad_v3_variants(path):
    ds = hl.read_table(path)

    g = hl.eval(ds.globals)

    subsets = set(m.get("subset", None) for m in g.freq_meta)

    def freq(ds, *args, **kwargs):
        return ds.freq[g.freq_index_dict[freq_index_key(*args, **kwargs)]]

    ############################
    # Derived top level fields #
    ############################

    ds = ds.annotate(variant_id=variant_id(ds.locus, ds.alleles))
    ds = ds.rename({"rsid": "rsids"})

    ######################
    # Colocated variants #
    ######################

    variants_by_locus = ds.select(
        ds.variant_id,
        ac_raw=hl.struct(
            **{
                subset or "all": freq(ds, subset=subset, raw=True).AC
                for subset in subsets
            }),
    )
    variants_by_locus = variants_by_locus.group_by("locus").aggregate(
        variants=hl.agg.collect(variants_by_locus.row_value))

    def subset_filter(subset):
        return lambda variant: variant.ac_raw[subset] > 0

    variants_by_locus = variants_by_locus.annotate(variant_ids=hl.struct(
        **{
            subset or "all": variants_by_locus.variants.filter(
                subset_filter(subset or "all")).map(
                    lambda variant: variant.variant_id)
            for subset in subsets
        }))

    ds = ds.annotate(
        colocated_variants=variants_by_locus[ds.locus].variant_ids)
    ds = ds.annotate(colocated_variants=hl.struct(
        **{
            subset: ds.colocated_variants[subset].filter(
                lambda variant_id: variant_id != ds.variant_id)
            for subset in ds.colocated_variants._fields
        }))

    ###############
    # Frequencies #
    ###############

    subset_populations = {}
    for subset in subsets:
        subset_populations[subset] = set(
            m.get("pop", None) for m in g.freq_meta
            if m.get("subset", None) == subset)

        subset_populations[subset].discard(None)

        # "global" population is used for downsamplings
        subset_populations[subset].discard("global")

    ds = ds.annotate(in_autosome_or_par=ds.locus.in_autosome_or_par())

    ds = ds.annotate(genome=hl.struct(freq=hl.struct(
        **{
            subset or "all": hl.struct(
                ac=freq(ds, subset=subset).AC,
                ac_raw=freq(ds, subset=subset, raw=True).AC,
                an=freq(ds, subset=subset).AN,
                hemizygote_count=hl.if_else(
                    ds.in_autosome_or_par, 0,
                    hl.or_else(freq(ds, subset=subset, sex="XY").AC, 0)),
                homozygote_count=freq(ds, subset=subset).homozygote_count,
                populations=[
                    hl.struct(
                        id="_".join(filter(bool, [pop, sex])),
                        ac=hl.or_else(
                            freq(ds, subset=subset, pop=pop, sex=sex).AC, 0),
                        an=hl.or_else(
                            freq(ds, subset=subset, pop=pop, sex=sex).AN, 0),
                        hemizygote_count=0 if sex == "XX" else hl.if_else(
                            ds.in_autosome_or_par,
                            0,
                            hl.or_else(
                                freq(ds, subset=subset, pop=pop, sex="XY").
                                AC, 0),
                        ),
                        homozygote_count=hl.or_else(
                            freq(ds, subset=subset, pop=pop,
                                 sex=sex).homozygote_count, 0),
                    ) for pop, sex in list(
                        itertools.product(subset_populations[subset],
                                          [None, "XX", "XY"])) +
                    [(None, "XX"), (None, "XY")]
                ],
            )
            for subset in subsets
        })))

    # If a variant is not present in a subset, do not store population frequencies for that subset
    ds = ds.annotate(genome=ds.genome.annotate(freq=ds.genome.freq.annotate(
        **{
            subset or "all": ds.genome.freq[subset or "all"].annotate(
                populations=hl.if_else(
                    ds.genome.freq[subset or "all"].ac_raw == 0,
                    hl.empty_array(ds.genome.freq[
                        subset or "all"].populations.dtype.element_type),
                    ds.genome.freq[subset or "all"].populations,
                ))
            for subset in subsets
        })))

    ds = ds.drop("freq", "in_autosome_or_par")

    ###########################################
    # Subsets in which the variant is present #
    ###########################################

    ds = ds.annotate(subsets=hl.set(
        hl.array([(subset, ds.genome.freq[subset].ac_raw > 0)
                  for subset in subsets if subset is not None]).filter(
                      lambda t: t[1]).map(lambda t: t[0])))

    ##############################
    # Filtering allele frequency #
    ##############################

    faf_populations = [
        pop for pop in subset_populations[None]
        if f"{pop}-adj" in g.faf_index_dict
    ]

    # Get popmax FAFs
    ds = ds.annotate(genome=ds.genome.annotate(
        faf95=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf95,
                              population=pop) for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
        faf99=hl.rbind(
            hl.sorted(
                hl.array([
                    hl.struct(faf=ds.faf[g.faf_index_dict[f"{pop}-adj"]].faf99,
                              population=pop) for pop in faf_populations
                ]),
                key=lambda f: (-f.faf, f.population),
            ),
            lambda fafs: hl.if_else(
                hl.len(fafs) > 0,
                hl.struct(popmax=fafs[0].faf,
                          popmax_population=fafs[0].population),
                hl.struct(popmax=hl.null(hl.tfloat),
                          popmax_population=hl.null(hl.tstr)),
            ),
        ),
    ))

    ds = ds.drop("faf")

    ####################
    # Age distribution #
    ####################

    ds = ds.annotate(genome=ds.genome.annotate(
        age_distribution=hl.struct(het=ds.age_hist_het, hom=ds.age_hist_hom)))

    ds = ds.drop("age_hist_het", "age_hist_hom")

    ###################
    # Quality metrics #
    ###################

    ds = ds.annotate(genome=ds.genome.annotate(
        filters=ds.filters,
        quality_metrics=hl.struct(
            allele_balance=hl.struct(
                alt_adj=ds.qual_hists.ab_hist_alt.annotate(
                    bin_edges=ds.qual_hists.ab_hist_alt.bin_edges.map(
                        lambda n: hl.float(hl.format("%.3f", n)))),
                alt_raw=ds.raw_qual_hists.ab_hist_alt.annotate(
                    bin_edges=ds.raw_qual_hists.ab_hist_alt.bin_edges.map(
                        lambda n: hl.float(hl.format("%.3f", n)))),
            ),
            genotype_depth=hl.struct(
                all_adj=ds.qual_hists.dp_hist_all,
                all_raw=ds.raw_qual_hists.dp_hist_all,
                alt_adj=ds.qual_hists.dp_hist_alt,
                alt_raw=ds.raw_qual_hists.dp_hist_alt,
            ),
            genotype_quality=hl.struct(
                all_adj=ds.qual_hists.gq_hist_all,
                all_raw=ds.raw_qual_hists.gq_hist_all,
                alt_adj=ds.qual_hists.gq_hist_alt,
                alt_raw=ds.raw_qual_hists.gq_hist_alt,
            ),
            site_quality_metrics=[
                hl.struct(metric="SiteQuality",
                          value=hl.float(nullify_nan(ds.info.QUALapprox)))
            ] + [
                hl.struct(metric=metric,
                          value=hl.float(nullify_nan(ds.info[metric])))
                for metric in [
                    "InbreedingCoeff",
                    "AS_FS",
                    "AS_MQ",
                    "AS_MQRankSum",
                    "AS_pab_max",
                    "AS_QUALapprox",
                    "AS_QD",
                    "AS_ReadPosRankSum",
                    "AS_SOR",
                    "AS_VarDP",
                    "AS_VQSLOD",
                ]
            ],
        ),
    ))

    ds = ds.drop("filters", "qual_hists", "raw_qual_hists", "vqsr")

    #########
    # Flags #
    #########

    ds = ds.annotate(flags=hl.set([
        hl.or_missing(ds.region_flag.lcr, "lcr"),
        hl.or_missing(ds.region_flag.segdup, "segdup"),
        hl.or_missing(
            ((ds.locus.contig == "chrX") & ds.locus.in_x_par())
            | ((ds.locus.contig == "chrY") & ds.locus.in_y_par()),
            "par",
        ),
        hl.or_missing(ds.info.monoallelic, "monoallelic"),
    ]).filter(hl.is_defined))

    ds = ds.drop("region_flag")

    ########################
    # In silico predictors #
    ########################

    ds = ds.transmute(in_silico_predictors=hl.struct(cadd=ds.cadd,
                                                     primate_ai=ds.primate_ai,
                                                     revel=ds.revel,
                                                     splice_ai=ds.splice_ai))

    ################
    # Other fields #
    ################

    # Drop unused fields
    ds = ds.drop("allele_info", "a_index", "info", "popmax", "was_split")

    return ds
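
The freq_index_key helper used above is defined elsewhere in this codebase; a plausible sketch of it (an assumption, not the source implementation), matching freq_index_dict keys such as "adj", "raw", "afr-adj" and "non_v2-afr-XX-adj":

def freq_index_key(subset=None, pop=None, sex=None, raw=False):
    # Join the defined parts with "-" and append the adj/raw suffix
    parts = [part for part in (subset, pop, sex) if part is not None]
    parts.append("raw" if raw else "adj")
    return "-".join(parts)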