Esempi in Python per set, esempi in Python per hail.set

Esempio n. 1

0

Mostra file

File: test_table.py Progetto: lfrancioli/hail

 def test_explode_on_set(self):
     t = hl.utils.range_table(1)
     t = t.annotate(a=hl.set(['a', 'b', 'c']))
     t = t.explode('a')
     self.assertEqual(set(t.collect()),
                      hl.eval(hl.set([hl.struct(idx=0, a='a'),
                                      hl.struct(idx=0, a='b'),
                                      hl.struct(idx=0, a='c')])))

Esempio n. 2

0

Mostra file

File: test_table.py Progetto: knguyen142/hail

 def test_explode_on_set(self):
     t = hl.utils.range_table(1)
     t = t.annotate(a=hl.set(['a', 'b', 'c']))
     t = t.explode('a')
     self.assertEqual(set(t.collect()),
                      hl.eval(hl.set([hl.struct(idx=0, a='a'),
                                      hl.struct(idx=0, a='b'),
                                      hl.struct(idx=0, a='c')])))

Esempio n. 3

0

Mostra file

    def test_group_by_field_lifetimes(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt2 = (mt.group_rows_by(row_idx='100').aggregate(
            x=hl.agg.collect_as_set(mt.row_idx + 5)))
        assert mt2.aggregate_entries(hl.agg.all(mt2.x == hl.set({5, 6, 7})))

        mt3 = (mt.group_cols_by(col_idx='100').aggregate(
            x=hl.agg.collect_as_set(mt.col_idx + 5)))
        assert mt3.aggregate_entries(hl.agg.all(mt3.x == hl.set({5, 6, 7})))

Esempio n. 4

0

Mostra file

File: test_matrix_table.py Progetto: tpoterba/hail

    def test_group_by_field_lifetimes(self):
        mt = hl.utils.range_matrix_table(3, 3)
        mt2 = (mt.group_rows_by(row_idx='100')
               .aggregate(x=hl.agg.collect_as_set(mt.row_idx + 5)))
        assert mt2.aggregate_entries(hl.agg.all(mt2.x == hl.set({5, 6, 7})))

        mt3 = (mt.group_cols_by(col_idx='100')
               .aggregate(x=hl.agg.collect_as_set(mt.col_idx + 5)))
        assert mt3.aggregate_entries(hl.agg.all(mt3.x == hl.set({5, 6, 7})))

Esempio n. 5

0

Mostra file

File: test_impex.py Progetto: lfrancioli/hail

    def test_multiple_files_variant_filtering(self):
        bgen_file = [resource('random-b.bgen'), resource('random-c.bgen'), resource('random-a.bgen')]
        hl.index_bgen(bgen_file)

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('20', 11), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 13), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 29), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 28), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 1), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 12), alleles=alleles),
        ]

        actual = hl.import_bgen(bgen_file,
                                ['GT'],
                                n_partitions=10,
                                variants=desired_variants)
        self.assertEqual(actual.count_rows(), 6)

        everything = hl.import_bgen(bgen_file,
                                    ['GT'])
        self.assertEqual(everything.count(), (30, 10))

        expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(actual))

Esempio n. 6

0

Mostra file

File: annotations.py Progetto: ksamocha/gnomad_methods

def pop_max_expr(
    freq: hl.expr.ArrayExpression,
    freq_meta: hl.expr.ArrayExpression,
    pops_to_exclude: Optional[Set[str]] = None,
) -> hl.expr.StructExpression:
    """
    Creates an expression containing popmax: the frequency information about the population
    that has the highest AF from the populations provided in `freq_meta`,
    excluding those specified in `pops_to_exclude`.
    Only frequencies from adj populations are considered.

    This resulting struct contains the following fields:

        - AC: int32
        - AF: float64
        - AN: int32
        - homozygote_count: int32
        - pop: str

    :param freq: ArrayExpression of Structs with fields ['AC', 'AF', 'AN', 'homozygote_count']
    :param freq_meta: ArrayExpression of meta dictionaries corresponding to freq (as returned by annotate_freq)
    :param pops_to_exclude: Set of populations to skip for popmax calcluation

    :return: Popmax struct
    """
    _pops_to_exclude = hl.literal(pops_to_exclude)
    popmax_freq_indices = hl.range(0, hl.len(freq_meta)).filter(
        lambda i: (hl.set(freq_meta[i].keys()) == {"group", "pop"})
        & (freq_meta[i]["group"] == "adj")
        & (~_pops_to_exclude.contains(freq_meta[i]["pop"])))
    freq_filtered = popmax_freq_indices.map(lambda i: freq[i].annotate(
        pop=freq_meta[i]["pop"])).filter(lambda f: f.AC > 0)

    sorted_freqs = hl.sorted(freq_filtered, key=lambda x: x.AF, reverse=True)
    return hl.or_missing(hl.len(sorted_freqs) > 0, sorted_freqs[0])

Esempio n. 7

0

Mostra file

 def merge_alleles(alleles):
     from hail.expr.functions import _num_allele_type, _allele_ints
     return hl.rbind(
         alleles.map(lambda a: hl.or_else(a[0], ''))
                .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
         lambda ref:
         hl.rbind(
             alleles.map(
                 lambda al: hl.rbind(
                     al[0],
                     lambda r:
                     hl.array([ref]).extend(
                         al[1:].map(
                             lambda a:
                             hl.rbind(
                                 _num_allele_type(r, a),
                                 lambda at:
                                 hl.cond(
                                     (_allele_ints['SNP'] == at) |
                                     (_allele_ints['Insertion'] == at) |
                                     (_allele_ints['Deletion'] == at) |
                                     (_allele_ints['MNP'] == at) |
                                     (_allele_ints['Complex'] == at),
                                     a + ref[hl.len(r):],
                                     a)))))),
             lambda lal:
             hl.struct(
                 globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                 local=lal)))

Esempio n. 8

0

Mostra file

def filter_to_clinvar_pathogenic(
    t: Union[hl.MatrixTable, hl.Table],
    clnrevstat_field: str = "CLNREVSTAT",
    clnsig_field: str = "CLNSIG",
    clnsigconf_field: str = "CLNSIGCONF",
    remove_no_assertion: bool = True,
    remove_conflicting: bool = True,
) -> Union[hl.MatrixTable, hl.Table]:
    """
    Return a MatrixTable or Table that filters the clinvar data to pathogenic and likely pathogenic variants.

    Example use:

    .. code-block:: python

        from gnomad.resources.grch38.reference_data import clinvar
        clinvar_ht = clinvar.ht()
        clinvar_ht = filter_to_clinvar_pathogenic(clinvar_ht)

    :param: t: Input dataset that contains clinvar data, could either be a MatrixTable or Table.
    :param clnrevstat_field: The field string for the expression that contains the review status of the clinical significance of clinvar variants.
    :param clnsig_field: The field string for the expression that contains the clinical signifcance of the clinvar variant.
    :param clnsigconf_field: The field string for the expression that contains the conflicting clinical significance values for the variant. For variants with no conflicting significance, this field should be undefined.
    :param remove_no_assertion: Flag for removing entries in which the clnrevstat (clinical significance) has no assertions (zero stars).
    :param remove_conflicting: Flag for removing entries with conflicting clinical interpretations.
    :return: Filtered MatrixTable or Table
    """
    logger.info(
        "Found %d variants before filtering",
        t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(),
    )
    path_expr = (t.info[clnsig_field].map(lambda x: x.lower()).map(
        lambda x: x.contains("pathogenic")).any(lambda x: x))

    if remove_no_assertion:
        logger.info("Variants without assertions will be removed.")
        no_star_assertions = hl.literal({
            "no_assertion_provided",
            "no_assertion_criteria_provided",
            "no_interpretation_for_the_individual_variant",
        })
        path_expr = path_expr & (hl.set(t.info[clnrevstat_field]).intersection(
            no_star_assertions).length() == 0)

    if remove_conflicting:
        logger.info(
            "Variants with conflicting clinical interpretations will be removed."
        )
        path_expr = path_expr & hl.is_missing(t.info[clnsigconf_field])

    if isinstance(t, hl.MatrixTable):
        t = t.filter_rows(path_expr)
    else:
        t = t.filter(path_expr)

    logger.info(
        "Found %d variants after filtering to clinvar pathogenic variants.",
        t.count_rows() if isinstance(t, hl.MatrixTable) else t.count(),
    )
    return t

Esempio n. 9

0

Mostra file

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--base-level-pext",
        help="Path to Hail table with base-level data",
        default=
        "gs://gnomad-public/papers/2019-tx-annotation/gnomad_browser/all.baselevel.021620.ht",
    )
    parser.add_argument(
        "--low-max-pext-genes",
        help="Path to table containing list of genes with low max pext",
        default=
        "gs://gnomad-public/papers/2019-tx-annotation/data/GRCH37_hg19/max_pext_low_genes.021520.tsv",
    )
    parser.add_argument(
        "output_path", help="Path to output Hail table with region-level data")
    args = parser.parse_args()

    ds = prepare_pext_data(args.base_level_pext)

    low_max_pext_genes = hl.import_table(args.low_max_pext_genes)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.cond(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    ds.write(args.output_path)

Esempio n. 10

0

Mostra file

def get_mt_filtered_by_pops(pops: list,
                            chrom: str = 'all',
                            imputed: bool = True,
                            min_mac: int = 20,
                            entry_fields=('GP', ),
                            filter_mac_instead_of_ac: bool = False):
    r'''
    Wraps `get_filtered_mt()` from ukbb_pan_ancestry.resources.genotypes
    This filters to samples from populations listed in `pops`.

    NOTE: If chrom='all', this loads all autosomes AND chrX.
    
    '''
    assert len(pops) > 0 and set(pops).issubset(POPS)

    kwargs = {
        'pop': 'all' if len(pops) > 1 else pops[0],
        'imputed': imputed,
        'min_mac': min_mac,
        'entry_fields': entry_fields,
        'filter_mac_instead_of_ac': filter_mac_instead_of_ac
    }

    mt = get_filtered_mt(chrom=chrom,
                         **kwargs)  # in this case chrom='all' gets autosomes

    if chrom == 'all':
        mt_x = get_filtered_mt(chrom='X', **kwargs)
        mt = mt.union_rows(mt_x)

    if len(pops) > 1:
        mt = mt.filter_cols(hl.set(pops).contains(mt.pop))

    return mt

Esempio n. 11

0

Mostra file

File: vcf_combiner.py Progetto: jigold/hail

 def merge_alleles(alleles):
     from hail.expr.functions import _num_allele_type, _allele_ints
     return hl.rbind(
         alleles.map(lambda a: hl.or_else(a[0], ''))
                .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
         lambda ref:
         hl.rbind(
             alleles.map(
                 lambda al: hl.rbind(
                     al[0],
                     lambda r:
                     hl.array([ref]).extend(
                         al[1:].map(
                             lambda a:
                             hl.rbind(
                                 _num_allele_type(r, a),
                                 lambda at:
                                 hl.cond(
                                     (_allele_ints['SNP'] == at) |
                                     (_allele_ints['Insertion'] == at) |
                                     (_allele_ints['Deletion'] == at) |
                                     (_allele_ints['MNP'] == at) |
                                     (_allele_ints['Complex'] == at),
                                     a + ref[hl.len(r):],
                                     a)))))),
             lambda lal:
             hl.struct(
                 globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                 local=lal)))

Esempio n. 12

0

Mostra file

    def test_multiple_files_variant_filtering(self):
        bgen_file = [
            resource('random-b.bgen'),
            resource('random-c.bgen'),
            resource('random-a.bgen')
        ]
        hl.index_bgen(bgen_file)

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('20', 11), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 13), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 29), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 28), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 1), alleles=alleles),
            hl.Struct(locus=hl.Locus('20', 12), alleles=alleles),
        ]

        actual = hl.import_bgen(bgen_file, ['GT'],
                                n_partitions=10,
                                variants=desired_variants)
        self.assertEqual(actual.count_rows(), 6)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (30, 10))

        expected = everything.filter_rows(
            hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(actual))

Esempio n. 13

0

Mostra file

File: prepare_gene_models.py Progetto: sachalau/gnomad-browser

def collect_gene_exons(gene_exons):
    # There are 3 feature types in the exons collection: "CDS", "UTR", and "exon".
    # There are "exon" regions that cover the "CDS" and "UTR" regions and also
    # some (non-coding) transcripts that contain only "exon" regions.
    # This filters the "exon" regions to only those that are in non-coding transcripts.
    #
    # This makes the UI for selecting visible regions easier, since it can filter
    # on "CDS" or "UTR" feature type without having to also filter out the "exon" regions
    # that duplicate the "CDS" and "UTR" regions.

    non_coding_transcript_exons = hl.bind(
        lambda coding_transcripts: gene_exons.filter(
            lambda exon: ~coding_transcripts.contains(exon.transcript_id)),
        hl.set(
            gene_exons.filter(lambda exon: (exon.feature_type == "CDS") |
                              (exon.feature_type == "UTR")).map(
                                  lambda exon: exon.transcript_id)),
    )

    exons = (merge_overlapping_regions(
        gene_exons.filter(lambda exon: exon.feature_type == "CDS")).extend(
            merge_overlapping_regions(
                gene_exons.filter(lambda exon: exon.feature_type == "UTR"))).
             extend(merge_overlapping_regions(non_coding_transcript_exons)))

    exons = exons.map(
        lambda exon: exon.select("feature_type",
                                 "start",
                                 "stop",
                                 xstart=xpos(exon.chrom, exon.start),
                                 xstop=xpos(exon.chrom, exon.stop)))

    return exons

Esempio n. 14

0

Mostra file

 def make_pop_filters_expr(ht: hl.Table,
                           qc_metrics: List[str]) -> hl.expr.SetExpression:
     return hl.set(
         hl.filter(lambda x: hl.is_defined(x), [
             hl.or_missing(ht[f'fail_{metric}'], metric)
             for metric in qc_metrics
         ]))

Esempio n. 15

0

Mostra file

def make_hard_filters_expr(ht: hl.Table,
                           data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: additional metadata in Kristen's import file is hard-coded

    :param: Table ht: input MT
    :param: str data_type: 'exomes' or 'genomes'
    :return: output MT
    :rtype: SetExpression
    """
    hard_filters = {
        'contamination': ht.freemix > 0.05,
        'callrate': ht.callrate < 0.85,
        'chimera': ht.pct_chimeras > 0.05,
        'ambiguous_sex': ht.ambiguous_sex
    }

    if data_type == 'exomes':
        hard_filters.update({
            'coverage': ht.mean_chr20_coverage == 0,
            'sex_aneuploidy': ht.sex_aneuploidy
        })
    else:
        hard_filters.update({
            'coverage': ht.mean_dp < 15,
            'insert_size': ht.median_insert_size < 250
        })
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in hard_filters.items()
        ]))

Esempio n. 16

0

Mostra file

File: combine_vcfs.py Progetto: leklab/sfari_mito_calling

def apply_mito_artifact_filter(
    mt: hl.MatrixTable,
    artifact_prone_sites_path: str,
) -> hl.MatrixTable:
    """Add back in artifact_prone_site filter

    :param hl.MatrixTable mt: MatrixTable to use an input
    :param str artifact_prone_sites_path: path to BED file of artifact_prone_sites to flag in the filters column

    :return: MatrixTable with artifact_prone_sites filter
    :rtype: hl.MatrixTable

    """

    # apply "artifact_prone_site" filter to any SNP or deletion that spans a known problematic site
    mt = mt.annotate_rows(
        position_range=hl.range(mt.locus.position, mt.locus.position +
                                hl.len(mt.alleles[0])))

    artifact_sites = []
    with hl.hadoop_open(artifact_prone_sites_path) as f:
        for line in f:
            pos = line.split()[2]
            artifact_sites.append(int(pos))
    sites = hl.literal(set(artifact_sites))

    mt = mt.annotate_rows(filters=hl.if_else(
        hl.len(hl.set(mt.position_range).intersection(sites)) > 0,
        {"artifact_prone_site"},
        {"PASS"},
    ))

    mt = mt.drop("position_range")

    return mt

Esempio n. 17

0

Mostra file

File: combine_vcfs.py Progetto: leklab/sfari_mito_calling

def remove_FT_values(
    mt: hl.MatrixTable,
    filters_to_remove: list = [
        'possible_numt', 'mt_many_low_hets', 'FAIL', 'blacklisted_site'
    ]
) -> hl.MatrixTable:
    """Removes the FT filters specified in filters_to_remove
    
    By default, this function removes the 'possible_numt', 'mt_many_low_hets', and 'FAIL' filters (because these filters were found to have low performance), 
    and the 'blacklisted_site' filter because this filter did not always behave as expected in early GATK versions (can be replaced with apply_mito_artifact_filter function)

    :param hl.MatrixTable mt:  MatrixTable
    :param list filters_to_remove: list of FT filters that should be removed from the entries
    
    :return: MatrixTable with certain FT filters removed
    :rtype: MatrixTable
    """

    filters_to_remove = hl.set(filters_to_remove)
    mt = mt.annotate_entries(
        FT=hl.array((mt.FT).difference(filters_to_remove)))

    # if no filters exists after removing those specified above, set the FT field to PASS
    mt = mt.annotate_entries(
        FT=hl.if_else(hl.len(mt.FT) == 0, ["PASS"], mt.FT))

    return (mt)

Esempio n. 18

0

Mostra file

File: vcf_combiner.py Progetto: bcajes/hail

def combine(ts):
    # pylint: disable=protected-access
    tmp = ts.annotate(
        alleles=merge_alleles(ts.data.map(lambda d: d.alleles)),
        rsid=hl.find(hl.is_defined, ts.data.map(lambda d: d.rsid)),
        filters=hl.set(hl.flatten(ts.data.map(lambda d: hl.array(d.filters)))),
        info=hl.struct(
            DP=hl.sum(ts.data.map(lambda d: d.info.DP)),
            MQ_DP=hl.sum(ts.data.map(lambda d: d.info.MQ_DP)),
            QUALapprox=hl.sum(ts.data.map(lambda d: d.info.QUALapprox)),
            RAW_MQ=hl.sum(ts.data.map(lambda d: d.info.RAW_MQ)),
            VarDP=hl.sum(ts.data.map(lambda d: d.info.VarDP)),
            SB=hl.array([
                hl.sum(ts.data.map(lambda d: d.info.SB[0])),
                hl.sum(ts.data.map(lambda d: d.info.SB[1])),
                hl.sum(ts.data.map(lambda d: d.info.SB[2])),
                hl.sum(ts.data.map(lambda d: d.info.SB[3]))
            ])))
    tmp = tmp.annotate(
        __entries=hl.bind(
            lambda combined_allele_index:
            hl.range(0, hl.len(tmp.data)).flatmap(
                lambda i:
                hl.cond(hl.is_missing(tmp.data[i].__entries),
                        hl.range(0, hl.len(tmp.g[i].__cols))
                          .map(lambda _: hl.null(tmp.data[i].__entries.dtype.element_type)),
                        hl.bind(
                            lambda old_to_new: tmp.data[i].__entries.map(lambda e: renumber_entry(e, old_to_new)),
                            hl.range(0, hl.len(tmp.data[i].alleles)).map(
                                lambda j: combined_allele_index[tmp.data[i].alleles[j]])))),
            hl.dict(hl.range(0, hl.len(tmp.alleles)).map(
                lambda j: hl.tuple([tmp.alleles[j], j])))))
    tmp = tmp.annotate_globals(__cols=hl.flatten(tmp.g.map(lambda g: g.__cols)))

    return tmp.drop('data', 'g')

Esempio n. 19

0

Mostra file

def add_filters_expr(
    filters: Dict[str, hl.expr.BooleanExpression],
    current_filters: hl.expr.SetExpression = None,
) -> hl.expr.SetExpression:
    """
    Create an expression to create or add filters.

    For each entry in the `filters` dictionary, if the value evaluates to `True`,
    then the key is added as a filter name.

    Current filters are kept if provided using `current_filters`

    :param filters: The filters and their expressions
    :param current_filters: The set of current filters
    :return: An expression that can be used to annotate the filters
    """
    if current_filters is None:
        current_filters = hl.empty_set(hl.tstr)

    return hl.fold(
        lambda x, y: x.union(y),
        current_filters,
        [
            hl.cond(filter_condition, hl.set([filter_name]),
                    hl.empty_set(hl.tstr))
            for filter_name, filter_condition in filters.items()
        ],
    )

Esempio n. 20

0

Mostra file

def make_perm_filters_expr(ht: hl.Table,
                           data_type: str) -> hl.expr.SetExpression:
    """
    NOTE: syndip will remain dropped wrt to permissions, but all possible QC measures will still be calculated

    :param Table ht: input MT
    :param str data_type: 'exomes' or 'genomes'
    :return: output MT
    :rtype: SetExpression
    """
    if data_type == 'genomes':
        perm_filters = {'not_releasable': ~ht.releasable_2_1}
    else:
        perm_filters = {
            'tcga_tumor': ht.tcga_tumor,
            'tcga_barcode': ht.tcga_weird_barcode,
            'tcga_below_30': ht.tcga_below_30,
            'specific_exclusion': ht.specific_exclusion,
            'esp': ht.esp,
            'not_releasable': ht.non_releasable,
            'syndip': ht.syndip
        }
    return hl.set(
        hl.filter(lambda x: hl.is_defined(x), [
            hl.or_missing(filter_expr, name)
            for name, filter_expr in perm_filters.items()
        ]))

Esempio n. 21

0

Mostra file

File: test_impex.py Progetto: maccum/hail

    def test_import_bgen_variant_filtering(self):
        desired_variant_indexes = [1, 2, 3, 5, 7, 9, 11, 13, 17, 198]
        actual = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                contig_recoding={'01': '1'},
                                reference_genome=None,
                                n_partitions=10,
                                _row_fields=['file_row_idx'],
                                _variants_per_file={
                                    resource('example.8bits.bgen'):
                                    desired_variant_indexes
                                })
        # doing the expected import_bgen second catches the case where the
        # hadoop configuraiton is polluted with old data from the
        # _variants_per_file
        everything = hl.import_bgen(resource('example.8bits.bgen'), ['GT'],
                                    contig_recoding={'01': '1'},
                                    reference_genome=None,
                                    _row_fields=['file_row_idx'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variant_indexes).contains(
                hl.int32(everything.file_row_idx)))

        self.assertTrue(expected._same(actual))
        self.assertEqual(
            (hl.str(actual.locus.contig) + ":" +
             hl.str(actual.locus.position)).collect(), [
                 '1:3000', '1:4000', '1:5000', '1:7000', '1:9000', '1:11000',
                 '1:13000', '1:15000', '1:19000', '1:100001'
             ])

Esempio n. 22

0

Mostra file

File: chet_utils.py Progetto: broadinstitute/gnomad_chets

def vep_genes_expr(vep_expr: hl.expr.StructExpression,
                   least_consequence: str) -> hl.expr.SetExpression:
    vep_consequences = hl.literal(
        set(CSQ_ORDER[0:CSQ_ORDER.index(least_consequence) + 1]))
    return (hl.set(
        vep_expr.transcript_consequences.filter(
            lambda tc: (tc.biotype == 'protein_coding') &
            (tc.consequence_terms.any(lambda c: vep_consequences.contains(c))
             )).map(lambda x: x.gene_id)))

Esempio n. 23

0

Mostra file

File: test_codec.py Progetto: populationgenomics/hail

def test_complex_round_trips():
    assert_round_trip(hl.struct())
    assert_round_trip(hl.empty_array(hl.tint32))
    assert_round_trip(hl.empty_set(hl.tint32))
    assert_round_trip(hl.empty_dict(hl.tint32, hl.tint32))
    assert_round_trip(hl.locus('1', 100))
    assert_round_trip(hl.struct(x=3))
    assert_round_trip(hl.set([3, 4, 5, 3]))
    assert_round_trip(hl.array([3, 4, 5]))
    assert_round_trip(hl.dict({3: 'a', 4: 'b', 5: 'c'}))
    assert_round_trip(
        hl.struct(x=hl.dict({
            3: 'a',
            4: 'b',
            5: 'c'
        }),
                  y=hl.array([3, 4, 5]),
                  z=hl.set([3, 4, 5, 3])))

Esempio n. 24

0

Mostra file

def get_expr_for_worst_transcript_consequence_annotations_struct(
        vep_sorted_transcript_consequences_root,
        include_coding_annotations=True):
    """Retrieves the top-ranked transcript annotation based on the ranking computed by
    get_expr_for_vep_sorted_transcript_consequences_array(..)

    Args:
        vep_sorted_transcript_consequences_root (ArrayExpression):
        include_coding_annotations (bool):
    """

    transcript_consequences = {
        "biotype": hl.tstr,
        "canonical": hl.tint,
        "category": hl.tstr,
        "cdna_start": hl.tint,
        "cdna_end": hl.tint,
        "codons": hl.tstr,
        "gene_id": hl.tstr,
        "gene_symbol": hl.tstr,
        "hgvs": hl.tstr,
        "hgvsc": hl.tstr,
        "major_consequence": hl.tstr,
        "major_consequence_rank": hl.tint,
        "transcript_id": hl.tstr,
    }

    if include_coding_annotations:
        transcript_consequences.update({
            "amino_acids": hl.tstr,
            "domains": hl.tstr,
            "hgvsp": hl.tstr,
            "lof": hl.tstr,
            "lof_flags": hl.tstr,
            "lof_filter": hl.tstr,
            "lof_info": hl.tstr,
            "polyphen_prediction": hl.tstr,
            "protein_id": hl.tstr,
            "sift_prediction": hl.tstr,
        })

    return hl.cond(
        vep_sorted_transcript_consequences_root.size() == 0,
        hl.struct(
            **{
                field: hl.null(field_type)
                for field, field_type in transcript_consequences.items()
            }),
        hl.bind(
            lambda worst_transcript_consequence:
            (worst_transcript_consequence.annotate(domains=hl.delimit(
                hl.set(worst_transcript_consequence.domains))).select(
                    *transcript_consequences.keys())),
            vep_sorted_transcript_consequences_root[0],
        ),
    )

Esempio n. 25

0

Mostra file

File: filtering.py Progetto: broadinstitute/gnomad_methods

 def make_filters_expr(ht: hl.Table,
                       qc_metrics: Iterable[str]) -> hl.expr.SetExpression:
     return hl.set(
         hl.filter(
             lambda x: hl.is_defined(x),
             [
                 hl.or_missing(ht[f"fail_{metric}"], metric)
                 for metric in qc_metrics
             ],
         ))

Esempio n. 26

0

Mostra file

def vep_protein_domain_filter_expr(
        d: hl.expr.DictExpression) -> hl.expr.BooleanExpression:
    """

    Return True of False if any protein domain source(s) are contained within pre-defined protein domain sources.
    Expected as input dict<k,v> where keys (k) represent source/database and values (v) the annotated domain_id.

    :param d: hl.DictExpression
    :return: hl.BoolExpression
    """
    domain_dbs = hl.set(PROTEIN_DOMAIN_DB)
    return (d.key_set().intersection(domain_dbs).length() >= 1)

Esempio n. 27

0

Mostra file

File: load_vcfs_to_emr.py Progetto: nicklecompteBCH/seqr-bch-installation

def finalize_annotated_table_for_seqr_variants(
        mt: hl.MatrixTable) -> hl.MatrixTable:
    """Given a messily-but-completely annotated Hail MatrixTable of variants,
    return a new MatrixTable with appropriate formatting to export to Elasticsearch
    and consume  by Seqr.

    TO-EXTREMELY-DO: Create a app/common Python 3 module with code for SeqrAnnotatedVariant,
    with methods to im/export to/from Hail/Elasticsearch.

    :param vep_mt: A VCF loaded into hail 0.2, VEP has been run,
    and reference/computed fields have been added.
    :type vep_mt: hl.MatrixTable
    :return: A hail matrix table of variants and VEP annotations with
    proper formatting to be consumed by Seqr.
    :rtype: hl.MatrixTable
    """
    mt = mt.annotate_rows(
        sortedTranscriptConsequences=
        get_expr_for_vep_sorted_transcript_consequences_array(vep_root=mt.vep))

    mt = mt.annotate_rows(
        mainTranscript=hl.cond(
            hl.len(mt.sortedTranscriptConsequences) > 0,
            mt.sortedTranscriptConsequences[0],
            hl.null(
                "struct {biotype: str,canonical: int32,cdna_start: int32,cdna_end: int32,codons: str,gene_id: str,gene_symbol: str,hgvsc: str,hgvsp: str,transcript_id: str,amino_acids: str,lof: str,lof_filter: str,lof_flags: str,lof_info: str,polyphen_prediction: str,protein_id: str,protein_start: int32,sift_prediction: str,consequence_terms: array<str>,domains: array<str>,major_consequence: str,category: str,hgvs: str,major_consequence_rank: int32,transcript_rank: int32}"
            )),
        #allele_id=clinvar_mt.index_rows(mt.row_key).vep.id,
        alt=get_expr_for_alt_allele(mt),
        chrom=get_expr_for_contig(mt.locus),
        #clinvar_clinical_significance=clinvar_mt.index_rows(mt.row_key).clinical_significance,
        domains=get_expr_for_vep_protein_domains_set(
            vep_transcript_consequences_root=mt.vep.transcript_consequences),
        geneIds=hl.set(
            mt.vep.transcript_consequences.map(lambda c: c.gene_id)),
        # gene_id_to_consequence_json=get_expr_for_vep_gene_id_to_consequence_map(
        #     vep_sorted_transcript_consequences_root=mt.sortedTranscriptConsequences,
        #     gene_ids=clinvar_mt.gene_ids
        # ),
        #gold_stars= clinvar_mt.index_entries(mt.row_key,mt.col_key).gold_stars,
        pos=get_expr_for_start_pos(mt),
        ref=get_expr_for_ref_allele(mt),
        #review_status=clinvar_mt.index_rows(mt.locus,mt.alleles).review_status,
        transcript_consequence_terms=get_expr_for_vep_consequence_terms_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_ids=get_expr_for_vep_transcript_ids_set(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        transcript_id_to_consequence_json=
        get_expr_for_vep_transcript_id_to_consequence_map(
            vep_transcript_consequences_root=mt.sortedTranscriptConsequences),
        variant_id=get_expr_for_variant_id(mt),
        xpos=get_expr_for_xpos(mt.locus))
    return mt

Esempio n. 28

0

Mostra file

def get_exons(gencode):
    exons = gencode.filter(
        hl.set(["exon", "CDS", "UTR"]).contains(gencode.feature))
    exons = exons.select(
        feature_type=exons.feature,
        transcript_id=exons.transcript_id.split("\\.")[0],
        gene_id=exons.gene_id.split("\\.")[0],
        chrom=exons.interval.start.seqname[3:],
        strand=exons.strand,
        start=exons.interval.start.position,
        stop=exons.interval.end.position,
    )
    return exons

Esempio n. 29

0

Mostra file

File: pext.py Progetto: drhodesbrc/gnomad-browser

def prepare_pext_data(base_level_pext_path, low_max_pext_genes_path):
    ds = prepare_base_level_pext(base_level_pext_path)

    low_max_pext_genes = hl.import_table(low_max_pext_genes_path)
    low_max_pext_genes = low_max_pext_genes.aggregate(
        hl.agg.collect_as_set(low_max_pext_genes.ensg))
    ds = ds.annotate(flags=hl.if_else(
        hl.set(low_max_pext_genes).contains(ds.gene_id),
        hl.literal(["low_max_pext"]),
        hl.empty_array(hl.tstr),
    ))

    return ds

Esempio n. 30

0

Mostra file

File: fix_alleles.py Progetto: nikbaya/diverse_pops

def main(args):
    variants = hl.import_table(f'{ldprune_dir}/variants.txt', delimiter=' ')
    variants = variants.annotate(alleles = hl.set([variants.ref, variants.alt]))
    variants = variants.key_by('chrom','pos','alleles')
    
    
    all_pops = [
            'AFR', 
            'AMR', 
            'CSA', 
            'EAS', 
            'EUR', 
            'MID'
            ]

    pops = [args.pop.upper()] if args.pop!=None else all_pops
    
    for pop in pops:
        bim0 = hl.import_table(f'{ldprune_dir}/subsets_5k/not_{pop}/not_{pop}.bim', 
                              delimiter=' ', 
                              no_header=True)
        bim0 = bim0.rename({'f0':'chrom',
                          'f3':'pos',
                          'f4':'A1_bim',
                          'f5':'A2_bim'})
        bim0 = bim0.add_index()

        bim = bim0.annotate(alleles = hl.set([bim0.A1_bim, bim0.A2_bim]))
        bim = bim.key_by('chrom','pos','alleles')
        
        bim = bim.annotate(ref = variants[bim.key].ref,
                           alt = variants[bim.key].alt)
        bim = bim.annotate(SNP = bim.chrom+':'+bim.pos+':'+bim.ref+':'+bim.alt)
        bim = bim.key_by()
        bim = bim.order_by(bim.idx)
        bim = bim.select('chrom','SNP','f2','pos','A1_bim','A2_bim')
        bim.export(f'{ldprune_dir}/subsets_5k/not_{pop}/not_{pop}.bim_new', 
                   header=False, 
                   delimiter=' ')

Esempio n. 31

0

Mostra file

File: extract_vcf_from_mt.py Progetto: wlu04/ukb_common

def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation, variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)

Esempio n. 32

0

Mostra file

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("gencode")
    parser.add_argument("canonical_transcripts")
    parser.add_argument("hgnc")
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Load genes from GTF file
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(args.canonical_transcripts, min_partitions=args.min_partitions)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.drop("transcripts")

    # Annotate genes with information from HGNC
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(genes.previous_symbols)
            .extend(genes.alias_symbols)
            .append(genes.gencode_gene_symbol)
            .map(lambda s: s.upper())
        ),
    )

    genes.describe()

    genes.write(args.output, overwrite=True)

Esempio n. 33

0

Mostra file

def prepare_clinvar_variants(clinvar_path, reference_genome):
    ds = hl.read_table(clinvar_path)

    ds = ds.filter(hl.is_defined(ds[f"locus_{reference_genome}"]) & hl.is_defined(ds[f"alleles_{reference_genome}"]))

    ds = ds.select(locus=ds[f"locus_{reference_genome}"], alleles=ds[f"alleles_{reference_genome}"], **ds.variant)

    # Remove any variants with alleles other than ACGT
    ds = ds.filter(
        hl.len(hl.set(hl.delimit(ds.alleles, "").split("")).difference(hl.set(["A", "C", "G", "T", ""]))) == 0
    )

    ds = ds.annotate(
        variant_id=variant_id(ds.locus, ds.alleles),
        chrom=normalized_contig(ds.locus.contig),
        pos=ds.locus.position,
        ref=ds.alleles[0],
        alt=ds.alleles[1],
    )

    ds = ds.key_by("locus", "alleles")

    return ds

Esempio n. 34

0

Mostra file

File: gene.py Progetto: drhodesbrc/gnomad-browser

def import_hgnc(path):
    ds = hl.import_table(path, missing="")

    ds = ds.select(
        hgnc_id=ds["HGNC ID"],
        symbol=ds["Approved symbol"],
        name=ds["Approved name"],
        previous_symbols=ds["Previous symbols"],
        alias_symbols=ds["Alias symbols"],
        omim_id=ds["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(ds["Ensembl gene ID"],
                           ds["Ensembl ID(supplied by Ensembl)"]),
    )
    ds = ds.filter(hl.is_defined(ds.gene_id)).key_by("gene_id")

    ds = ds.annotate(
        previous_symbols=hl.set(
            ds.previous_symbols.split(",").map(lambda s: s.strip())),
        alias_symbols=hl.set(
            ds.alias_symbols.split(",").map(lambda s: s.strip())),
    )

    return ds

Esempio n. 35

0

Mostra file

    def test_import_bgen_variant_filtering_from_literals(self):
        bgen_file = resource('example.8bits.bgen')

        hl.index_bgen(bgen_file, contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000),
                      alleles=alleles),  # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=1,  # forcing seek to be called
            variants=desired_variants)
        self.assertTrue(part_1.rows().key_by(
            'locus', 'alleles').select().collect() == expected_result)

        part_199 = hl.import_bgen(
            bgen_file,
            ['GT'],
            n_partitions=
            199,  # forcing each variant to be its own partition for testing duplicates work properly
            variants=desired_variants)
        self.assertTrue(part_199.rows().key_by(
            'locus', 'alleles').select().collect() == expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(
            hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))

Esempio n. 36

0

Mostra file

File: test_impex.py Progetto: lfrancioli/hail

    def test_import_bgen_variant_filtering_from_literals(self):
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding={'01': '1'})

        alleles = ['A', 'G']

        desired_variants = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        expected_result = [
            hl.Struct(locus=hl.Locus('1', 2000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 2001), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 4000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles),
            hl.Struct(locus=hl.Locus('1', 10000), alleles=alleles), # Duplicated variant
            hl.Struct(locus=hl.Locus('1', 100001), alleles=alleles),
        ]

        part_1 = hl.import_bgen(bgen_file,
                                ['GT'],
                                n_partitions=1, # forcing seek to be called
                                variants=desired_variants)
        self.assertTrue(part_1.rows().key_by('locus', 'alleles').select().collect() == expected_result)

        part_199 = hl.import_bgen(bgen_file,
                                ['GT'],
                                n_partitions=199, # forcing each variant to be its own partition for testing duplicates work properly
                                variants=desired_variants)
        self.assertTrue(part_199.rows().key_by('locus', 'alleles').select().collect() == expected_result)

        everything = hl.import_bgen(bgen_file, ['GT'])
        self.assertEqual(everything.count(), (199, 500))

        expected = everything.filter_rows(hl.set(desired_variants).contains(everything.row_key))

        self.assertTrue(expected._same(part_1))

Esempio n. 37

0

Mostra file

File: helpers.py Progetto: jigold/hail

def create_all_values():
    return hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

Esempio n. 38

0

Mostra file

File: test_file_formats.py Progetto: bcajes/hail

def create_all_values_datasets():
    all_values = hl.struct(
        f32=hl.float32(3.14),
        i64=hl.int64(-9),
        m=hl.null(hl.tfloat64),
        astruct=hl.struct(a=hl.null(hl.tint32), b=5.5),
        mstruct=hl.null(hl.tstruct(x=hl.tint32, y=hl.tstr)),
        aset=hl.set(['foo', 'bar', 'baz']),
        mset=hl.null(hl.tset(hl.tfloat64)),
        d=hl.dict({hl.array(['a', 'b']): 0.5, hl.array(['x', hl.null(hl.tstr), 'z']): 0.3}),
        md=hl.null(hl.tdict(hl.tint32, hl.tstr)),
        h38=hl.locus('chr22', 33878978, 'GRCh38'),
        ml=hl.null(hl.tlocus('GRCh37')),
        i=hl.interval(
            hl.locus('1', 999),
            hl.locus('1', 1001)),
        c=hl.call(0, 1),
        mc=hl.null(hl.tcall),
        t=hl.tuple([hl.call(1, 2, phased=True), 'foo', hl.null(hl.tstr)]),
        mt=hl.null(hl.ttuple(hl.tlocus('GRCh37'), hl.tbool))
    )

    def prefix(s, p):
        return hl.struct(**{p + k: s[k] for k in s})

    all_values_table = (hl.utils.range_table(5, n_partitions=3)
                        .annotate_globals(**prefix(all_values, 'global_'))
                        .annotate(**all_values)
                        .cache())

    all_values_matrix_table = (hl.utils.range_matrix_table(3, 2, n_partitions=2)
                               .annotate_globals(**prefix(all_values, 'global_'))
                               .annotate_rows(**prefix(all_values, 'row_'))
                               .annotate_cols(**prefix(all_values, 'col_'))
                               .annotate_entries(**prefix(all_values, 'entry_'))
                               .cache())

    return all_values_table, all_values_matrix_table

Esempio n. 39

0

Mostra file

File: ld_score_regression.py Progetto: jigold/hail

def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given a set or multiple sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of a trait or set of traits and the level of confounding biases present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                        ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` are must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes below for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs: :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht

Esempio n. 40

0

Mostra file

File: test_family_methods.py Progetto: jigold/hail

    def test_mendel_errors(self):
        mt = hl.import_vcf(resource('mendel.vcf'))
        ped = hl.Pedigree.read(resource('mendel.fam'))

        men, fam, ind, var = hl.mendel_errors(mt['GT'], ped)

        self.assertEqual(men.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr))
        self.assertEqual(men.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   mendel_code=hl.tint))
        self.assertEqual(fam.key.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr))
        self.assertEqual(fam.row.dtype, hl.tstruct(pat_id=hl.tstr,
                                                   mat_id=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   children=hl.tint,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(ind.key.dtype, hl.tstruct(s=hl.tstr))
        self.assertEqual(ind.row.dtype, hl.tstruct(s=hl.tstr,
                                                   fam_id=hl.tstr,
                                                   errors=hl.tint64,
                                                   snp_errors=hl.tint64))
        self.assertEqual(var.key.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr)))
        self.assertEqual(var.row.dtype, hl.tstruct(locus=mt.locus.dtype,
                                                   alleles=hl.tarray(hl.tstr),
                                                   errors=hl.tint64))

        self.assertEqual(men.count(), 41)
        self.assertEqual(fam.count(), 2)
        self.assertEqual(ind.count(), 7)
        self.assertEqual(var.count(), mt.count_rows())

        self.assertEqual(set(fam.select('children', 'errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(pat_id='Dad1', mat_id='Mom1', children=2,
                                             errors=41, snp_errors=39),
                             hl.utils.Struct(pat_id='Dad2', mat_id='Mom2', children=1,
                                             errors=0, snp_errors=0)
                         })

        self.assertEqual(set(ind.select('errors', 'snp_errors').collect()),
                         {
                             hl.utils.Struct(s='Son1', errors=23, snp_errors=22),
                             hl.utils.Struct(s='Dtr1', errors=18, snp_errors=17),
                             hl.utils.Struct(s='Dad1', errors=19, snp_errors=18),
                             hl.utils.Struct(s='Mom1', errors=22, snp_errors=21),
                             hl.utils.Struct(s='Dad2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Mom2', errors=0, snp_errors=0),
                             hl.utils.Struct(s='Son2', errors=0, snp_errors=0)
                         })

        to_keep = hl.set([
            (hl.Locus("1", 1), ['C', 'CT']),
            (hl.Locus("1", 2), ['C', 'T']),
            (hl.Locus("X", 1), ['C', 'T']),
            (hl.Locus("X", 3), ['C', 'T']),
            (hl.Locus("Y", 1), ['C', 'T']),
            (hl.Locus("Y", 3), ['C', 'T'])
        ])
        self.assertEqual(var.filter(to_keep.contains((var.locus, var.alleles)))
                         .order_by('locus')
                         .select('locus', 'alleles', 'errors').collect(),
                         [
                             hl.utils.Struct(locus=hl.Locus("1", 1), alleles=['C', 'CT'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("1", 2), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("X", 1), alleles=['C', 'T'], errors=2),
                             hl.utils.Struct(locus=hl.Locus("X", 3), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 1), alleles=['C', 'T'], errors=1),
                             hl.utils.Struct(locus=hl.Locus("Y", 3), alleles=['C', 'T'], errors=1),
                         ])

        ped2 = hl.Pedigree.read(resource('mendelWithMissingSex.fam'))
        men2, _, _, _ = hl.mendel_errors(mt['GT'], ped2)

        self.assertTrue(men2.filter(men2.s == 'Dtr1')._same(men.filter(men.s == 'Dtr1')))

Esempio n. 41

0

Mostra file

File: vcf_combiner.py Progetto: bcajes/hail

def merge_alleles(alleles) -> ArrayExpression:
    return hl.array(hl.set(hl.flatten(alleles)))