Code example #1
            def with_pl(pl):
                new_exprs = {}
                dropped_fields = ['LA']
                if 'LGT' in fields:
                    new_exprs['GT'] = hl.rbind(
                        old_entry.LGT, lambda lgt: hl.if_else(
                            lgt.is_non_ref(),
                            hl.downcode(
                                lgt,
                                hl.or_else(local_a_index, hl.len(old_entry.LA))
                            ), lgt))
                    dropped_fields.append('LGT')
                if 'LPGT' in fields:
                    new_exprs['PGT'] = hl.rbind(
                        old_entry.LPGT, lambda lpgt: hl.if_else(
                            lpgt.is_non_ref(),
                            hl.downcode(
                                lpgt,
                                hl.or_else(local_a_index, hl.len(old_entry.LA))
                            ), lpgt))
                    dropped_fields.append('LPGT')
                if 'LAD' in fields:
                    non_ref_ad = hl.or_else(old_entry.LAD[local_a_index],
                                            0)  # zeroed if not in LAD
                    new_exprs['AD'] = hl.or_missing(
                        hl.is_defined(old_entry.LAD),
                        [hl.sum(old_entry.LAD) - non_ref_ad, non_ref_ad])
                    dropped_fields.append('LAD')
                if 'LPL' in fields:
                    new_exprs['PL'] = pl
                    if 'GQ' in fields:
                        new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl),
                                                     old_entry.GQ)

                    dropped_fields.append('LPL')

                return (hl.case()
                        .when(hl.len(ds.alleles) == 1,
                              old_entry.annotate(
                                  **{f[1:]: old_entry[f]
                                     for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                     if f in fields}).drop(*dropped_fields))
                        .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                              old_entry.annotate(
                                  **{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                                     for f, e in new_exprs.items()}).drop(*dropped_fields))
                        .default(old_entry.annotate(**new_exprs)
                                 .drop(*dropped_fields)))
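Every example on this page turns on the same primitive: hl.or_else(a, b) evaluates to a when a is defined and to b otherwise. A minimal sketch of the semantics (in older Hail releases hl.missing is spelled hl.null):

import hail as hl

hl.eval(hl.or_else(5, 0))                      # 5
hl.eval(hl.or_else(hl.missing(hl.tint32), 0))  # 0

# In the snippet above, hl.or_else(local_a_index, hl.len(old_entry.LA))
# falls back to the length of the local-allele array when local_a_index is
# missing, so hl.downcode always receives a valid allele index.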
Code example #2
File: phenotype_loading.py Project: wlu04/ukb_common
def load_covid_data(all_samples_ht: hl.Table, covid_data_path: str, wave: str = '01'):
    print(f'Loading COVID wave {wave}...')
    covid_ht = hl.import_table(covid_data_path, delimiter='\t', missing='', impute=True, key='eid')
    covid_ht = covid_ht.group_by('eid').aggregate(
        origin=hl.agg.any(covid_ht.origin == 1),
        result=hl.agg.any(covid_ht.result == 1),
        inpatient=hl.agg.any(covid_ht.reqorg == 1),
    )

    # TODO: add aoo parse to separate trait_type (covid_quantitative?)
    # dob = load_dob_ht(pre_phesant_tsv_path)[ht.key].date_of_birth
    # ht = ht.annotate(aoo=hl.or_missing(ht.result == 1, hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT') - dob),
    #                  specdate=hl.experimental.strptime(ht.specdate + ' 00:00:00', '%d/%m/%Y %H:%M:%S', 'GMT')).drop('specdate')

    ht = all_samples_ht.annotate(**covid_ht[all_samples_ht.key])
    centers = hl.literal(ENGLAND_RECRUITMENT_CENTERS)

    analyses = {
        'B1_v2': hl.or_missing(ht.result, ht.inpatient),  # fka ANA2
        'B1_v2_origin': hl.or_missing(ht.result, ht.origin),  # fka ANA2
        'C2_v2': hl.or_else(ht.result, False),  # fka ANA5
        'C2_v2_england_controls': hl.or_missing(centers.contains(ht.recruitment_center),  # fka ANA5_england_controls
                                               hl.or_else(ht.result, False)),
        'C1_v2': ht.result,  # fka ANA5_strict
        'B2_v2': hl.or_else(ht.result & ht.inpatient, False),  # fka ANA6
        'B2_v2_origin': hl.or_else(ht.result & ht.origin, False)  # fka ANA6
    }
    analysis_names = {
        'B1_v2': 'Hospitalized vs non-hospitalized (among COVID-19 positive)',  # fka ANA2
        'B1_v2_origin': 'Hospitalized vs non-hospitalized (among COVID-19 positive; old definition using "origin" field)',  # fka ANA2
        'C2_v2': 'COVID-19 positive (controls include untested)',  # fka ANA5
        'C2_v2_england_controls': 'COVID-19 positive (controls include untested), only patients from centers in England',  # fka ANA5_england_controls
        'C1_v2': 'COVID-19 positive (controls only COVID-19 negative)',  # fka ANA5_strict
        'B2_v2': 'Hospitalized vs non-hospitalized (controls include untested)',  # ANA6
        'B2_v2_origin': 'Hospitalized vs non-hospitalized (controls include untested; old definition using "origin" field)'  # ANA6
    }
    assert set(analyses.keys()) == set(analysis_names.keys())

    ht = ht.select(**analyses)
    mt = filter_and_annotate_ukb_data(ht, lambda k, v: True, annotate_with_showcase=False,
                                      format_col_name=lambda x: x)
    mt = mt.key_cols_by(trait_type='categorical', phenocode='COVID19', pheno_sex='both_sexes',
                        coding=mt.phenocode, modifier=wave)
    mt = mt.annotate_cols(description=hl.literal(analysis_names)[mt.coding])

    mt.annotate_cols(
        n_cases=hl.agg.count_where(mt.value == 1.0),
        n_controls=hl.agg.count_where(mt.value == 0.0)
    ).cols().show()

    return mt
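The case/control definitions above hinge on the difference between hl.or_else and hl.or_missing: or_else(ht.result, False) turns untested samples into controls, while or_missing(ht.result, ...) drops them from the analysis entirely. A small sketch:

import hail as hl

untested = hl.missing(hl.tbool)         # a sample with no test result
hl.eval(hl.or_else(untested, False))    # False -> counted as a control
hl.eval(hl.or_missing(untested, True))  # None  -> excluded from the analysis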
Code example #3
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("gencode")
    parser.add_argument("canonical_transcripts")
    parser.add_argument("hgnc")
    parser.add_argument("--min-partitions", type=int, default=8)
    parser.add_argument("--output", required=True)
    args = parser.parse_args()

    # Load genes from GTF file
    genes = load_gencode_gene_models(args.gencode, min_partitions=args.min_partitions)
    genes = genes.transmute(gencode_gene_symbol=genes.gene_symbol)

    # Annotate genes with canonical transcript
    canonical_transcripts = load_canonical_transcripts(args.canonical_transcripts, min_partitions=args.min_partitions)
    genes = genes.annotate(canonical_transcript_id=canonical_transcripts[genes.gene_id].transcript_id)

    # Drop transcripts except for canonical
    genes = genes.annotate(
        canonical_transcript=genes.transcripts.filter(
            lambda transcript: transcript.transcript_id == genes.canonical_transcript_id
        ).head()
    )
    genes = genes.drop("transcripts")

    # Annotate genes with information from HGNC
    hgnc = load_hgnc(args.hgnc)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(symbol_source=hl.cond(hl.is_defined(genes.symbol), "hgnc", hl.null(hl.tstr)))
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, genes.gencode_gene_symbol),
        symbol_source=hl.or_else(genes.symbol_source, "gencode"),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        symbol_upper_case=genes.symbol.upper(),
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(genes.previous_symbols)
            .extend(genes.alias_symbols)
            .append(genes.gencode_gene_symbol)
            .map(lambda s: s.upper())
        ),
    )

    genes.describe()

    genes.write(args.output, overwrite=True)
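One caveat in the search_terms construction: extending an array with a missing array makes the entire result missing, so a missing previous_symbols or alias_symbols would wipe out the whole set. Code example #19 below guards against exactly this with hl.or_else(..., hl.empty_array(hl.tstr)); a sketch of the behavior:

import hail as hl

terms = hl.empty_array(hl.tstr).append('PKD1')
missing_aliases = hl.missing(hl.tarray(hl.tstr))

hl.eval(terms.extend(missing_aliases))  # None: missingness propagates
hl.eval(terms.extend(hl.or_else(missing_aliases, hl.empty_array(hl.tstr))))  # ['PKD1']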
Code example #4
File: test_linalg.py Project: danking/hail
    def test_from_entry_expr(self):
        mt = get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self._assert_eq(a1, a2)
        self._assert_eq(a1, a3)

        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self._assert_eq(a1, a4)
Code example #5
    def test_from_entry_expr(self):
        mt = get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self._assert_eq(a1, a2)
        self._assert_eq(a1, a3)

        with hl.TemporaryDirectory(ensure_exists=False) as path:
            BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
            a4 = BlockMatrix.read(path).to_numpy()
            self._assert_eq(a1, a4)
Code example #6
 def annotate_related_pairs(related_pairs: hl.Table,
                            index_col: str) -> hl.Table:
     related_pairs = related_pairs.key_by(**related_pairs[index_col])
     related_pairs = related_pairs.filter(
         hl.is_missing(case_parents[related_pairs.key]))
     return related_pairs.annotate(
         **{
             index_col:
             related_pairs[index_col].annotate(
                 case_rank=hl.or_else(
                     hl.int(meta_ht[related_pairs.key].is_case), -1),
                 dp_mean=hl.or_else(
                     sample_qc_ht[
                         related_pairs.key].sample_qc.dp_stats.mean, -1.0))
         }).key_by()
Code example #7
    def test_from_entry_expr(self):
        mt = self.get_dataset()
        mt = mt.annotate_entries(x=hl.or_else(mt.GT.n_alt_alleles(), 0)).cache()

        a1 = BlockMatrix.from_entry_expr(hl.or_else(mt.GT.n_alt_alleles(), 0), block_size=32).to_numpy()
        a2 = BlockMatrix.from_entry_expr(mt.x, block_size=32).to_numpy()
        a3 = BlockMatrix.from_entry_expr(hl.float64(mt.x), block_size=32).to_numpy()

        self.assertTrue(np.array_equal(a1, a2))
        self.assertTrue(np.array_equal(a1, a3))

        path = new_temp_file()
        BlockMatrix.write_from_entry_expr(mt.x, path, block_size=32)
        a4 = BlockMatrix.read(path).to_numpy()
        self.assertTrue(np.array_equal(a1, a4))
Code example #8
def annotate_variants_gnomad_mismatch(mt, gnomad_mismatch_ht):
    """
    Imports a list of 'bad' variants that have significantly different frequencies between gnomAD exomes and genomes
    in the NFE population.
    :param mt: matrix table to annotate
    :param gnomad_mismatch_ht: path to the gnomAD mismatch variant Table to load
    :return: returns annotated matrix table
    """
    gnomad_mismatch_list = hl.read_table(gnomad_mismatch_ht)
    gnomad_mismatch_list = gnomad_mismatch_list.annotate(gnomad_mismatch=True)

    # gnomad_mismatch True/False boolean annotation
    mt = mt.annotate_rows(
        gnomad_mismatch_pvalue=gnomad_mismatch_list.index(mt.row_key).p_value,
        gnomad_mismatch_variantid=gnomad_mismatch_list.index(
            mt.row_key).variant,
        gnomad_mismatch=gnomad_mismatch_list.index(mt.row_key).gnomad_mismatch)

    # Fill in empty values for gnomad mismatch with False
    mt = mt.annotate_rows(
        gnomad_mismatch=hl.or_else(mt.gnomad_mismatch, False))

    mt = mt.annotate_globals(gnomad_mismatch_file=gnomad_mismatch_ht)

    return mt
Code example #9
def make_sumstats_bm(sumstats_bm_path, high_quality):
    meta_mt = hl.read_matrix_table(get_meta_analysis_results_path())
    clump_mt = hl.read_matrix_table(
        get_clumping_results_path(high_quality_only=high_quality)).rename(
            {'pop': 'clump_pops'})
    mt = all_axis_join(meta_mt, clump_mt)
    mt = separate_results_mt_by_pop(mt,
                                    'clump_pops',
                                    'plink_clump',
                                    skip_drop=True)
    mt = separate_results_mt_by_pop(mt,
                                    'meta_analysis_data',
                                    'meta_analysis',
                                    skip_drop=True)
    mt = mt.filter_cols(mt.meta_analysis_data.pop == mt.clump_pops)
    mt = explode_by_p_threshold(mt).unfilter_entries()

    mt = mt.filter_cols((mt.description == 'Type 2 diabetes')
                        & (mt.p_threshold == 1))

    BlockMatrix.write_from_entry_expr(hl.or_else(
        mt.meta_analysis.BETA * hl.is_defined(mt.plink_clump.TOTAL) *
        hl.int(mt.meta_analysis.Pvalue < mt.p_threshold), 0.0),
                                      sumstats_bm_path,
                                      overwrite=True)
Code example #10
File: tx_annotation.py Project: xjyx/tx_annotation
def pull_out_worst_from_tx_annotate(mt):
    csq_order = []
    for loftee_filter in ["HC", "LC"]:
        for no_flag in [True, False]:
            for consequence in CSQ_CODING_HIGH_IMPACT:
                csq_order.append((loftee_filter, no_flag, consequence))

    # prioritization of mis and syn variant on protein coding transcripts
    csq_order.extend([(hl.null(hl.tstr), True, x)
                      for x in CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT
                      ])

    # Any variant on a non protein coding transcript (ie. where LOF = None)
    csq_order.extend([(hl.null(hl.tstr), True, x)
                      for x in CSQ_CODING_HIGH_IMPACT +
                      CSQ_CODING_MEDIUM_IMPACT + CSQ_CODING_LOW_IMPACT])

    csq_order = hl.literal({(x): i for i, x in enumerate(csq_order)})

    mt = mt.annotate_rows(**hl.sorted(
        mt.tx_annotation,
        key=lambda x: csq_order[
            (x.lof, hl.or_else(hl.is_missing(x.lof_flag), False), x.csq)])[0])

    return mt
Code example #11
File: vcf_combiner.py Project: jigold/hail
 def merge_alleles(alleles):
     from hail.expr.functions import _num_allele_type, _allele_ints
     return hl.rbind(
         alleles.map(lambda a: hl.or_else(a[0], ''))
                .fold(lambda s, t: hl.cond(hl.len(s) > hl.len(t), s, t), ''),
         lambda ref:
         hl.rbind(
             alleles.map(
                 lambda al: hl.rbind(
                     al[0],
                     lambda r:
                     hl.array([ref]).extend(
                         al[1:].map(
                             lambda a:
                             hl.rbind(
                                 _num_allele_type(r, a),
                                 lambda at:
                                 hl.cond(
                                     (_allele_ints['SNP'] == at) |
                                     (_allele_ints['Insertion'] == at) |
                                     (_allele_ints['Deletion'] == at) |
                                     (_allele_ints['MNP'] == at) |
                                     (_allele_ints['Complex'] == at),
                                     a + ref[hl.len(r):],
                                     a)))))),
             lambda lal:
             hl.struct(
                 globl=hl.array([ref]).extend(hl.array(hl.set(hl.flatten(lal)).remove(ref))),
                 local=lal)))
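A worked trace of the merge on a hypothetical two-gVCF input (the ordering of globl past the reference allele comes from a set, so treat it as illustrative):

import hail as hl

# One gVCF saw ref "A" with SNP "T"; the other saw ref "AC" with deletion "A".
hl.eval(merge_alleles(hl.literal([['A', 'T'], ['AC', 'A']])))
# ref becomes "AC" (the longest reference allele), and the SNP "T" is
# right-padded to "TC" so it is expressed against "AC":
# Struct(globl=['AC', 'TC', 'A'], local=[['AC', 'TC'], ['AC', 'A']])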
Code example #12
def hwe_normalize(call_expr):
    mt = matrix_table_source('hwe_normalize/call_expr', call_expr)
    mt = mt.select_entries(__gt=call_expr.n_alt_alleles())
    mt = mt.annotate_rows(__AC=agg.sum(mt.__gt),
                          __n_called=agg.count_where(hl.is_defined(mt.__gt)))
    mt = mt.filter_rows((mt.__AC > 0) & (mt.__AC < 2 * mt.__n_called))

    n_variants = mt.count_rows()
    if n_variants == 0:
        raise FatalError(
            "hwe_normalize: found 0 variants after filtering out monomorphic sites."
        )
    info(
        f"hwe_normalize: found {n_variants} variants after filtering out monomorphic sites."
    )

    mt = mt.annotate_rows(__mean_gt=mt.__AC / mt.__n_called)
    mt = mt.annotate_rows(__hwe_scaled_std_dev=hl.sqrt(mt.__mean_gt *
                                                       (2 - mt.__mean_gt) *
                                                       n_variants / 2))
    mt = mt.unfilter_entries()

    normalized_gt = hl.or_else(
        (mt.__gt - mt.__mean_gt) / mt.__hwe_scaled_std_dev, 0.0)
    return normalized_gt
Code example #13
File: create_fam.py Project: edenkal13/gnomad_qc
def run_mendel_errors() -> hl.Table:
    meta_ht = meta.ht()
    ped = pedigree.versions[f"{CURRENT_RELEASE}_raw"].pedigree()
    logger.info(f"Running Mendel errors for {len(ped.trios)} trios.")

    fake_ped = create_fake_pedigree(
        n=100,
        sample_list=list(
            meta_ht.aggregate(
                hl.agg.filter(
                    hl.rand_bool(0.01)
                    & ((hl.len(meta_ht.qc_metrics_filters) == 0)
                       & hl.or_else(hl.len(meta_ht.hard_filters) == 0, False)),
                    hl.agg.collect_as_set(meta_ht.s),
                ))),
        real_pedigree=ped,
    )
    merged_ped = hl.Pedigree(trios=ped.trios + fake_ped.trios)

    ped_samples = hl.literal(
        set([
            s for trio in merged_ped.trios
            for s in [trio.s, trio.pat_id, trio.mat_id]
        ]))
    mt = get_gnomad_v3_mt(key_by_locus_and_alleles=True)
    mt = mt.filter_cols(ped_samples.contains(mt.s))
    mt = hl.filter_intervals(
        mt, [hl.parse_locus_interval("chr20", reference_genome='GRCh38')])
    mt = hl.experimental.sparse_split_multi(mt, filter_changed_loci=True)
    mt = mt.select_entries("GT", "END")
    mt = hl.experimental.densify(mt)
    mt = mt.filter_rows(hl.len(mt.alleles) == 2)
    mendel_errors, _, _, _ = hl.mendel_errors(mt["GT"], merged_ped)
    return mendel_errors
Code example #14
def combine_pheno_files(pheno_file_dict: dict):
    full_mt: hl.MatrixTable = None
    for data_type, mt in pheno_file_dict.items():
        if 'pheno' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=hl.str(mt.pheno), coding=mt.coding)
            criteria = mt.value if data_type == 'categorical' else hl.is_defined(
                mt.value)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(criteria))
            mt = mt.select_entries(value=hl.float64(mt.value))
        elif 'icd_code' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=mt.icd_code, coding=mt.icd_version)
            mt = mt.filter_cols(mt.truncated)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.any_codes))
            mt = mt.select_entries(value=hl.float64(mt.any_codes))
        elif 'phecode' in list(mt.col_key):
            mt = mt.key_cols_by(pheno=mt.phecode, coding=mt.phecode_sex)
            mt = mt.annotate_cols(n_cases=hl.agg.count_where(mt.case_control))
            mt = mt.select_entries(value=hl.float64(mt.case_control))
        elif 'Generic_Name' in list(mt.col_key):
            mt = mt.select_entries(
                value=hl.float64(hl.or_else(hl.len(mt.values) > 0, False)))
            mt2 = mt.group_cols_by(
                pheno=mt.Drug_Category_and_Indication,
                coding=mt.Drug_Category_and_Indication).aggregate(
                    value=hl.float64(hl.agg.any(mt.value > 0)))
            mt = mt.key_cols_by(
                pheno=mt.Generic_Name,
                coding=mt.Drug_Category_and_Indication).select_cols()
            mt = mt.union_cols(mt2)
            mt = mt.annotate_cols(n_cases=hl.int64(hl.agg.sum(mt.value)))
        else:
            raise ValueError(
                'Unknown column key (expected pheno, icd_code, phecode, or Generic_Name). New data type?')
        mt = mt.select_cols('n_cases',
                            data_type=data_type,
                            n_defined=hl.agg.count_where(
                                hl.is_defined(mt.value)))
        if full_mt is None:
            full_mt = mt
        else:
            full_mt = full_mt.union_cols(mt,
                                         row_join_type='outer' if data_type
                                         == 'prescriptions' else 'inner')
    full_mt = full_mt.unfilter_entries()
    return full_mt.select_entries(value=hl.cond(
        full_mt.data_type == 'prescriptions',
        hl.or_else(full_mt.value, hl.float64(0.0)), full_mt.value))
Code example #15
def prepare_variant_results():
    results_path = pipeline_config.get("SCHEMA", "variant_results_path")
    annotations_path = pipeline_config.get("SCHEMA",
                                           "variant_annotations_path")

    results = hl.read_table(results_path)

    results = results.drop("v", "af_case", "af_ctrl")

    # Add n_denovos to AC_case
    results = results.annotate(ac_case=hl.or_else(results.ac_case, 0) +
                               hl.or_else(results.n_denovos, 0))

    results = results.annotate(
        source=hl.delimit(hl.sorted(hl.array(results.source)), ", "))

    results = results.group_by(
        "locus",
        "alleles").aggregate(group_results=hl.agg.collect(results.row_value))
    results = results.annotate(group_results=hl.dict(
        results.group_results.map(lambda group_result:
                                  (group_result.analysis_group,
                                   group_result.drop("analysis_group")))))

    variants = hl.read_table(annotations_path)
    variants = variants.select(
        gene_id=variants.gene_id,
        consequence=hl.case()
        .when((variants.canonical_term == "missense_variant") & (variants.mpc >= 3),
              "missense_variant_mpc_>=3")
        .when((variants.canonical_term == "missense_variant") & (variants.mpc >= 2),
              "missense_variant_mpc_2-3")
        .when(variants.canonical_term == "missense_variant", "missense_variant_mpc_<2")
        .default(variants.canonical_term),
        hgvsc=variants.hgvsc_canonical.split(":")[-1],
        hgvsp=variants.hgvsp_canonical.split(":")[-1],
        info=hl.struct(cadd=variants.cadd,
                       mpc=variants.mpc,
                       polyphen=variants.polyphen),
    )

    variants = variants.annotate(**results[variants.key])
    variants = variants.filter(hl.is_defined(variants.group_results))

    return variants
Code example #16
def impute_missing_gp(mt, location: str = 'GP', mean_impute: bool = True):
    mt = mt.annotate_entries(_gp = mt[location])
    if mean_impute:
        mt = mt.annotate_rows(_mean_gp=hl.agg.array_agg(lambda x: hl.agg.mean(x), mt._gp))
        gp_expr = mt._mean_gp
    else:
        gp_expr = [1.0, 0.0, 0.0]
    return mt.annotate_entries(**{location: hl.or_else(mt._gp, gp_expr)}).drop('_gp')
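A hypothetical usage sketch on a toy dataset (balding_nichols_model and the fabricated GP field are stand-ins, not part of the original pipeline). With mean_impute=True a missing GP becomes the per-variant mean; with mean_impute=False it becomes the homozygous-reference prior [1.0, 0.0, 0.0]:

import hail as hl

mt = hl.balding_nichols_model(n_populations=1, n_samples=4, n_variants=5)
# Fabricate a GP entry field that is missing for every non-het call.
mt = mt.annotate_entries(GP=hl.or_missing(mt.GT.is_het(), [0.0, 1.0, 0.0]))
mt = impute_missing_gp(mt, location='GP', mean_impute=True)
mt.entries().select('GP').show()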
Code example #17
File: sparse_split_multi.py Project: troels/hail
            def with_pl(pl):
                new_exprs = {}
                dropped_fields = ['LA']
                if 'LGT' in fields:
                    new_exprs['GT'] = hl.downcode(
                        old_entry.LGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LGT')
                if 'LPGT' in fields:
                    new_exprs['PGT'] = hl.downcode(
                        old_entry.LPGT,
                        hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LPGT')
                if 'LAD' in fields:
                    new_exprs['AD'] = hl.or_missing(
                        hl.is_defined(old_entry.LAD), [
                            old_entry.LAD[0],
                            hl.or_else(old_entry.LAD[local_a_index], 0)
                        ])  # second entry zeroed for lack of non-ref AD
                    dropped_fields.append('LAD')
                if 'LPL' in fields:
                    new_exprs['PL'] = pl
                    if 'GQ' in fields:
                        new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl),
                                                     old_entry.GQ)

                    dropped_fields.append('LPL')

                return (hl.case()
                        .when(hl.len(ds.alleles) == 1,
                              old_entry.annotate(
                                  **{f[1:]: old_entry[f]
                                     for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                     if f in fields}).drop(*dropped_fields))
                        .when(hl.or_else(old_entry.LGT.is_hom_ref(), False),
                              old_entry.annotate(
                                  **{f: old_entry[f'L{f}'] if f in ['GT', 'PGT'] else e
                                     for f, e in new_exprs.items()}).drop(*dropped_fields))
                        .default(old_entry.annotate(**new_exprs)
                                 .drop(*dropped_fields)))
Code example #18
def generate_final_rf_ht(
    ht: hl.Table,
    snp_cutoff: Union[int, float],
    indel_cutoff: Union[int, float],
    inbreeding_coeff_cutoff: float = INBREEDING_COEFF_HARD_CUTOFF,
    determine_cutoff_from_bin: bool = False,
    aggregated_bin_ht: Optional[hl.Table] = None,
    bin_id: Optional[hl.expr.Int32Expression] = None,
) -> hl.Table:
    """
    Prepares finalized RF model given an RF result table from `rf.apply_rf_model` and cutoffs for filtering.
    If `determine_cutoff_from_bin` is True, `aggregated_bin_ht` must be supplied to determine the SNP and indel RF
    probabilities to use as cutoffs from an aggregated quantile bin Table like one created by
    `compute_grouped_binned_ht` in combination with `score_bin_agg`.
    :param ht: RF result table from `rf.apply_rf_model` to prepare as the final RF Table
    :param snp_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for SNP variant QC filter
    :param indel_cutoff: RF probability or bin (if `determine_cutoff_from_bin` True) to use for indel variant QC filter
    :param inbreeding_coeff_cutoff: InbreedingCoeff hard filter to use for variants
    :param determine_cutoff_from_bin: If True RF probability will be determined using bin info in `aggregated_bin_ht`
    :param aggregated_bin_ht: File with aggregate counts of variants based on quantile bins
    :param bin_id: Name of bin to use in 'bin_id' column of `aggregated_bin_ht` to use to determine probability cutoff
    :return: Finalized random forest Table annotated with variant filters
    """
    # Determine SNP and indel RF cutoffs if given bin instead of RF probability

    snp_cutoff_global = hl.struct(min_score=snp_cutoff)
    indel_cutoff_global = hl.struct(min_score=indel_cutoff)

    # Add filters to RF HT
    filters = dict()

    if ht.any(hl.is_missing(ht.rf_probability["TP"])):
        raise ValueError("Missing RF probability!")

    filters["RF"] = (
        hl.is_snp(ht.alleles[0], ht.alleles[1])
        & (ht.rf_probability["TP"] < snp_cutoff_global.min_score)) | (
            ~hl.is_snp(ht.alleles[0], ht.alleles[1])
            & (ht.rf_probability["TP"] < indel_cutoff_global.min_score))

    # Fix annotations for release
    annotations_expr = {
        "rf_positive_label": hl.or_else(ht.tp, False),
        "rf_negative_label": ht.fail_hard_filters,
        "rf_probability": ht.rf_probability["TP"],
    }

    ht = ht.transmute(filters=add_filters_expr(filters=filters),
                      **annotations_expr)

    ht = ht.annotate_globals(rf_snv_cutoff=snp_cutoff_global,
                             rf_indel_cutoff=indel_cutoff_global)

    return ht
Code example #19
def prepare_gene_models():
    genes_grch37 = prepare_gene_models_helper("GRCh37")
    genes_grch38 = prepare_gene_models_helper("GRCh38")

    genes_grch37 = genes_grch37.select(GRCh37=genes_grch37.row_value)
    genes_grch38 = genes_grch38.select(GRCh38=genes_grch38.row_value)

    genes = genes_grch37.join(genes_grch38, how="outer")

    # Annotate genes with information from HGNC
    hgnc_path = pipeline_config.get("reference_data", "hgnc_path")
    hgnc = load_hgnc(hgnc_path)
    genes = genes.annotate(**hgnc[genes.gene_id])
    genes = genes.annotate(
        symbol=hl.or_else(genes.symbol, hl.or_else(genes.GRCh38.gencode_gene_symbol, genes.GRCh37.gencode_gene_symbol)),
    )

    # Collect all fields that can be used to search by gene symbol
    genes = genes.annotate(
        search_terms=hl.set(
            hl.empty_array(hl.tstr)
            .append(genes.symbol)
            .extend(hl.or_else(genes.previous_symbols, hl.empty_array(hl.tstr)))
            .extend(hl.or_else(genes.alias_symbols, hl.empty_array(hl.tstr)))
            .append(genes.GRCh38.gencode_gene_symbol)
            .append(genes.GRCh37.gencode_gene_symbol)
            .filter(hl.is_defined)
            .map(lambda s: s.upper())
        ),
    )

    gnomad_constraint_path = pipeline_config.get("reference_data", "gnomad_constraint_path")
    gnomad_constraint = prepare_gnomad_constraint(gnomad_constraint_path)
    genes = genes.annotate(gnomad_constraint=gnomad_constraint[genes.GRCh37.canonical_transcript_id])

    exac_constraint_path = pipeline_config.get("reference_data", "exac_constraint_path")
    exac_constraint = prepare_exac_constraint(exac_constraint_path)
    genes = genes.annotate(exac_constraint=exac_constraint[genes.GRCh37.canonical_transcript_id])

    staging_path = pipeline_config.get("output", "staging_path")

    genes.write(f"{staging_path}/gene_models.ht", overwrite=True)
Code example #20
File: sparse_mt.py Project: enriquea/gnomad_hail
def compute_last_ref_block_end(mt: hl.MatrixTable) -> hl.Table:
    """
    This function takes a sparse MT and computes for each row the genomic position of the
    most upstream reference block overlapping that row.

    Note that since reference blocks do not extend beyond contig boundaries, only the position is kept.

    This function returns a Table with that annotation (`last_END_position`).

    :param mt: Input MatrixTable
    :return: Output Table with `last_END_position` annotation
    """
    mt = mt.select_entries("END")

    # Localize entries, so that they can be viewed as an array and scanned over using hl.scan.array_agg
    ht = mt._localize_entries("__entries", "__cols")

    # Compute the position by using hl.scan._prev_nonnull.
    # This was inspired by hl.experimental.densify
    # _prev_non_null is an aggregator that keeps the previous record in memory
    # and updates it with the given value at the row if it's not null (missing)
    # The following code computes the following annotation for each row:
    # 1. Keep a scan of the entries using _prev_nonnull, keeping the start (ht.locus) and end (entry.END) of each ref block  (1.1)
    # 2. For the current row locus, record the start of the block that starts the furthest away,
    #    that is the minimum position in the current scan for any block that overlaps the current locus (2.1)
    ht = ht.select(
        last_END_position=hl.or_else(
            hl.min(  # 2. For the current row locus, record the start of the block that starts the furthest away
                hl.scan.array_agg(
                    lambda entry: hl.scan._prev_nonnull(  # 1. Keep a scan of the entries using _prev_nonnull
                        hl.or_missing(
                            hl.is_defined(
                                entry.END
                            ),  # Update the scan whenever a new ref block is encountered
                            hl.tuple(
                                [  # 1.1 keep the start (ht.locus) and end (entry.END) of each ref block
                                    ht.locus,
                                    entry.END,
                                ]
                            ),
                        )
                    ),
                    ht.__entries,
                ).map(
                    lambda x: hl.or_missing(  # 2.1 get the start position of blocks that overlap the current locus
                        (x[1] >= ht.locus.position) & (x[0].contig == ht.locus.contig),
                        x[0].position,
                    )
                )
            ),
            ht.locus.position,
        )
    )
    return ht.select_globals()
Code example #21
def load_hgnc(hgnc_path, min_partitions=8):
    hgnc = hl.import_table(hgnc_path, min_partitions=min_partitions, missing="")
    hgnc = hgnc.select(
        hgnc_id=hgnc["HGNC ID"],
        symbol=hgnc["Approved symbol"],
        name=hgnc["Approved name"],
        previous_symbols=hgnc["Previous symbols"].split(",").map(lambda s: s.strip()),
        alias_symbols=hgnc["Alias symbols"].split(",").map(lambda s: s.strip()),
        omim_id=hgnc["OMIM ID(supplied by OMIM)"],
        gene_id=hl.or_else(hgnc["Ensembl gene ID"], hgnc["Ensembl ID(supplied by Ensembl)"]),
    )
    hgnc = hgnc.filter(hl.is_defined(hgnc.gene_id)).key_by("gene_id")
    return hgnc
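The gene_id line is a two-way coalesce over the two Ensembl ID columns. For more than two fallbacks, recent Hail versions provide hl.coalesce, which generalizes hl.or_else to any number of arguments (the ID below is made up):

import hail as hl

hl.eval(hl.coalesce(hl.missing(hl.tstr), hl.missing(hl.tstr), 'ENSG00000012048'))
# 'ENSG00000012048': the first defined argument wins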
Code example #22
File: test_api.py Project: shulik7/hail
    def test(self):
        schema = hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tint32, d=hl.tint32, e=hl.tstr,
                            f=hl.tarray(hl.tint32),
                            g=hl.tarray(
                                hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr)),
                            h=hl.tstruct(a=hl.tint32, b=hl.tint32, c=hl.tstr),
                            i=hl.tbool,
                            j=hl.tstruct(x=hl.tint32, y=hl.tint32, z=hl.tstr))

        rows = [{'a': 4, 'b': 1, 'c': 3, 'd': 5,
                 'e': "hello", 'f': [1, 2, 3],
                 'g': [hl.Struct(x=1, y=5, z='banana')],
                 'h': hl.Struct(a=5, b=3, c='winter'),
                 'i': True,
                 'j': hl.Struct(x=3, y=2, z='summer')}]

        kt = hl.Table.parallelize(rows, schema)

        result = convert_struct_to_dict(kt.annotate(
            chisq=hl.chisq(kt.a, kt.b, kt.c, kt.d),
            ctt=hl.ctt(kt.a, kt.b, kt.c, kt.d, 5),
            dict=hl.dict(hl.zip([kt.a, kt.b], [kt.c, kt.d])),
            dpois=hl.dpois(4, kt.a),
            drop=kt.h.drop('b', 'c'),
            exp=hl.exp(kt.c),
            fet=hl.fisher_exact_test(kt.a, kt.b, kt.c, kt.d),
            hwe=hl.hardy_weinberg_p(1, 2, 1),
            index=hl.index(kt.g, 'z'),
            is_defined=hl.is_defined(kt.i),
            is_missing=hl.is_missing(kt.i),
            is_nan=hl.is_nan(hl.float64(kt.a)),
            json=hl.json(kt.g),
            log=hl.log(kt.a, kt.b),
            log10=hl.log10(kt.c),
            or_else=hl.or_else(kt.a, 5),
            or_missing=hl.or_missing(kt.i, kt.j),
            pchisqtail=hl.pchisqtail(kt.a, kt.b),
            pcoin=hl.rand_bool(0.5),
            pnorm=hl.pnorm(0.2),
            pow=2.0 ** kt.b,
            ppois=hl.ppois(kt.a, kt.b),
            qchisqtail=hl.qchisqtail(kt.a, kt.b),
            range=hl.range(0, 5, kt.b),
            rnorm=hl.rand_norm(0.0, kt.b),
            rpois=hl.rand_pois(kt.a),
            runif=hl.rand_unif(kt.b, kt.a),
            select=kt.h.select('c', 'b'),
            sqrt=hl.sqrt(kt.a),
            to_str=[hl.str(5), hl.str(kt.a), hl.str(kt.g)],
            where=hl.cond(kt.i, 5, 10)
        ).take(1)[0])
Code example #23
def calculate_new_intervals(ht, n, reference_genome):
    """takes a table, keyed by ['locus', ...] and produces a list of intervals suitable
    for repartitioning a combiner matrix table

    Parameters
    ----------
    ht : :class:`.Table`
        Table / Rows Table to compute new intervals for
    n : :obj:`int`
        Number of rows each partition should have (the last partition may be smaller)
    reference_genome: :obj:`str` or :class:`.ReferenceGenome`, optional
        Reference genome to use.

    Returns
    -------
    :obj:`List[Interval]`
    """
    assert list(ht.key) == ['locus']
    assert ht.locus.dtype == hl.tlocus(reference_genome=reference_genome)
    end = hl.Locus(reference_genome.contigs[-1],
                   reference_genome.lengths[reference_genome.contigs[-1]],
                   reference_genome=reference_genome)

    n_rows = ht.count()

    if n_rows == 0:
        raise ValueError('empty table!')

    ht = ht.select()
    ht = ht.annotate(x=hl.scan.count())
    ht = ht.annotate(y=ht.x + 1)
    ht = ht.filter((ht.x // n != ht.y // n) | (ht.x == (n_rows - 1)))
    ht = ht.select()
    ht = ht.annotate(start=hl.or_else(
        hl.scan._prev_nonnull(
            hl.locus_from_global_position(ht.locus.global_position() + 1,
                                          reference_genome=reference_genome)),
        hl.locus_from_global_position(0, reference_genome=reference_genome)))
    ht = ht.key_by()
    ht = ht.select(
        interval=hl.interval(start=ht.start, end=ht.locus, includes_end=True))

    intervals = ht.aggregate(hl.agg.collect(ht.interval))

    last_st = hl.eval(
        hl.locus_from_global_position(
            hl.literal(intervals[-1].end).global_position() + 1,
            reference_genome=reference_genome))
    interval = hl.Interval(start=last_st, end=end, includes_end=True)
    intervals.append(interval)
    return intervals
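A hypothetical call site (the path and row-count target are made up). Although the docstring allows a string, the body reads reference_genome.contigs, so passing a ReferenceGenome object is the safe choice; the resulting intervals can then be fed to the experimental _intervals argument of hl.read_matrix_table:

import hail as hl

path = 'gs://my-bucket/combined.mt'  # hypothetical combiner output
ht = hl.read_matrix_table(path).rows().key_by('locus').select()
intervals = calculate_new_intervals(ht, n=50000,
                                    reference_genome=hl.get_reference('GRCh38'))
mt = hl.read_matrix_table(path, _intervals=intervals)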
Code example #24
def main(args):
    hl.init(master=f'local[{args.n_threads}]',
            log=hl.utils.timestamp_path(os.path.join(tempfile.gettempdir(), 'extract_vcf'), suffix='.log'),
            default_reference=args.reference)

    sys.path.append('/')
    add_args = []
    if args.additional_args is not None:
        add_args = args.additional_args.split(',')
    load_module = importlib.import_module(args.load_module)
    mt = getattr(load_module, args.load_mt_function)(*add_args)

    if args.gene_map_ht_path is None:
        interval = [hl.parse_locus_interval(args.interval)]
    else:
        gene_ht = hl.read_table(args.gene_map_ht_path)
        if args.gene is not None:
            gene_ht = gene_ht.filter(gene_ht.gene_symbol == args.gene)
            interval = gene_ht.aggregate(hl.agg.take(gene_ht.interval, 1), _localize=False)
        else:
            interval = [hl.parse_locus_interval(args.interval)]
            gene_ht = hl.filter_intervals(gene_ht, interval)

        gene_ht = gene_ht.filter(hl.set(args.groups.split(',')).contains(gene_ht.annotation))
        gene_ht.select(group=gene_ht.gene_id + '_' + gene_ht.gene_symbol + '_' + gene_ht.annotation, variant=hl.delimit(gene_ht.variants, '\t')
                       ).key_by().drop('start').export(args.group_output_file, header=False)
        # TODO: possible minor optimization: filter output VCF to only variants in `gene_ht.variants`

    if not args.no_adj:
        mt = mt.filter_entries(mt.adj)

    mt = hl.filter_intervals(mt, interval)

    if not args.input_bgen:
        mt = mt.select_entries('GT')
        mt = mt.filter_rows(hl.agg.count_where(mt.GT.is_non_ref()) > 0)
    mt = mt.annotate_rows(rsid=mt.locus.contig + ':' + hl.str(mt.locus.position) + '_' + mt.alleles[0] + '/' + mt.alleles[1])

    if args.callrate_filter:
        mt = mt.filter_rows(hl.agg.fraction(hl.is_defined(mt.GT)) >= args.callrate_filter)

    if args.export_bgen:
        if not args.input_bgen:
            mt = mt.annotate_entries(GT=hl.if_else(mt.GT.is_haploid(), hl.call(mt.GT[0], mt.GT[0]), mt.GT))
            mt = gt_to_gp(mt)
            mt = impute_missing_gp(mt, mean_impute=args.mean_impute_missing)
        hl.export_bgen(mt, args.output_file, gp=mt.GP, varid=mt.rsid)
    else:
        mt = mt.annotate_entries(GT=hl.or_else(mt.GT, hl.call(0, 0)))
        # Note: no mean-imputation for VCF
        hl.export_vcf(mt, args.output_file)
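The final branch fills missing genotypes with an explicit hom-ref call, since export_vcf would otherwise emit them as missing (./.); hl.or_else works on calls like on any other expression:

import hail as hl

hl.eval(hl.or_else(hl.missing(hl.tcall), hl.call(0, 0)))
# Call(alleles=[0, 0], phased=False)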
Code example #25
File: sparse_split_multi.py Project: jigold/hail
            def with_pl(pl):
                new_exprs = {}
                dropped_fields = ['LA']
                if 'LGT' in fields:
                    new_exprs['GT'] = hl.downcode(old_entry.LGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LGT')
                if 'LPGT' in fields:
                    new_exprs['PGT'] = hl.downcode(old_entry.LPGT, hl.or_else(local_a_index, hl.len(old_entry.LA)))
                    dropped_fields.append('LPGT')
                if 'LAD' in fields:
                    new_exprs['AD'] = hl.or_missing(
                        hl.is_defined(old_entry.LAD),
                        [old_entry.LAD[0], hl.or_else(old_entry.LAD[local_a_index], 0)]) # second entry zeroed for lack of non-ref AD
                    dropped_fields.append('LAD')
                if 'LPL' in fields:
                    new_exprs['PL'] = pl
                    if 'GQ' in fields:
                        new_exprs['GQ'] = hl.or_else(hl.gq_from_pl(pl), old_entry.GQ)

                    dropped_fields.append('LPL')

                return hl.cond(
                    hl.len(ds.alleles) == 1,
                    old_entry.annotate(**{f[1:]: old_entry[f]
                                          for f in ['LGT', 'LPGT', 'LAD', 'LPL']
                                          if f in fields}).drop(*dropped_fields),
                    old_entry.annotate(**new_exprs).drop(*dropped_fields))
Code example #26
def project_pcs_relateds(mt_ldpruned, mt, covar_pc_num):
    """
    Takes an LD-pruned matrix table, calculates PCs, and projects those PCs back to related individuals included in mt
    :param mt_ldpruned: matrix table with relatives removed, maf and ld pruned
    :param mt: matrix table with relatives included
    :param covar_pc_num: Number of principal components as covariates to calculate
    :return: returns matrix table with relatives, with PCs annotated
    """
    logging.info('Calculating principal components, annotating main dataset.')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt_ldpruned.GT, k=covar_pc_num, compute_loadings=True)

    # Project PCs to related individuals
    # mt of related individuals only, not pop outliers or failing samples QC
    related_mt = mt.filter_cols(
        (mt.related_to_remove == True) & (mt.pop_outlier_sample == False) &
        (hl.len(mt.failing_samples_qc) == 0),
        keep=True)
    mt_ldpruned = mt_ldpruned.annotate_rows(
        pca_af=hl.agg.mean(mt_ldpruned.GT.n_alt_alleles()) / 2)
    mtrows = mt_ldpruned.rows()
    loadings = loadings.annotate(pca_af=mtrows[loadings.locus,
                                               loadings.alleles].pca_af)
    related_scores = pc_project(related_mt, loadings)

    # Add pcs as annotations to main table
    mt = mt.annotate_cols(**{
        'pc' + str(k + 1): scores[mt.s].scores[k]
        for k in range(covar_pc_num)
    })
    # Explanation: for k principal components in range 0 to covar_pc_num-1,
    # make pc k+1 (to start at pc1 instead of pc0) be the corresponding score (keyed by mt.s) from the table scores

    # Add pcs for related individuals
    mt = mt.annotate_cols(
        **{
            'pc' + str(k + 1): hl.or_else(mt['pc' + str(k + 1)],
                                          related_scores[mt.s].scores[k])
            for k in range(covar_pc_num)
        })
    # Explanation: for k principal components in range from 0 to (covar_pc_num-1)
    # give either the existing pcX, or if missing give the corresponding score (keyed by mt.s)
    # from the table related_scores

    return mt
Code example #27
def _collect_scatter_plot_data(x: hl.expr.NumericExpression,
                               y: hl.expr.NumericExpression,
                               fields: Dict[str, hl.expr.Expression] = None,
                               n_divisions: int = None,
                               missing_label: str = 'NA') -> pd.DataFrame:

    expressions = dict()
    if fields is not None:
        expressions.update({
            k: hl.or_else(v, missing_label) if isinstance(
                v, hl.expr.StringExpression) else v
            for k, v in fields.items()
        })

    if n_divisions is None:
        collect_expr = hl.struct(_x=x, _y=y, **expressions)
        plot_data = [
            point for point in collect_expr.collect()
            if point._x is not None and point._y is not None
        ]
        source_pd = pd.DataFrame(plot_data)
    else:
        if not all(
                isinstance(v, hl.expr.StringExpression)
                for v in expressions.values()):
            print(
                "WARN: only string expressions are supported with `n_divisions` options at this time. Converting to String"
            )
            expressions = {
                k:
                hl.str(v) if not isinstance(v, hl.expr.StringExpression) else v
                for k, v in expressions.items()
            }
        agg_f = x._aggregation_method()
        res = agg_f(
            hl.agg.downsample(
                x,
                y,
                label=list(expressions.values()) if expressions else None,
                n_divisions=n_divisions))
        source_pd = pd.DataFrame([
            dict(_x=point[0], _y=point[1], **dict(zip(expressions, point[2])))
            for point in res
        ])

    return source_pd
Code example #28
def get_expr_for_vep_gene_ids_set(vep_transcript_consequences_root,
                                  only_coding_genes=False):
    """Expression to compute the set of gene ids in VEP annotations for this variant.

    Args:
        vep_transcript_consequences_root (ArrayExpression): VEP transcript_consequences root in the struct
        only_coding_genes (bool): If set to True, non-coding genes will be excluded.
    Return:
        SetExpression: expression
    """

    expr = vep_transcript_consequences_root

    if only_coding_genes:
        expr = expr.filter(
            lambda c: hl.or_else(c.biotype, "") == "protein_coding")

    return hl.set(expr.map(lambda c: c.gene_id))
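A self-contained sketch with a fabricated VEP-like struct (field values are made up); the transcript with a missing biotype is excluded when only_coding_genes=True:

import hail as hl

ht = hl.utils.range_table(1).annotate(
    vep=hl.struct(transcript_consequences=[
        hl.struct(biotype='protein_coding', gene_id='ENSG01'),
        hl.struct(biotype=hl.missing(hl.tstr), gene_id='ENSG02'),
    ]))
ht = ht.annotate(gene_ids=get_expr_for_vep_gene_ids_set(
    ht.vep.transcript_consequences, only_coding_genes=True))
ht.show()  # gene_ids contains only "ENSG01"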
Code example #29
File: create_fam.py Project: edenkal13/gnomad_qc
def run_infer_families() -> hl.Pedigree:
    logger.info("Inferring families")
    ped = infer_families(get_relatedness_annotated_ht(), sex.ht(),
                         duplicates.ht())

    # Remove all trios containing any QC-filtered sample
    meta_ht = meta.ht()
    filtered_samples = meta_ht.aggregate(
        hl.agg.filter(
            (hl.len(meta_ht.qc_metrics_filters) > 0)
            | hl.or_else(hl.len(meta_ht.hard_filters) > 0, False),
            hl.agg.collect_as_set(meta_ht.s),
        ))

    return hl.Pedigree(trios=[
        trio for trio in ped.trios
        if trio.s not in filtered_samples and trio.pat_id not in
        filtered_samples and trio.mat_id not in filtered_samples
    ])
Code example #30
def all_and_leave_one_out(x,
                          pop_array,
                          all_f=hl.sum,
                          loo_f=lambda i, x: hl.sum(x) - hl.or_else(x[i], 0)):
    """
    Applies a function to an input array for all populations, and for each leave-one-out population.

    :param x: Input array
    :param pop_array: Population array
    :param all_f: Function for all populations. It takes the input array and returns a new value
    :param loo_f: Function for each leave-one-out population. It takes the index of the
                  left-out population and the input array, and returns a new value.
    ...
    :return: Array of the all-population value followed by one value per leave-one-out population.
    :rtype: ArrayExpression
    """
    arr = hl.array([all_f(x)])
    arr = arr.extend(hl.map(lambda i: loo_f(i, x),
                            hl.range(hl.len(pop_array))))
    return hl.or_missing(hl.any(hl.is_defined, x), arr)
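A worked evaluation, assuming three populations with per-population values [1, 2, 3]: the first element is the all-population total, followed by one leave-one-out total per population:

import hail as hl

x = hl.array([1, 2, 3])
pops = hl.array(['afr', 'amr', 'eas'])
hl.eval(all_and_leave_one_out(x, pops))  # [6, 5, 4, 3]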
Code example #31
def annotate_relateds(mt, relateds_to_remove_file):
    """
    Annotates a matrix table, given a list of individuals (single column, no header) who are related and should be
    removed from analyses requiring independent samples. Does not remove the individuals; it simply marks them as
    True in the column field 'related_to_remove'.

    :param mt: matrix table to annotate
    :param relateds_to_remove_file: file containing IDs of individuals to remove from analyses needing independent cases
    :return: returns annotated matrix table with new column variable 'related_to_remove'
    """
    # Import list of related individuals to remove, generated by python code/networkx
    relatives = hl.import_table(relateds_to_remove_file, no_header=True)
    relatives = relatives.annotate(related_to_remove=True)
    relatives = relatives.key_by('f0')

    # Annotate matrix table with relatives to remove
    mt = mt.annotate_cols(related_to_remove=relatives[mt.s].related_to_remove)
    mt = mt.annotate_cols(
        related_to_remove=hl.or_else(mt.related_to_remove, False))

    return mt
Code example #32
def liftover_annotations(gnomad_37_path, gnomad_38_path,
                         annotated_gnomad_38_path):
    """
    The 38 liftover of gnomAD is stripped of all global and row annotations. This function
    annotates the 38 liftover with the original 37 annotations as the combined reference
    data script needs them.
    :param gnomad_37_path: path to 37 version of gnomAD for data type
    :param gnomad_38_path: path to 38 version of gnomAD for data type
    :param annotated_gnomad_38_path: path to annotated 38 version of gnomAD for data type
    :return: the annotated GRCh38 Table (also written to annotated_gnomad_38_path)
    """
    ht_37 = hl.read_table(gnomad_37_path)
    ht_38 = hl.read_table(gnomad_38_path)
    ht_38 = ht_38.annotate(
        original_alleles=hl.or_else(ht_38.original_alleles, ht_38.alleles))
    ht_38 = ht_38.key_by('original_locus', 'original_alleles')
    ht_38 = ht_38.annotate(**ht_37[ht_38.key])
    ht_38 = ht_38.annotate_globals(**ht_37.index_globals())
    ht_38 = ht_38.key_by('locus', 'alleles')
    ht_38.write(annotated_gnomad_38_path, overwrite=True)
    return ht_38
Code example #33
File: plots.py Project: jigold/hail
def _collect_scatter_plot_data(
        x: Tuple[str, NumericExpression],
        y: Tuple[str, NumericExpression],
        fields: Dict[str, Expression] = None,
        n_divisions: int = None,
        missing_label: str = 'NA'
) -> pd.DataFrame:

    expressions = dict()
    if fields is not None:
        expressions.update({k: hail.or_else(v, missing_label) if isinstance(v, StringExpression) else v for k, v in fields.items()})

    if n_divisions is None:
        collect_expr = hail.struct(**dict((k,v) for k,v in (x,y)), **expressions)
        plot_data = [point for point in collect_expr.collect() if point[x[0]] is not None and point[y[0]] is not None]
        source_pd = pd.DataFrame(plot_data)
    else:
        # FIXME: remove the type conversion logic if/when downsample supports continuous values for labels
        # Save all numeric types to cast in DataFrame
        numeric_expr = {k: 'int32' for k,v in expressions.items() if isinstance(v, Int32Expression)}
        numeric_expr.update({k: 'int64' for k,v in expressions.items() if isinstance(v, Int64Expression)})
        numeric_expr.update({k: 'float32' for k, v in expressions.items() if isinstance(v, Float32Expression)})
        numeric_expr.update({k: 'float64' for k, v in expressions.items() if isinstance(v, Float64Expression)})

        # Cast non-string types to string
        expressions = {k: hail.str(v) if not isinstance(v, StringExpression) else v for k,v in expressions.items()}

        agg_f = x[1]._aggregation_method()
        res = agg_f(hail.agg.downsample(x[1], y[1], label=list(expressions.values()) if expressions else None, n_divisions=n_divisions))
        source_pd = pd.DataFrame([
            dict(
                **{x[0]: point[0], y[0]: point[1]},
                **(dict(zip(expressions, point[2])) if point[2] is not None else {})
            ) for point in res
        ])
        source_pd = source_pd.astype(numeric_expr, copy=False)

    return source_pd
Code example #34
File: vcf_combiner.py Project: jigold/hail
def reannotate(mt, gatk_ht, summ_ht):
    """Re-annotate a sparse MT with annotations from certain GATK tools

    `gatk_ht` should be a table from the rows of a VCF, with `info` having at least
    the following fields.  Be aware that fields not present in this list will
    be dropped.
    ```
        struct {
            AC: array<int32>,
            AF: array<float64>,
            AN: int32,
            BaseQRankSum: float64,
            ClippingRankSum: float64,
            DP: int32,
            FS: float64,
            MQ: float64,
            MQRankSum: float64,
            MQ_DP: int32,
            NEGATIVE_TRAIN_SITE: bool,
            POSITIVE_TRAIN_SITE: bool,
            QD: float64,
            QUALapprox: int32,
            RAW_MQ: float64,
            ReadPosRankSum: float64,
            SB_TABLE: array<int32>,
            SOR: float64,
            VQSLOD: float64,
            VarDP: int32,
            culprit: str
        }
    ```
    `summ_ht` should be the output of :func:`.summarize` as a rows table.

    Note
    ----
    You will not be able to run :func:`.combine_gvcfs` with the output of this
    function.
    """
    def check(ht):
        keys = list(ht.key)
        if keys[0] != 'locus':
            raise TypeError(f'table inputs must have first key "locus", found {keys}')
        if keys != ['locus']:
            return hl.Table(TableKeyBy(ht._tir, ['locus'], is_sorted=True))
        return ht

    gatk_ht, summ_ht = [check(ht) for ht in (gatk_ht, summ_ht)]
    return mt.annotate_rows(
        info=hl.rbind(
            gatk_ht[mt.locus].info, summ_ht[mt.locus].info,
            lambda ginfo, hinfo: hl.struct(
                AC=hl.or_else(hinfo.AC, ginfo.AC),
                AF=hl.or_else(hinfo.AF, ginfo.AF),
                AN=hl.or_else(hinfo.AN, ginfo.AN),
                BaseQRankSum=hl.or_else(hinfo.BaseQRankSum, ginfo.BaseQRankSum),
                ClippingRankSum=hl.or_else(hinfo.ClippingRankSum, ginfo.ClippingRankSum),
                DP=hl.or_else(hinfo.DP, ginfo.DP),
                FS=ginfo.FS,
                MQ=hl.or_else(hinfo.MQ, ginfo.MQ),
                MQRankSum=hl.or_else(hinfo.MQRankSum, ginfo.MQRankSum),
                MQ_DP=hl.or_else(hinfo.MQ_DP, ginfo.MQ_DP),
                NEGATIVE_TRAIN_SITE=ginfo.NEGATIVE_TRAIN_SITE,
                POSITIVE_TRAIN_SITE=ginfo.POSITIVE_TRAIN_SITE,
                QD=ginfo.QD,
                QUALapprox=hl.or_else(hinfo.QUALapprox, ginfo.QUALapprox),
                RAW_MQ=hl.or_else(hinfo.RAW_MQ, ginfo.RAW_MQ),
                ReadPosRankSum=hl.or_else(hinfo.ReadPosRankSum, ginfo.ReadPosRankSum),
                SB_TABLE=hl.or_else(hinfo.SB_TABLE, ginfo.SB_TABLE),
                SOR=ginfo.SOR,
                VQSLOD=ginfo.VQSLOD,
                VarDP=hl.or_else(hinfo.VarDP, ginfo.VarDP),
                culprit=ginfo.culprit,
            )),
        qual=gatk_ht[mt.locus].qual,
        filters=gatk_ht[mt.locus].filters,
    )
Code example #35
File: ld_score_regression.py Project: jigold/hail
def ld_score_regression(weight_expr,
                        ld_score_expr,
                        chi_sq_exprs,
                        n_samples_exprs,
                        n_blocks=200,
                        two_step_threshold=30,
                        n_reference_panel_variants=None) -> Table:
    r"""Estimate SNP-heritability and level of confounding biases from
    GWAS summary statistics.

    Given one or more sets of genome-wide association study (GWAS)
    summary statistics, :func:`.ld_score_regression` estimates the heritability
    of each trait and the level of confounding bias present in
    the underlying studies by regressing chi-squared statistics on LD scores,
    leveraging the model:

    .. math::

        \mathrm{E}[\chi_j^2] = 1 + Na + \frac{Nh_g^2}{M}l_j

    *  :math:`\mathrm{E}[\chi_j^2]` is the expected chi-squared statistic
       for variant :math:`j` resulting from a test of association between
       variant :math:`j` and a trait.
    *  :math:`l_j = \sum_{k} r_{jk}^2` is the LD score of variant
       :math:`j`, calculated as the sum of squared correlation coefficients
       between variant :math:`j` and nearby variants. See :func:`.ld_score`
       for further details.
    *  :math:`a` captures the contribution of confounding biases, such as
       cryptic relatedness and uncontrolled population structure, to the
       association test statistic.
    *  :math:`h_g^2` is the SNP-heritability, or the proportion of variation
       in the trait explained by the effects of variants included in the
       regression model above.
    *  :math:`M` is the number of variants used to estimate :math:`h_g^2`.
    *  :math:`N` is the number of samples in the underlying association study.

    For more details on the method implemented in this function, see:

    * `LD Score regression distinguishes confounding from polygenicity in genome-wide association studies (Bulik-Sullivan et al, 2015) <https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4495769/>`__

    Examples
    --------

    Run the method on a matrix table of summary statistics, where the rows
    are variants and the columns are different phenotypes:

    >>> mt_gwas = hl.read_matrix_table('data/ld_score_regression.sumstats.mt')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=mt_gwas['ld_score'],
    ...     ld_score_expr=mt_gwas['ld_score'],
    ...     chi_sq_exprs=mt_gwas['chi_squared'],
    ...     n_samples_exprs=mt_gwas['n'])


    Run the method on a table with summary statistics for a single
    phenotype:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=ht_gwas['chi_squared_50_irnt'],
    ...     n_samples_exprs=ht_gwas['n_50_irnt'])

    Run the method on a table with summary statistics for multiple
    phenotypes:

    >>> ht_gwas = hl.read_table('data/ld_score_regression.sumstats.ht')
    >>> ht_results = hl.experimental.ld_score_regression(
    ...     weight_expr=ht_gwas['ld_score'],
    ...     ld_score_expr=ht_gwas['ld_score'],
    ...     chi_sq_exprs=[ht_gwas['chi_squared_50_irnt'],
    ...                   ht_gwas['chi_squared_20160']],
    ...     n_samples_exprs=[ht_gwas['n_50_irnt'],
    ...                      ht_gwas['n_20160']])

    Notes
    -----
    The ``exprs`` provided as arguments to :func:`.ld_score_regression`
    must all be from the same object, either a :class:`Table` or a
    :class:`MatrixTable`.

    **If the arguments originate from a table:**

    *  The table must be keyed by fields ``locus`` of type
       :class:`.tlocus` and ``alleles``, a :py:data:`.tarray` of
       :py:data:`.tstr` elements.
    *  ``weight_expr``, ``ld_score_expr``, ``chi_sq_exprs``, and
       ``n_samples_exprs`` must be row-indexed fields.
    *  The number of expressions passed to ``n_samples_exprs`` must be
       equal to one or the number of expressions passed to
       ``chi_sq_exprs``. If just one expression is passed to
       ``n_samples_exprs``, that sample size expression is assumed to
       apply to all sets of statistics passed to ``chi_sq_exprs``.
       Otherwise, the expressions passed to ``chi_sq_exprs`` and
       ``n_samples_exprs`` are matched by index.
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have generic :obj:`int` values
       ``0``, ``1``, etc. corresponding to the ``0th``, ``1st``, etc.
       expressions passed to the ``chi_sq_exprs`` argument.

    **If the arguments originate from a matrix table:**

    *  The dimensions of the matrix table must be variants
       (rows) by phenotypes (columns).
    *  The rows of the matrix table must be keyed by fields
       ``locus`` of type :class:`.tlocus` and ``alleles``,
       a :py:data:`.tarray` of :py:data:`.tstr` elements.
    *  The columns of the matrix table must be keyed by a field
       of type :py:data:`.tstr` that uniquely identifies phenotypes
       represented in the matrix table. The column key must be a single
       expression; compound keys are not accepted.
    *  ``weight_expr`` and ``ld_score_expr`` must be row-indexed
       fields.
    *  ``chi_sq_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  ``n_samples_exprs`` must be a single entry-indexed field
       (not a list of fields).
    *  The ``phenotype`` field that keys the table returned by
       :func:`.ld_score_regression` will have values corresponding to the
       column keys of the input matrix table.

    This function returns a :class:`Table` with one row per set of summary
    statistics passed to the ``chi_sq_exprs`` argument. The following
    row-indexed fields are included in the table:

    *  **phenotype** (:py:data:`.tstr`) -- The name of the phenotype. The
       returned table is keyed by this field. See the notes above for
       details on the possible values of this field.
    *  **mean_chi_sq** (:py:data:`.tfloat64`) -- The mean chi-squared
       test statistic for the given phenotype.
    *  **intercept** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          intercept :math:`1 + Na`.
       -  **standard_error**  (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    *  **snp_heritability** (`Struct`) -- Contains fields:

       -  **estimate** (:py:data:`.tfloat64`) -- A point estimate of the
          SNP-heritability :math:`h_g^2`.
       -  **standard_error** (:py:data:`.tfloat64`) -- An estimate of
          the standard error of this point estimate.

    Warning
    -------
    :func:`.ld_score_regression` considers only the rows for which both row
    fields ``weight_expr`` and ``ld_score_expr`` are defined. Rows with missing
    values in either field are removed prior to fitting the LD score
    regression model.

    Parameters
    ----------
    weight_expr : :class:`.Float64Expression`
                  Row-indexed expression for the LD scores used to derive
                  variant weights in the model.
    ld_score_expr : :class:`.Float64Expression`
                    Row-indexed expression for the LD scores used as covariates
                    in the model.
    chi_sq_exprs : :class:`.Float64Expression` or :obj:`list` of
                        :class:`.Float64Expression`
                        One or more row-indexed (if table) or entry-indexed
                        (if matrix table) expressions for chi-squared
                        statistics resulting from genome-wide association
                        studies.
    n_samples_exprs : :class:`.NumericExpression` or :obj:`list` of
                     :class:`.NumericExpression`
                     One or more row-indexed (if table) or entry-indexed
                     (if matrix table) expressions indicating the number of
                     samples used in the studies that generated the test
                     statistics supplied to ``chi_sq_exprs``.
    n_blocks : :obj:`int`
               The number of blocks used in the jackknife approach to
               estimating standard errors.
    two_step_threshold : :obj:`int`
                         Variants with chi-squared statistics greater than this
                         value are excluded in the first step of the two-step
                         procedure used to fit the model.
    n_reference_panel_variants : :obj:`int`, optional
                                 Number of variants used to estimate the
                                 SNP-heritability :math:`h_g^2`.

    Returns
    -------
    :class:`.Table`
        Table keyed by ``phenotype`` with intercept and heritability estimates
        for each phenotype passed to the function."""

    chi_sq_exprs = wrap_to_list(chi_sq_exprs)
    n_samples_exprs = wrap_to_list(n_samples_exprs)

    assert ((len(chi_sq_exprs) == len(n_samples_exprs)) or
            (len(n_samples_exprs) == 1))
    __k = 2  # number of covariates, including intercept

    ds = chi_sq_exprs[0]._indices.source

    analyze('ld_score_regression/weight_expr',
            weight_expr,
            ds._row_indices)
    analyze('ld_score_regression/ld_score_expr',
            ld_score_expr,
            ds._row_indices)

    # format input dataset
    if isinstance(ds, MatrixTable):
        if len(chi_sq_exprs) != 1:
            raise ValueError("""Only one chi_sq_expr allowed if originating
                from a matrix table.""")
        if len(n_samples_exprs) != 1:
            raise ValueError("""Only one n_samples_expr allowed if
                originating from a matrix table.""")

        col_key = list(ds.col_key)
        if len(col_key) != 1:
            raise ValueError("""Matrix table must be keyed by a single
                phenotype field.""")

        analyze('ld_score_regression/chi_squared_expr',
                chi_sq_exprs[0],
                ds._entry_indices)
        analyze('ld_score_regression/n_samples_expr',
                n_samples_exprs[0],
                ds._entry_indices)

        ds = ds._select_all(row_exprs={'__locus': ds.locus,
                                       '__alleles': ds.alleles,
                                       '__w_initial': weight_expr,
                                       '__w_initial_floor': hl.max(weight_expr,
                                                                   1.0),
                                       '__x': ld_score_expr,
                                       '__x_floor': hl.max(ld_score_expr,
                                                           1.0)},
                            row_key=['__locus', '__alleles'],
                            col_exprs={'__y_name': ds[col_key[0]]},
                            col_key=['__y_name'],
                            entry_exprs={'__y': chi_sq_exprs[0],
                                         '__n': n_samples_exprs[0]})
        ds = ds.annotate_entries(**{'__w': ds.__w_initial})

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    else:
        assert isinstance(ds, Table)
        for y in chi_sq_exprs:
            analyze('ld_score_regression/chi_squared_expr', y, ds._row_indices)
        for n in n_samples_exprs:
            analyze('ld_score_regression/n_samples_expr', n, ds._row_indices)

        ys = ['__y{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ws = ['__w{:}'.format(i) for i, _ in enumerate(chi_sq_exprs)]
        ns = ['__n{:}'.format(i) for i, _ in enumerate(n_samples_exprs)]

        ds = ds.select(**dict(**{'__locus': ds.locus,
                                 '__alleles': ds.alleles,
                                 '__w_initial': weight_expr,
                                 '__x': ld_score_expr},
                              **{y: chi_sq_exprs[i]
                                 for i, y in enumerate(ys)},
                              **{w: weight_expr for w in ws},
                              **{n: n_samples_exprs[i]
                                 for i, n in enumerate(ns)}))
        ds = ds.key_by(ds.__locus, ds.__alleles)

        table_tmp_file = new_temp_file()
        ds.write(table_tmp_file)
        ds = hl.read_table(table_tmp_file)

        hts = [ds.select(**{'__w_initial': ds.__w_initial,
                            '__w_initial_floor': hl.max(ds.__w_initial,
                                                        1.0),
                            '__x': ds.__x,
                            '__x_floor': hl.max(ds.__x, 1.0),
                            '__y_name': i,
                            '__y': ds[ys[i]],
                            '__w': ds[ws[i]],
                            '__n': hl.int(ds[ns[i]])})
               for i, y in enumerate(ys)]

        mts = [ht.to_matrix_table(row_key=['__locus',
                                           '__alleles'],
                                  col_key=['__y_name'],
                                  row_fields=['__w_initial',
                                              '__w_initial_floor',
                                              '__x',
                                              '__x_floor'])
               for ht in hts]

        ds = mts[0]
        for i in range(1, len(ys)):
            ds = ds.union_cols(mts[i])
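        # Each single-phenotype table became a one-column matrix table above;
        # union_cols stitches them into a single variants-by-phenotypes matrix
        # table so both input shapes share the code path below.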

        ds = ds.filter_rows(hl.is_defined(ds.__locus) &
                            hl.is_defined(ds.__alleles) &
                            hl.is_defined(ds.__w_initial) &
                            hl.is_defined(ds.__x))

    mt_tmp_file1 = new_temp_file()
    ds.write(mt_tmp_file1)
    mt = hl.read_matrix_table(mt_tmp_file1)

    if not n_reference_panel_variants:
        M = mt.count_rows()
    else:
        M = n_reference_panel_variants

    # block variants for each phenotype
    n_phenotypes = mt.count_cols()

    mt = mt.annotate_entries(__in_step1=(hl.is_defined(mt.__y) &
                                         (mt.__y < two_step_threshold)),
                             __in_step2=hl.is_defined(mt.__y))
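    # Two-step procedure (Bulik-Sullivan et al. 2015): step 1 estimates the
    # intercept using only variants with chi-squared below two_step_threshold;
    # step 2 fixes that intercept and re-estimates the slope on all variants
    # with defined statistics.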

    mt = mt.annotate_cols(__col_idx=hl.int(hl.scan.count()),
                          __m_step1=hl.agg.count_where(mt.__in_step1),
                          __m_step2=hl.agg.count_where(mt.__in_step2))

    col_keys = list(mt.col_key)

    ht = mt.localize_entries(entries_array_field_name='__entries',
                             columns_array_field_name='__cols')

    ht = ht.annotate(__entries=hl.rbind(
        hl.scan.array_agg(
            lambda entry: hl.scan.count_where(entry.__in_step1),
            ht.__entries),
        lambda step1_indices: hl.map(
            lambda i: hl.rbind(
                hl.int(hl.or_else(step1_indices[i], 0)),
                ht.__cols[i].__m_step1,
                ht.__entries[i],
                lambda step1_idx, m_step1, entry: hl.rbind(
                    hl.map(
                        lambda j: hl.int(hl.floor(j * (m_step1 / n_blocks))),
                        hl.range(0, n_blocks + 1)),
                    lambda step1_separators: hl.rbind(
                        hl.set(step1_separators).contains(step1_idx),
                        hl.sum(
                            hl.map(
                                lambda s1: step1_idx >= s1,
                                step1_separators)) - 1,
                        lambda is_separator, step1_block: entry.annotate(
                            __step1_block=step1_block,
                            __step2_block=hl.cond(~entry.__in_step1 & is_separator,
                                                  step1_block - 1,
                                                  step1_block))))),
            hl.range(0, hl.len(ht.__entries)))))
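    # The scan above assigns each variant to a contiguous jackknife block per
    # phenotype: running counts of step-1 variants are cut at n_blocks + 1
    # evenly spaced separators, and a variant outside step 1 that lands
    # exactly on a separator is shifted back one block so the step-2 blocks
    # stay contiguous as well.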

    mt = ht._unlocalize_entries('__entries', '__cols', col_keys)

    mt_tmp_file2 = new_temp_file()
    mt.write(mt_tmp_file2)
    mt = hl.read_matrix_table(mt_tmp_file2)
    
    # initial coefficient estimates
    mt = mt.annotate_cols(__initial_betas=[
        1.0, (hl.agg.mean(mt.__y) - 1.0) / hl.agg.mean(mt.__x)])
    mt = mt.annotate_cols(__step1_betas=mt.__initial_betas,
                          __step2_betas=mt.__initial_betas)

    # step 1 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step1,
            1.0/(mt.__w_initial_floor * 2.0 * (mt.__step1_betas[0] +
                                               mt.__step1_betas[1] *
                                               mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step1_betas=hl.agg.filter(
            mt.__in_step1,
            hl.agg.linreg(y=mt.__y,
                          x=[1.0, mt.__x],
                          weight=mt.__w).beta))
        mt = mt.annotate_cols(__step1_h2=hl.max(hl.min(
            mt.__step1_betas[1] * M / hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step1_betas=[
            mt.__step1_betas[0],
            mt.__step1_h2 * hl.agg.mean(mt.__n) / M])
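    # The weights approximate inverse variance: under the model the variance
    # of a chi-squared statistic scales with 2 * (b0 + b1 * l_j)^2, and the
    # (floored) initial LD score weight downweights variants whose signal is
    # duplicated across correlated neighbors. The heritability implied by the
    # slope is clamped to [0, 1] between iterations.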

    # step 1 block jackknife
    mt = mt.annotate_cols(__step1_block_betas=[
        hl.agg.filter((mt.__step1_block != i) & mt.__in_step1,
                      hl.agg.linreg(y=mt.__y,
                                    x=[1.0, mt.__x],
                                    weight=mt.__w).beta)
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step1_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step1_betas - (n_blocks - 1) * x,
        mt.__step1_block_betas))

    mt = mt.annotate_cols(
        __step1_jackknife_mean=hl.map(
            lambda i: hl.mean(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected)),
            hl.range(0, __k)),
        __step1_jackknife_variance=hl.map(
            lambda i: (hl.sum(
                hl.map(lambda x: x[i]**2,
                       mt.__step1_block_betas_bias_corrected)) -
                       hl.sum(
                hl.map(lambda x: x[i],
                       mt.__step1_block_betas_bias_corrected))**2 /
                       n_blocks) /
            (n_blocks - 1) / n_blocks,
            hl.range(0, __k)))
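    # Bias-corrected pseudovalues: n_blocks * full estimate minus
    # (n_blocks - 1) * leave-one-block-out estimate; the variance of their
    # mean estimates the sampling variance of each coefficient.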

    # step 2 iteratively reweighted least squares
    for i in range(3):
        mt = mt.annotate_entries(__w=hl.cond(
            mt.__in_step2,
            1.0/(mt.__w_initial_floor *
                 2.0 * (mt.__step2_betas[0] +
                        mt.__step2_betas[1] *
                        mt.__x_floor)**2),
            0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            hl.agg.filter(mt.__in_step2,
                          hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                        x=[mt.__x],
                                        weight=mt.__w).beta[0])])
        mt = mt.annotate_cols(__step2_h2=hl.max(hl.min(
            mt.__step2_betas[1] * M/hl.agg.mean(mt.__n), 1.0), 0.0))
        mt = mt.annotate_cols(__step2_betas=[
            mt.__step1_betas[0],
            mt.__step2_h2 * hl.agg.mean(mt.__n)/M])

    # step 2 block jackknife
    mt = mt.annotate_cols(__step2_block_betas=[
        hl.agg.filter((mt.__step2_block != i) & mt.__in_step2,
                      hl.agg.linreg(y=mt.__y - mt.__step1_betas[0],
                                    x=[mt.__x],
                                    weight=mt.__w).beta[0])
        for i in range(n_blocks)])

    mt = mt.annotate_cols(__step2_block_betas_bias_corrected=hl.map(
        lambda x: n_blocks * mt.__step2_betas[1] - (n_blocks - 1) * x,
        mt.__step2_block_betas))

    mt = mt.annotate_cols(
        __step2_jackknife_mean=hl.mean(
            mt.__step2_block_betas_bias_corrected),
        __step2_jackknife_variance=(
            hl.sum(mt.__step2_block_betas_bias_corrected**2) -
            hl.sum(mt.__step2_block_betas_bias_corrected)**2 /
            n_blocks) / (n_blocks - 1) / n_blocks)

    # combine step 1 and step 2 block jackknifes
    mt = mt.annotate_entries(
        __step2_initial_w=1.0/(mt.__w_initial_floor *
                               2.0 * (mt.__initial_betas[0] +
                                      mt.__initial_betas[1] *
                                      mt.__x_floor)**2))

    mt = mt.annotate_cols(
        __final_betas=[
            mt.__step1_betas[0],
            mt.__step2_betas[1]],
        __c=(hl.agg.sum(mt.__step2_initial_w * mt.__x) /
             hl.agg.sum(mt.__step2_initial_w * mt.__x**2)))

    mt = mt.annotate_cols(__final_block_betas=hl.map(
        lambda i: (mt.__step2_block_betas[i] - mt.__c *
                   (mt.__step1_block_betas[i][0] - mt.__final_betas[0])),
        hl.range(0, n_blocks)))
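    # __c converts a deviation in a block's step-1 intercept into the implied
    # deviation in its step-2 slope, so the combined jackknife accounts for
    # the uncertainty of the fixed intercept.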

    mt = mt.annotate_cols(
        __final_block_betas_bias_corrected=(n_blocks * mt.__final_betas[1] -
                                            (n_blocks - 1) *
                                            mt.__final_block_betas))

    mt = mt.annotate_cols(
        __final_jackknife_mean=[
            mt.__step1_jackknife_mean[0],
            hl.mean(mt.__final_block_betas_bias_corrected)],
        __final_jackknife_variance=[
            mt.__step1_jackknife_variance[0],
            (hl.sum(mt.__final_block_betas_bias_corrected**2) -
             hl.sum(mt.__final_block_betas_bias_corrected)**2 /
             n_blocks) / (n_blocks - 1) / n_blocks])

    # convert coefficient to heritability estimate
    mt = mt.annotate_cols(
        phenotype=mt.__y_name,
        mean_chi_sq=hl.agg.mean(mt.__y),
        intercept=hl.struct(
            estimate=mt.__final_betas[0],
            standard_error=hl.sqrt(mt.__final_jackknife_variance[0])),
        snp_heritability=hl.struct(
            estimate=(M/hl.agg.mean(mt.__n)) * mt.__final_betas[1],
            standard_error=hl.sqrt((M/hl.agg.mean(mt.__n))**2 *
                                   mt.__final_jackknife_variance[1])))

    # format and return results
    ht = mt.cols()
    ht = ht.key_by(ht.phenotype)
    ht = ht.select(ht.mean_chi_sq,
                   ht.intercept,
                   ht.snp_heritability)

    ht_tmp_file = new_temp_file()
    ht.write(ht_tmp_file)
    ht = hl.read_table(ht_tmp_file)
    
    return ht
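
# A minimal NumPy sketch, not part of the excerpt, of the bias-corrected
# block jackknife used for the standard errors above:
import numpy as np

def block_jackknife(theta_full, theta_blocks):
    # theta_full: estimate computed from all blocks.
    # theta_blocks: leave-one-block-out estimates, length n_blocks.
    theta_blocks = np.asarray(theta_blocks, dtype=float)
    n = len(theta_blocks)
    # Pseudovalues: n * full estimate - (n - 1) * leave-one-out estimate.
    pseudovalues = n * theta_full - (n - 1) * theta_blocks
    # Mean of the pseudovalues, and the variance of that mean, matching the
    # __jackknife_mean / __jackknife_variance expressions above.
    return pseudovalues.mean(), pseudovalues.var(ddof=1) / n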
Code example #37
File: family_methods.py Project: bcajes/hail
# Imports assumed by this excerpt; trio_matrix is defined elsewhere in the
# same file.
import hail as hl
from typing import Tuple
from hail.matrixtable import MatrixTable
from hail.methods.misc import require_biallelic
from hail.table import Table

def mendel_errors(call, pedigree) -> Tuple[Table, Table, Table, Table]:
    r"""Find Mendel errors; count per variant, individual and nuclear family.

    .. include:: ../_templates/req_tstring.rst

    .. include:: ../_templates/req_tvariant.rst

    .. include:: ../_templates/req_biallelic.rst

    Examples
    --------

    Find all violations of Mendelian inheritance in each (dad, mom, kid) trio in
    a pedigree and return four tables (all errors, errors by family, errors by
    individual, errors by variant):

    >>> ped = hl.Pedigree.read('data/trios.fam')
    >>> all_errors, per_fam, per_sample, per_variant = hl.mendel_errors(dataset['GT'], ped)

    Export all Mendel errors to a text file:

    >>> all_errors.export('output/all_mendel_errors.tsv')

    Annotate columns with the number of Mendel errors:

    >>> annotated_samples = dataset.annotate_cols(mendel=per_sample[dataset.s])

    Annotate rows with the number of Mendel errors:

    >>> annotated_variants = dataset.annotate_rows(mendel=per_variant[dataset.locus, dataset.alleles])

    Notes
    -----

    The example above returns four tables, which contain Mendelian violations
    grouped in various ways. These tables are modeled after the `PLINK mendel
    formats <https://www.cog-genomics.org/plink2/formats#mendel>`_, resembling
    the ``.mendel``, ``.fmendel``, ``.imendel``, and ``.lmendel`` formats,
    respectively.

    **First table:** all Mendel errors. This table contains one row per Mendel
    error, keyed by the variant and proband id.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - (column key of `dataset`) (:py:data:`.tstr`) -- Proband ID, key field.
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `mendel_code` (:py:data:`.tint32`) -- Mendel error code, see below.

    **Second table:** errors per nuclear family. This table contains one row
    per nuclear family, keyed by the parents.

        - `pat_id` (:py:data:`.tstr`) -- Paternal ID. (key field)
        - `mat_id` (:py:data:`.tstr`) -- Maternal ID. (key field)
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `children` (:py:data:`.tint32`) -- Number of children in this nuclear family.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this nuclear family.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors at SNPs in this
          nuclear family.

    **Third table:** errors per individual. This table contains one row per
    individual. Each error is counted toward the proband, father, and mother
    according to the `Implicated` in the table below.

        - (column key of `dataset`) (:py:data:`.tstr`) -- Sample ID (key field).
        - `fam_id` (:py:data:`.tstr`) -- Family ID.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual.
        - `snp_errors` (:py:data:`.tint64`) -- Number of Mendel errors involving this
          individual at SNPs.

    **Fourth table:** errors per variant.

        - `locus` (:class:`.tlocus`) -- Variant locus, key field.
        - `alleles` (:class:`.tarray` of :py:data:`.tstr`) -- Variant alleles, key field.
        - `errors` (:py:data:`.tint64`) -- Number of Mendel errors in this variant.

    This method only considers complete trios (two parents and proband with
    defined sex). The code of each Mendel error is determined by the table
    below, extending the
    `Plink classification <https://www.cog-genomics.org/plink2/basic_stats#mendel>`__.

    In the table, the copy state of a locus with respect to a trio is defined
    as follows, where PAR is the `pseudoautosomal region
    <https://en.wikipedia.org/wiki/Pseudoautosomal_region>`__ of X and Y
    defined by the reference genome, and the autosome is defined by
    :meth:`~hail.genetics.Locus.in_autosome`.

    - Auto -- in autosome or in PAR or female child
    - HemiX -- in non-PAR of X and male child
    - HemiY -- in non-PAR of Y and male child

    `Any` refers to the set \{ HomRef, Het, HomVar, NoCall \} and `~`
    denotes complement in this set.

    +------+---------+---------+--------+------------+---------------+
    | Code | Dad     | Mom     | Kid    | Copy State | Implicated    |
    +======+=========+=========+========+============+===============+
    |    1 | HomVar  | HomVar  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    2 | HomRef  | HomRef  | Het    | Auto       | Dad, Mom, Kid |
    +------+---------+---------+--------+------------+---------------+
    |    3 | HomRef  | ~HomRef | HomVar | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    4 | ~HomRef | HomRef  | HomVar | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    5 | HomRef  | HomRef  | HomVar | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    6 | HomVar  | ~HomVar | HomRef | Auto       | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    7 | ~HomVar | HomVar  | HomRef | Auto       | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |    8 | HomVar  | HomVar  | HomRef | Auto       | Kid           |
    +------+---------+---------+--------+------------+---------------+
    |    9 | Any     | HomVar  | HomRef | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   10 | Any     | HomRef  | HomVar | HemiX      | Mom, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   11 | HomVar  | Any     | HomRef | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+
    |   12 | HomRef  | Any     | HomVar | HemiY      | Dad, Kid      |
    +------+---------+---------+--------+------------+---------------+

    See Also
    --------
    :func:`.mendel_error_code`

    Parameters
    ----------
    call : :class:`.CallExpression`
    pedigree : :class:`.Pedigree`

    Returns
    -------
    (:class:`.Table`, :class:`.Table`, :class:`.Table`, :class:`.Table`)
    """
    source = call._indices.source
    if not isinstance(source, MatrixTable):
        raise ValueError("'mendel_errors': expected 'call' to be an expression of 'MatrixTable', found {}".format(
            "expression of '{}'".format(source.__class__) if source is not None else 'scalar expression'))

    source = source.select_entries(__GT=call)
    dataset = require_biallelic(source, 'mendel_errors')
    tm = trio_matrix(dataset, pedigree, complete_trios=True)
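    # trio_matrix reshapes the dataset so each column is a complete trio, with
    # father_entry, mother_entry, and proband_entry fields at every variant.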
    tm = tm.select_entries(mendel_code=hl.mendel_error_code(
        tm.locus,
        tm.is_female,
        tm.father_entry['__GT'],
        tm.mother_entry['__GT'],
        tm.proband_entry['__GT']
    ))
    ck_name = next(iter(source.col_key))
    tm = tm.filter_entries(hl.is_defined(tm.mendel_code))
    tm = tm.rename({'id': ck_name})

    entries = tm.entries()

    table1 = entries.select('fam_id', 'mendel_code')

    fam_counts = (
        entries
            .group_by(pat_id=entries.father[ck_name], mat_id=entries.mother[ck_name])
            .partition_hint(min(entries.n_partitions(), 8))
            .aggregate(children=hl.len(hl.agg.collect_as_set(entries[ck_name])),
                       errors=hl.agg.count_where(hl.is_defined(entries.mendel_code)),
                       snp_errors=hl.agg.count_where(hl.is_snp(entries.alleles[0], entries.alleles[1]) &
                                                     hl.is_defined(entries.mendel_code)))
    )
    table2 = tm.key_cols_by().cols()
    table2 = table2.select(pat_id=table2.father[ck_name],
                           mat_id=table2.mother[ck_name],
                           fam_id=table2.fam_id,
                           **fam_counts[table2.father[ck_name], table2.mother[ck_name]])
    table2 = table2.key_by('pat_id', 'mat_id').distinct()
    table2 = table2.annotate(errors=hl.or_else(table2.errors, hl.int64(0)),
                             snp_errors=hl.or_else(table2.snp_errors, hl.int64(0)))

    # in implicated, idx 0 is dad, idx 1 is mom, idx 2 is child
    implicated = hl.literal([
        [0, 0, 0],  # dummy
        [1, 1, 1],
        [1, 1, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [1, 0, 1],
        [0, 1, 1],
        [0, 0, 1],
        [0, 1, 1],
        [0, 1, 1],
        [1, 0, 1],
        [1, 0, 1],
    ], dtype=hl.tarray(hl.tarray(hl.tint64)))
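    # Rows 1-12 mirror the Implicated column of the Mendel code table in the
    # docstring; row 0 is a placeholder because valid codes start at 1.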

    table3 = tm.annotate_cols(all_errors=hl.or_else(hl.agg.array_sum(implicated[tm.mendel_code]), [0, 0, 0]),
                              snp_errors=hl.or_else(
                                  hl.agg.filter(hl.is_snp(tm.alleles[0], tm.alleles[1]),
                                                hl.agg.array_sum(implicated[tm.mendel_code])),
                                  [0, 0, 0])).key_cols_by().cols()

    table3 = table3.select(xs=[
        hl.struct(**{ck_name: table3.father[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[0],
                     'snp_errors': table3.snp_errors[0]}),
        hl.struct(**{ck_name: table3.mother[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[1],
                     'snp_errors': table3.snp_errors[1]}),
        hl.struct(**{ck_name: table3.proband[ck_name],
                     'fam_id': table3.fam_id,
                     'errors': table3.all_errors[2],
                     'snp_errors': table3.snp_errors[2]}),
    ])
    table3 = table3.explode('xs')
    table3 = table3.select(**table3.xs)
    table3 = (table3.group_by(ck_name, 'fam_id')
              .aggregate(errors=hl.agg.sum(table3.errors),
                         snp_errors=hl.agg.sum(table3.snp_errors))
              .key_by(ck_name))

    table4 = tm.select_rows(errors=hl.agg.count_where(hl.is_defined(tm.mendel_code))).rows()

    return table1, table2, table3, table4
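
# A hedged follow-up sketch reusing the names from the docstring example:
# rank nuclear families by their Mendel error counts using the second table.
per_fam.order_by(hl.desc(per_fam.errors)).show(5)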