Example #1
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
            .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                       alleles=[truth.REF, truth.ALT])
            .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
            .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi2 - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #2
    def test_tdt(self):
        pedigree = hl.Pedigree.read(resource('tdt.fam'))
        tdt_tab = (hl.transmission_disequilibrium_test(
            hl.split_multi_hts(hl.import_vcf(resource('tdt.vcf'), min_partitions=4)),
            pedigree))

        truth = hl.import_table(
            resource('tdt_results.tsv'),
            types={'POSITION': hl.tint32, 'T': hl.tint32, 'U': hl.tint32,
                   'Chi2': hl.tfloat64, 'Pval': hl.tfloat64})
        truth = (truth
                 .transmute(locus=hl.locus(truth.CHROM, truth.POSITION),
                            alleles=[truth.REF, truth.ALT])
                 .key_by('locus', 'alleles'))

        if tdt_tab.count() != truth.count():
            self.fail('Result has {} rows but should have {} rows'.format(tdt_tab.count(), truth.count()))

        bad = (tdt_tab.filter(hl.is_nan(tdt_tab.p_value), keep=False)
               .join(truth.filter(hl.is_nan(truth.Pval), keep=False), how='outer'))
        bad.describe()

        bad = bad.filter(~(
                (bad.t == bad.T) &
                (bad.u == bad.U) &
                (hl.abs(bad.chi_sq - bad.Chi2) < 0.001) &
                (hl.abs(bad.p_value - bad.Pval) < 0.001)))

        if bad.count() != 0:
            bad.order_by(hl.asc(bad.locus)).show()
            self.fail('Found rows in violation of the predicate (see show output)')
Example #3
def export_ldscore(ht, pop):
    hm3_snps = hl.read_table(get_hm3_snplist_path(pop))

    ht = ht.select(CHR=ht.locus.contig,
                   SNP=hl.variant_str(ht.locus, ht.alleles),
                   RSID=ht.rsid,
                   BP=ht.locus.position,
                   L2=ht.ld_score,
                   MAF=0.5 - hl.abs(0.5 - ht.AF))
    count = ht.aggregate(
        hl.struct(M=hl.agg.count(), M_5_50=hl.agg.sum(ht.MAF > 0.05)))
    ht = ht.filter(hl.is_defined(hm3_snps[ht.locus, ht.alleles]))
    ht = ht.key_by().drop('locus', 'alleles', 'MAF')

    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M'),
                     'w') as f:
        f.write(f'{count.M}\n')
    with hadoop_open(get_ld_score_flat_file_path(pop, extension='M_5_50'),
                     'w') as f:
        f.write(f'{count.M_5_50}\n')

    # LD score with variant ids
    ht.drop('RSID').export(get_ld_score_flat_file_path(pop))
    # with rsids
    ht.transmute(SNP=ht.RSID).export(
        get_ld_score_flat_file_path(pop, rsid=True))
Example #4
    def test_de_novo(self):
        mt = hl.import_vcf(resource('denovo.vcf'))
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
        ped = hl.Pedigree.read(resource('denovo.fam'))
        r = hl.de_novo(mt, ped, mt.info.ESP)
        r = r.select(
            prior=r.prior,
            kid_id=r.proband.s,
            dad_id=r.father.s,
            mom_id=r.mother.s,
            p_de_novo=r.p_de_novo,
            confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            confidence=truth['Validation_Likelihood'].split('_')[0]).key_by('locus', 'alleles', 'kid_id', 'dad_id',
                                                                            'mom_id')

        j = r.join(truth, how='outer')
        self.assertTrue(j.all((j.confidence == j.confidence_1) & (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
Example #5
    def test_de_novo(self):
        mt = hl.import_vcf(resource('denovo.vcf'))
        mt = mt.filter_rows(mt.locus.in_y_par(), keep=False)  # de_novo_finder doesn't know about y PAR
        ped = hl.Pedigree.read(resource('denovo.fam'))
        r = hl.de_novo(mt, ped, mt.info.ESP)
        r = r.select(
            prior=r.prior,
            kid_id=r.proband.s,
            dad_id=r.father.s,
            mom_id=r.mother.s,
            p_de_novo=r.p_de_novo,
            confidence=r.confidence).key_by('locus', 'alleles', 'kid_id', 'dad_id', 'mom_id')

        truth = hl.import_table(resource('denovo.out'), impute=True, comment='#')
        truth = truth.select(
            locus=hl.locus(truth['Chr'], truth['Pos']),
            alleles=[truth['Ref'], truth['Alt']],
            kid_id=truth['Child_ID'],
            dad_id=truth['Dad_ID'],
            mom_id=truth['Mom_ID'],
            p_de_novo=truth['Prob_dn'],
            confidence=truth['Validation_Likelihood'].split('_')[0]).key_by('locus', 'alleles', 'kid_id', 'dad_id',
                                                                            'mom_id')

        j = r.join(truth, how='outer')
        self.assertTrue(j.all((j.confidence == j.confidence_1) & (hl.abs(j.p_de_novo - j.p_de_novo_1) < 1e-4)))
Example #6
 def get_metric_expr(ht, metric):
     metric_values = hl.agg.collect(ht[metric])
     metric_median = hl.median(metric_values)
     metric_mad = 1.4826 * hl.median(hl.abs(metric_values - metric_median))
     return hl.struct(median=metric_median,
                      mad=metric_mad,
                      upper=metric_median +
                      4 * metric_mad if metric != 'callrate' else 1,
                      lower=metric_median -
                      4 * metric_mad if metric != 'callrate' else 0.99)
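
The helper above builds its result out of `hl.agg.collect`, so it only makes sense inside an aggregation. A minimal sketch of how it might be called, assuming the helper is available as a standalone function and using a toy table with made-up metric names:

import hail as hl

# Toy stand-in for a sample-QC table; 'callrate' and 'r_ti_tv' are assumed field names.
ht = hl.utils.range_table(100)
ht = ht.annotate(callrate=0.95 + 0.0005 * ht.idx, r_ti_tv=2.0 + 0.001 * ht.idx)

# One aggregation pass yields median/MAD-based cutoffs per metric.
cutoffs = ht.aggregate(
    hl.struct(**{m: get_metric_expr(ht, m) for m in ['callrate', 'r_ti_tv']}))
print(cutoffs.r_ti_tv.lower, cutoffs.r_ti_tv.upper)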
Example #7
def get_freq(mt, sex, n_remove, seed):
    r'''
    Get allele frequencies and other SNP information (needed to fix previously 
    created sumstats files)
    '''

    print('... Calculating allele frequency ...')
    mt = mt.annotate_rows(freq=hl.agg.mean(mt.dosage) /
                          2)  #frequency of alternate allele
    mt_rows = mt.rows()
    mt_rows = mt_rows.key_by('rsid')
    mt_rows = mt_rows.annotate(chr=mt_rows.locus.contig,
                               bpos=mt_rows.locus.position)

    ss = hl.import_table(
        wd +
        f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.old.tsv.bgz',
        impute=True,
        key='SNP')

    ss = ss.annotate(
        chr=mt_rows[ss.SNP].chr,
        bpos=mt_rows[ss.SNP].bpos,
        freq=mt_rows[ss.SNP].freq,
        z=((-1) * (ss.beta < 0) * hl.abs(hl.qnorm(ss.p_value / 2)) +
           (ss.beta > 0) * hl.abs(hl.qnorm(ss.p_value / 2))))

    if 'N' in ss.row:
        if 'n' not in ss.row:
            ss = ss.annotate(n=ss.N)
        ss = ss.drop('N')

    ss = ss.rename({'SNP': 'snpid', 'A1': 'a1', 'A2': 'a2', 'p_value': 'pval'})

    ss = ss.key_by()
    ss = ss.select('snpid', 'chr', 'bpos', 'a1', 'a2', 'freq', 'beta', 'z',
                   'pval', 'n')
    ss = ss.key_by('snpid')

    ss.export(
        wd +
        f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz')
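
As a quick, purely illustrative check of the z-score reconstruction used in get_freq above: for a two-sided p-value of 0.05 and a positive beta, the recovered z should be roughly 1.96 (the p-value below is arbitrary, not taken from any sumstats file):

import hail as hl

# |qnorm(p/2)| gives the magnitude of z; the sign is then taken from beta.
print(hl.eval(hl.abs(hl.qnorm(0.05 / 2))))  # ≈ 1.96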
Example #8
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        recoding = {'0{}'.format(i): str(i) for i in range(1, 10)}

        sample_file = resource('example.sample')
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file, contig_recoding=recoding)

        bgenmt = hl.import_bgen(bgen_file, ['GP', 'dosage'], sample_file)
        et = bgenmt.entries()
        et = et.transmute(gp_dosage=hl.gp_dosage(et.GP))
        self.assertTrue(
            et.all((hl.is_missing(et.dosage) & hl.is_missing(et.gp_dosage))
                   | (hl.abs(et.dosage - et.gp_dosage) < 1e-6)))
Example #9
    def test_import_bgen_dosage_and_gp_dosage_function_agree(self):
        recoding = {'0{}'.format(i): str(i) for i in range(1, 10)}

        sample_file = resource('example.sample')
        bgen_file = resource('example.8bits.bgen')
        hl.index_bgen(bgen_file,
                      contig_recoding=recoding)

        bgenmt = hl.import_bgen(bgen_file, ['GP', 'dosage'], sample_file)
        et = bgenmt.entries()
        et = et.transmute(gp_dosage = hl.gp_dosage(et.GP))
        self.assertTrue(et.all(
            (hl.is_missing(et.dosage) & hl.is_missing(et.gp_dosage)) |
            (hl.abs(et.dosage - et.gp_dosage) < 1e-6)))
Example #10
def get_freq_alt(mt, sex, n_remove, seed):

    ss = hl.import_table(
        wd +
        f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz',
        impute=True,
        key='snpid')
    ss = ss.annotate(z=((-1) * (ss.beta < 0) * hl.abs(hl.qnorm(ss.pval / 2)) +
                        (ss.beta > 0) * hl.abs(hl.qnorm(ss.pval / 2))))

    if 'N' in ss.row:
        if 'n' not in ss.row:
            ss = ss.annotate(n=ss.N)
        ss = ss.drop('N')

    ss = ss.key_by()
    ss = ss.select('snpid', 'chr', 'bpos', 'a1', 'a2', 'freq', 'beta', 'z',
                   'pval', 'n')
    ss = ss.key_by('snpid')

    ss.export(
        wd +
        f'{phen}.gwas.{sex}.n_remove_{n_remove_per_sex}.seed_{seed}.tsv.bgz')
Example #11
def get_platform_specific_intervals(platform_pc_loadings_ht: hl.Table,
                                    threshold: float) -> List[hl.Interval]:
    """
    This takes the platform PC loadings and returns a list of intervals where the sum of the absolute loadings is above the given threshold.
    The experimental / untested idea behind this is that those intervals may be problematic on some platforms.

    :param Table platform_pc_loadings_ht: Platform PCA loadings indexed by interval
    :param float threshold: Minimal threshold
    :return: List of intervals with PC loadings above the given threshold
    :rtype: list of Interval
    """
    platform_specific_intervals = platform_pc_loadings_ht.filter(
        hl.sum(hl.abs(platform_pc_loadings_ht.loadings)) >= threshold)
    return platform_specific_intervals.interval.collect()
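
A self-contained sketch of calling the helper above on a toy loadings table; the intervals, loadings, and threshold are all made up for illustration:

import hail as hl

# Three fake rows, each carrying an interval and an array of PC loadings.
ht = hl.utils.range_table(3)
ht = ht.annotate(
    interval=hl.locus_interval('20', 1 + ht.idx * 1000, 1000 + ht.idx * 1000),
    loadings=[0.001 * ht.idx, -0.02 * ht.idx],
)
# Rows 1 and 2 have summed |loadings| >= 0.02, so two intervals come back.
print(get_platform_specific_intervals(ht, threshold=0.02))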
Example #12
def get_median_and_mad_expr(
    metric_expr: hl.expr.ArrayNumericExpression, k: float = 1.4826
) -> hl.expr.StructExpression:
    """
    Computes the median and median absolute deviation (MAD) for the given expression.
    Note that the default value of k assumes normally distributed data.

    :param metric_expr: Expression to compute median and MAD for
    :param k: The scaling factor for MAD calculation. Default assumes normally distributed data.
    :return: Struct with median and MAD
    """
    return hl.bind(
        lambda x: hl.struct(median=x[1], mad=k * hl.median(hl.abs(x[0] - x[1]))),
        hl.bind(lambda x: hl.tuple([x, hl.median(x)]), hl.agg.collect(metric_expr)),
    )
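
A minimal usage sketch under stated assumptions (a toy table with a random standard-normal metric): because the struct above is made of aggregators, it can be passed directly to `Table.aggregate`. The median should come out near 0 and the scaled MAD near 1.

import hail as hl

ht = hl.utils.range_table(1000)
ht = ht.annotate(metric=hl.rand_norm(0, 1))
stats = ht.aggregate(get_median_and_mad_expr(ht.metric))
print(stats.median, stats.mad)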
Example #13
    def filter(self, mt):
        row_filter = mt[self._row_filter].filters if self._row_filter else mt.exclude_row
        col_filter = mt[self._col_filter].filters if self._col_filter else mt.exclude_col

        mt = mt.annotate_rows(cr=hl.or_missing(row_filter == False,
                                               hl.agg.group_by(mt.is_case,
                                                               hl.agg.filter(col_filter == False,
                                                                             variant_qc_aggregator(mt).call_rate))))

        mt = mt.annotate_rows(diff=hl.abs(mt.cr[False] - mt.cr[True]))

        mt = mt.annotate_rows(**{
            'cr_diff': hl.struct(
                filters=hl.agg.any((mt.diff > self._cr_thresh) & (mt[self._initial_row_filter].filters == False)))})

        return mt
Example #14
def metaanalyze_gwas(subsets, gwas_ht_list, sim_name, param_suffix, wd):

    if len(gwas_ht_list) == 1:  # if list is single GWAS, don't meta-analyze
        return gwas_ht_list[0]

    sample_ct_dict = {}

    for subset_idx, tmp_gwas_ht in enumerate(gwas_ht_list, 1):
        sample_ct = subsets.filter(subsets.subset_idx == subset_idx).count()
        sample_ct_dict[subset_idx] = sample_ct
        print(
            f'\n\nmeta-analysis sample count subset {subset_idx}: {sample_ct}\n\n'
        )

    comb_gwas_ht = gwas_ht_list[0].annotate(subset_idx=1, n=sample_ct_dict[1])
    union_args = [
        ht.annotate(subset_idx=subset_idx, n=sample_ct_dict[subset_idx])
        for subset_idx, ht in enumerate(gwas_ht_list[1:], 2)
    ]  # list of gwas_ht's to join
    comb_gwas_ht = comb_gwas_ht.union(*union_args)

    comb_gwas_ht = comb_gwas_ht.annotate(w=1 /
                                         (comb_gwas_ht['standard_error']**2))

    agg_expr = {
        'meta_se':
        hl.sqrt(1 / (hl.agg.sum(comb_gwas_ht.w))),
        'meta_beta':
        hl.agg.sum(comb_gwas_ht['beta'] * comb_gwas_ht.w) /
        hl.agg.sum(comb_gwas_ht.w),
        'meta_EAF':
        hl.agg.sum(comb_gwas_ht['EAF'] * comb_gwas_ht['n']) /
        hl.agg.sum(comb_gwas_ht['n'])
    }

    comb_gwas_ht = comb_gwas_ht.group_by('locus',
                                         'alleles').aggregate(**agg_expr)

    comb_gwas_ht = comb_gwas_ht.annotate(
        meta_pval=2 *
        hl.pnorm(-hl.abs(comb_gwas_ht.meta_beta / comb_gwas_ht.meta_se)))

    meta_gwas_path = f'{wd}/gwas.logreg.{sim_name}.{param_suffix}.tsv.gz'
    comb_gwas_ht.export(meta_gwas_path)
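
A quick sanity check of the two-sided p-value formula used for `meta_pval` above (the z value is illustrative only): a z-statistic of 1.96 should give a p-value of roughly 0.05.

import hail as hl

# 2 * Phi(-|z|) with z = 1.96
print(hl.eval(2 * hl.pnorm(-hl.abs(1.96))))  # ≈ 0.05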
Example #15
def assert_c_king_same_as_hail_king(c_king_path, hail_king_mt):
    actual = hail_king_mt.entries()
    expected = hl.import_table(c_king_path,
                               types={'Kinship': hl.tfloat},
                               key=['ID1', 'ID2'])
    expected = expected.rename({'ID1': 's_1',
                                'ID2': 's',
                                'Kinship': 'phi'})
    expected = expected.key_by('s_1', 's')
    expected = expected.annotate(actual=actual[expected.key])
    expected = expected.select(
        expected=expected.phi,
        actual=expected.actual.phi,
        diff=expected.phi - expected.actual.phi
    )
    expected = expected.annotate(
        # KING prints 4 significant digits; but there are several instances
        # where we calculate 0.XXXX5 whereas KING outputs 0.XXXX
        failure=hl.abs(expected.diff) > 0.00006)
    expected = expected.filter(expected.failure)
    assert expected.count() == 0, expected.collect()
Example #16
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    # get frequency of loadings values
    loadings = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings.loadings).take(1)[0]
    print(loadings.count())
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        freq = Counter(hl.abs(loadings.loadings[i]).collect())
        filename = 'loadings_pc' + str(pc) + '.txt'
        with open(filename, 'w') as f:
            for key, value in freq.items():
                str_value = repr(key) + ' ' + repr(value)
                f.write(str_value + '\n')
        f.close()
        subprocess.run(['gsutil', 'cp', filename, output], check=False)

    # pull out variants that looked like they're capped in the loadings plot
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    # Get NFE samples only
    mt = mt.filter_cols((
        mt.hgdp_1kg_metadata.population_inference.pop == 'nfe')
                        | (mt.s.contains('TOB')))
    intervals = [
        hl.parse_locus(x, reference_genome='GRCh38') for x in [
            'chr1:176163025',
            'chr5:272714',
            'chr5:36104012',
            'chr1:183565810',
            'chr3:58111799',
        ]
    ]
    mt_hits = mt.filter_rows(hl.literal(intervals).contains(mt.locus))
    mt_path = f'{output}/capped_loadings_intervals.mt'
    mt_hits.write(mt_path)
Example #17
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0]
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        p = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = f'{output}/loadings_manhattan_plot_pc' + str(
            pc) + '.png'
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(p).save(f, format='PNG')
        plot_filename_html = 'loadings_pc' + str(pc) + '.html'
        output_file(plot_filename_html)
        save(p)
        subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                       check=False)
Example #18
def create_binned_concordance(data_type: str, truth_sample: str, metric: str,
                              nbins: int, overwrite: bool) -> None:
    """
    Creates and writes a concordance table binned by rank (both absolute and relative) for a given data type, truth sample and metric.

    :param str data_type: One of 'exomes' or 'genomes'
    :param str truth_sample: Which truth sample concordance to load
    :param str metric: One of the evaluation metrics (or a RF hash)
    :param int nbins: Number of bins for the rank
    :param bool overwrite: Whether to overwrite existing table
    :return: Nothing -- just writes the table
    :rtype: None
    """

    if hl.hadoop_exists(
            binned_concordance_path(data_type, truth_sample, metric) +
            '/_SUCCESS') and not overwrite:
        logger.warn(
            f"Skipping binned concordance creation as {binned_concordance_path(data_type, truth_sample, metric)} exists and overwrite=False"
        )
    else:
        ht = hl.read_table(
            annotations_ht_path(data_type, f'{truth_sample}_concordance'))
        # Remove 1bp indels for syndip as cannot be trusted
        if truth_sample == 'syndip':
            ht = ht.filter(
                hl.is_indel(ht.alleles[0], ht.alleles[1]) &
                (hl.abs(hl.len(ht.alleles[0]) - hl.len(ht.alleles[1])) == 1),
                keep=False)
            high_conf_intervals = hl.import_locus_intervals(
                syndip_high_conf_regions_bed_path)
        else:
            high_conf_intervals = hl.import_locus_intervals(
                NA12878_high_conf_regions_bed_path)

        lcr = hl.import_locus_intervals(lcr_intervals_path)
        segdup = hl.import_locus_intervals(segdup_intervals_path)
        ht = ht.filter(
            hl.is_defined(high_conf_intervals[ht.locus])
            & hl.is_missing(lcr[ht.locus]) & hl.is_missing(segdup[ht.locus]))

        if metric in ['vqsr', 'rf_2.0.2', 'rf_2.0.2_beta', 'cnn']:
            metric_ht = hl.read_table(score_ranking_path(data_type, metric))
        else:
            metric_ht = hl.read_table(
                rf_path(data_type, 'rf_result', run_hash=metric))

        metric_snvs, metrics_indels = metric_ht.aggregate([
            hl.agg.count_where(
                hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1])),
            hl.agg.count_where(
                ~hl.is_snp(metric_ht.alleles[0], metric_ht.alleles[1]))
        ])

        snvs, indels = ht.aggregate([
            hl.agg.count_where(hl.is_snp(ht.alleles[0], ht.alleles[1])),
            hl.agg.count_where(~hl.is_snp(ht.alleles[0], ht.alleles[1]))
        ])

        ht = ht.annotate_globals(global_counts=hl.struct(
            snvs=metric_snvs, indels=metrics_indels),
                                 counts=hl.struct(snvs=snvs, indels=indels))

        ht = ht.annotate(
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            score=metric_ht[ht.key].score,
            global_rank=metric_ht[ht.key].rank,
            # TP => allele is found in both data sets
            n_tp=ht.concordance[3][3] + ht.concordance[3][4] +
            ht.concordance[4][3] + ht.concordance[4][4],
            # FP => allele is found only in test data set
            n_fp=hl.sum(ht.concordance[3][:2]) + hl.sum(ht.concordance[4][:2]),
            # FN => allele is found only in truth data set
            n_fn=hl.sum(ht.concordance[:2].map(lambda x: x[3] + x[4])))

        ht = add_rank(ht, -1.0 * ht.score)

        ht = ht.annotate(rank=[
            hl.tuple([
                'global_rank', (ht.global_rank + 1) /
                hl.cond(ht.snv, ht.globals.global_counts.snvs,
                        ht.globals.global_counts.indels)
            ]),
            hl.tuple([
                'truth_sample_rank', (ht.rank + 1) / hl.cond(
                    ht.snv, ht.globals.counts.snvs, ht.globals.counts.indels)
            ])
        ])

        ht = ht.explode(ht.rank)
        ht = ht.annotate(rank_name=ht.rank[0], bin=hl.int(ht.rank[1] * nbins))

        ht = ht.group_by('rank_name', 'snv', 'bin').aggregate(
            # Look at site-level metrics -> tp > fp > fn -- only important for multi-sample comparisons
            tp=hl.agg.count_where(ht.n_tp > 0),
            fp=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp > 0)),
            fn=hl.agg.count_where((ht.n_tp == 0) & (ht.n_fp == 0)
                                  & (ht.n_fn > 0)),
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n_alleles=hl.agg.count()).repartition(5)

        ht.write(binned_concordance_path(data_type, truth_sample, metric),
                 overwrite=overwrite)
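
A hypothetical invocation of the function above; the argument values are placeholders taken from strings the function itself checks for, not from any real run:

create_binned_concordance(data_type='genomes', truth_sample='NA12878',
                          metric='vqsr', nbins=100, overwrite=False)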
Example #19
def main(args):
    hl.init()

    # Read in all sumstats
    mt = load_final_sumstats_mt(filter_phenos=True,
                                filter_variants=False,
                                filter_sumstats=True,
                                separate_columns_by_pop=False,
                                annotate_with_nearest_gene=False)

    # Annotate per-entry sample size
    def get_n(pheno_data, i):
        return pheno_data[i].n_cases + hl.or_else(pheno_data[i].n_controls, 0)

    mt = mt.annotate_entries(summary_stats=hl.map(
        lambda x: x[1].annotate(N=hl.or_missing(hl.is_defined(x[1]),
                                                get_n(mt.pheno_data, x[0]))),
        hl.zip_with_index(mt.summary_stats)))

    # Exclude entries with low confidence flag.
    if not args.keep_low_confidence_variants:
        mt = mt.annotate_entries(summary_stats=hl.map(
            lambda x: hl.or_missing(~x.low_confidence, x), mt.summary_stats))

    # Run fixed-effect meta-analysis (all + leave-one-out)
    mt = mt.annotate_entries(unnorm_beta=mt.summary_stats.BETA /
                             (mt.summary_stats.SE**2),
                             inv_se2=1 / (mt.summary_stats.SE**2))
    mt = mt.annotate_entries(
        sum_unnorm_beta=all_and_leave_one_out(mt.unnorm_beta,
                                              mt.pheno_data.pop),
        sum_inv_se2=all_and_leave_one_out(mt.inv_se2, mt.pheno_data.pop))
    mt = mt.transmute_entries(META_BETA=mt.sum_unnorm_beta / mt.sum_inv_se2,
                              META_SE=hl.map(lambda x: hl.sqrt(1 / x),
                                             mt.sum_inv_se2))
    mt = mt.annotate_entries(
        META_Pvalue=hl.map(lambda x: 2 * hl.pnorm(x), -hl.abs(mt.META_BETA /
                                                              mt.META_SE)))

    # Run heterogeneity test (Cochran's Q)
    mt = mt.annotate_entries(META_Q=hl.map(
        lambda x: hl.sum((mt.summary_stats.BETA - x)**2 * mt.inv_se2),
        mt.META_BETA),
                             variant_exists=hl.map(lambda x: ~hl.is_missing(x),
                                                   mt.summary_stats.BETA))
    mt = mt.annotate_entries(META_N_pops=all_and_leave_one_out(
        mt.variant_exists, mt.pheno_data.pop))
    mt = mt.annotate_entries(META_Pvalue_het=hl.map(
        lambda i: hl.pchisqtail(mt.META_Q[i], mt.META_N_pops[i] - 1),
        hl.range(hl.len(mt.META_Q))))

    # Add other annotations
    mt = mt.annotate_entries(
        ac_cases=hl.map(lambda x: x["AF.Cases"] * x.N, mt.summary_stats),
        ac_controls=hl.map(lambda x: x["AF.Controls"] * x.N, mt.summary_stats),
        META_AC_Allele2=all_and_leave_one_out(
            mt.summary_stats.AF_Allele2 * mt.summary_stats.N,
            mt.pheno_data.pop),
        META_N=all_and_leave_one_out(mt.summary_stats.N, mt.pheno_data.pop))
    mt = mt.annotate_entries(
        META_AF_Allele2=mt.META_AC_Allele2 / mt.META_N,
        META_AF_Cases=all_and_leave_one_out(mt.ac_cases, mt.pheno_data.pop) /
        mt.META_N,
        META_AF_Controls=all_and_leave_one_out(mt.ac_controls,
                                               mt.pheno_data.pop) / mt.META_N)
    mt = mt.drop('unnorm_beta', 'inv_se2', 'variant_exists', 'ac_cases',
                 'ac_controls', 'summary_stats', 'META_AC_Allele2')

    # Format everything into array<struct>
    def is_finite_or_missing(x):
        return (hl.or_missing(hl.is_finite(x), x))

    meta_fields = [
        'BETA', 'SE', 'Pvalue', 'Q', 'Pvalue_het', 'N', 'N_pops', 'AF_Allele2',
        'AF_Cases', 'AF_Controls'
    ]
    mt = mt.transmute_entries(meta_analysis=hl.map(
        lambda i: hl.struct(
            **{
                field: is_finite_or_missing(mt[f'META_{field}'][i])
                for field in meta_fields
            }), hl.range(hl.len(mt.META_BETA))))

    col_fields = ['n_cases', 'n_controls']
    mt = mt.annotate_cols(
        **{
            field: all_and_leave_one_out(mt.pheno_data[field],
                                         mt.pheno_data.pop)
            for field in col_fields
        })
    col_fields += ['pop']
    mt = mt.annotate_cols(pop=all_and_leave_one_out(
        mt.pheno_data.pop,
        mt.pheno_data.pop,
        all_f=lambda x: x,
        loo_f=lambda i, x: hl.filter(lambda y: y != x[i], x),
    ))
    mt = mt.transmute_cols(meta_analysis_data=hl.map(
        lambda i: hl.struct(**{field: mt[field][i]
                               for field in col_fields}),
        hl.range(hl.len(mt.pop))))

    mt.describe()
    mt.write(get_meta_analysis_results_path(), overwrite=args.overwrite)

    hl.copy_log('gs://ukb-diverse-pops/combined_results/meta_analysis.log')
Example #20
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    scores = hl.read_table(SCORES)
    scores = scores.annotate(
        study=hl.if_else(scores.s.contains('TOB'), 'TOB-WGS', 'HGDP-1kG'))
    sample_names = scores.s.collect()
    labels = scores.study.collect()
    study = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    # plot by study
    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Study',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', ['#1b9e77', '#d95f02'], study),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'study_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'study_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by continental population
    hgdp1kg_tobwgs = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = scores.annotate(continental_pop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.population_inference.pop)
    labels = scores.continental_pop.collect()
    # Change TOB-WGS 'none' values to 'TOB-WGS'
    labels = ['TOB-NFE' if x is None else x for x in labels]
    continental_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Continental Population',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(continental_population)),
                              continental_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'continental_pop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'continental_pop_pc{pc2}.html',
                                         'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # plot by subpopulation
    scores = scores.annotate(subpop=hgdp1kg_tobwgs.cols()[
        scores.s].hgdp_1kg_metadata.labeled_subpop)
    labels = scores.subpop.collect()
    labels = ['TOB-NFE' if x is None else x for x in labels]
    sub_population = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='Subpopulation',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=scores.scores[pc1].collect(),
                y=scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=4,
            color=factor_cmap('label', turbo(len(sub_population)),
                              sub_population),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'subpop_pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'subpop_pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)

    # Plot loadings
    loadings_ht = hl.read_table(LOADINGS)
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot = manhattan_loadings(
            pvals=hl.abs(loadings_ht.loadings[i]),
            locus=loadings_ht.locus,
            title='Loadings of PC ' + str(pc),
            collect_all=True,
        )
        plot_filename = output_path(f'loadings_pc{pc}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #21
 def get_ac(af, an):
     if filter_mac_instead_of_ac:
         # Note that the underlying file behind get_ukb_af_ht_path() accidentally doubles af and halves an
         return (1.0 - hl.abs(1.0 - af)) * an
     else:
         return af * an
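
To see why the folded form still behaves like a minor allele count under the quirk described in the comment (AF doubled, AN halved), take an illustrative true AF of 0.8 with AN = 1000, so MAC = 200; the file would then carry af = 1.6 and an = 500:

import hail as hl

# (1 - |1 - 1.6|) * 500 == 0.4 * 500 == 200, matching the expected MAC.
print(hl.eval((1.0 - hl.abs(1.0 - 1.6)) * 500))  # ≈ 200.0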
Example #22
 def get_ac(af, an):
     if filter_mac_instead_of_ac:
         return (0.5 - hl.abs(0.5 - af)) * an
     else:
         return af * an
Example #23
 def get_maf(af):
     return 0.5 - hl.abs(0.5 - af)
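
For illustration (arbitrary values, assuming get_maf is in scope): the fold above maps an alternate-allele frequency greater than 0.5 down to the minor allele frequency and leaves smaller frequencies unchanged.

import hail as hl

print(hl.eval(get_maf(0.8)))  # ≈ 0.2
print(hl.eval(get_maf(0.3)))  # ≈ 0.3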
Example #24
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    loadings_ht = hl.read_table(LOADINGS)
    gtf_ht = hl.experimental.import_gtf(
        GTF_FILE,
        reference_genome='GRCh38',
        skip_invalid_contigs=True,
        min_partitions=12,
    )
    number_of_pcs = hl.len(loadings_ht.loadings).take(1)[0] - 1
    for i in range(0, (number_of_pcs)):
        pc = i + 1
        plot_filename = output_path(f'loadings_manhattan_plot_pc{pc}.png',
                                    'web')
        if not hl.hadoop_exists(plot_filename):
            p = manhattan_loadings(
                iteration=i,
                gtf=gtf_ht,
                loadings=loadings_ht,
                title=f'Loadings of PC{pc}',
                collect_all=True,
            )
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(p).save(f, format='PNG')
            html = file_html(p, CDN, 'my plot')
            plot_filename_html = output_path(f'loadings_pc{pc}.html', 'web')
            with hl.hadoop_open(plot_filename_html, 'w') as f:
                f.write(html)

    # Get samples which are driving loadings
    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    scores = hl.read_table(SCORES)
    mt = mt.semi_join_cols(scores)
    loadings_ht = loadings_ht.key_by('locus')
    mt = mt.annotate_rows(loadings=loadings_ht[mt.locus].loadings)

    for dim in range(0, number_of_pcs):
        max_value = mt.aggregate_rows(hl.agg.stats(hl.abs(
            mt.loadings[dim]))).max
        significant_variants = mt.filter_rows(
            hl.abs(mt.loadings[dim]) == max_value)
        significant_variants = hl.sample_qc(significant_variants)
        significant_variant_list = significant_variants.locus.collect()
        print(f'PC{dim}:', significant_variant_list)
        heterozygous_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_het > 0).s.collect()
        homozygous_alternate_samples = significant_variants.filter_cols(
            significant_variants.sample_qc.n_hom_var > 0).s.collect()
        if len(heterozygous_samples) > len(homozygous_alternate_samples):
            homozygous_alternate_samples.extend('null' for _ in range(
                len(heterozygous_samples) - len(homozygous_alternate_samples)))
        elif len(heterozygous_samples) < len(homozygous_alternate_samples):
            heterozygous_samples.extend('null' for _ in range(
                len(homozygous_alternate_samples) - len(heterozygous_samples)))

        # save as html
        html = pd.DataFrame({
            'heterozygous_samples':
            heterozygous_samples,
            'homozygous_alternate_samples':
            homozygous_alternate_samples,
        }).to_html()
        plot_filename_html = output_path(
            f'significant_variants_non_ref_samples{dim}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)
Example #25
for phen_i in idx:
    phen = phens[phen_i]
    output_path = wd + f'{phen}.diffgwasloci.tsv.bgz'
    try:
        subprocess.check_output(['gsutil', 'ls', output_path])
        print(f'\n#############\n{phen} already completed!\n#############\n')
    except:
        print(
            f'\n#############\nStarting phenotype {phen} ({idx.index(phen_i)+1} of {len(phens)} for paridx {paridx})\n#############\n'
        )
        start = dt.datetime.now()
        f = hl.import_table(path + phen + '.gwas.imputed_v3.female.tsv.bgz',
                            force_bgz=True,
                            impute=True,
                            key='variant')
        m = hl.import_table(path + phen + '.gwas.imputed_v3.male.tsv.bgz',
                            force_bgz=True,
                            impute=True,
                            key='variant')
        both = f.join(m)
        both1 = both.filter(
            ~(both.low_confidence_variant |
              both.low_confidence_variant_1))  #remove low confidence variants
        both2 = both1.annotate(diff=(both1.beta - both1.beta_1),
                               diff_se=hl.sqrt(both1.se**2 + both1.se_1**2))
        both3 = both2.annotate(diff_pval=2 *
                               hl.pnorm(-hl.abs(both2.diff / both2.diff_se)))
        both3.select('diff', 'diff_se', 'diff_pval').export(output_path)
        print(
            f'\n#############\nTime for phenotype {phen}: {round((dt.datetime.now()-start).seconds/60, 2)} min\n#############'
        )
Example #26
def manhattan_loadings(
    iteration,
    gtf,
    loadings,
    title=None,
    size=4,
    hover_fields=None,
    collect_all=False,
    n_divisions=500,
):
    """modify hail manhattan plot"""
    palette = [
        '#1f77b4',
        '#ff7f0e',
        '#2ca02c',
        '#d62728',
        '#9467bd',
        '#8c564b',
        '#e377c2',
        '#7f7f7f',
        '#bcbd22',
        '#17becf',
    ]

    # add gene names, p-values, and locus info
    loadings = loadings.annotate(gene_names=gtf[loadings.locus].gene_name)
    pvals = hl.abs(loadings.loadings[iteration])
    locus = loadings.locus

    if hover_fields is None:
        hover_fields = {}

    hover_fields['locus'] = hl.str(locus)
    hover_fields['gene'] = hl.str(loadings.gene_names)

    source_pd = (
        hl.plot.plots._collect_scatter_plot_data(  # pylint: disable=protected-access
            ('_global_locus', locus.global_position()),
            ('_pval', pvals),
            fields=hover_fields,
            n_divisions=None if collect_all else n_divisions,
        ))
    source_pd['p_value'] = source_pd['_pval']
    source_pd['_contig'] = [
        locus.split(':')[0] for locus in source_pd['locus']
    ]

    observed_contigs = set(source_pd['_contig'])
    ref = locus.dtype.reference_genome
    observed_contigs = [
        contig for contig in ref.contigs.copy() if contig in observed_contigs
    ]

    contig_ticks = [
        ref._contig_global_position(contig)  # pylint: disable=protected-access
        + ref.contig_length(contig) // 2 for contig in observed_contigs
    ]
    color_mapper = CategoricalColorMapper(factors=ref.contigs,
                                          palette=palette[:2] * int(
                                              (len(ref.contigs) + 1) / 2))

    p = figure(title=title,
               x_axis_label='Chromosome',
               y_axis_label='Loadings',
               width=1000)
    (
        p,
        _,
        legend,
        _,
        _,
        _,
    ) = hl.plot.plots._get_scatter_plot_elements(  # pylint: disable=protected-access
        p,
        source_pd,
        x_col='_global_locus',
        y_col='_pval',
        label_cols=['_contig'],
        colors={'_contig': color_mapper},
        size=size,
    )
    legend.visible = False
    p.xaxis.ticker = contig_ticks
    p.xaxis.major_label_overrides = dict(zip(contig_ticks, observed_contigs))
    p.select_one(HoverTool).tooltips = [
        t for t in p.select_one(HoverTool).tooltips if not t[0].startswith('_')
    ]

    return p
Example #27
#aggregation per TSS distance, eQTL p value, and MAC in GTEx
ems = hl.read_table(
    "gs://qingbowang/ems_v1_test/ems_pcausal_gtexvg_all{0}.ht".format(
        tissue_name))
vg = hl.read_table(
    "gs://qingbowang/ems_v1_test/{0}_allpairs.ht".format(tissue_name))
vg = vg.annotate(vg=vg.variant_id + "_" + vg.gene_id)
vg = vg.key_by("vg")

ems = ems.join(vg, how="left")
ems = ems.annotate(conf_gain_log10_bin=hl.ceil(ems.confidence_gain_log10))

#tss dist bin
ems = ems.annotate(
    tss_dist_bin_unsigned=hl.ceil(hl.log10(hl.abs(ems.tss_distance))))
ems = ems.transmute(
    tss_dist_bin=hl.cond(ems.tss_distance > 0, ems.tss_dist_bin_unsigned,
                         ems.tss_dist_bin_unsigned * -1))
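
# Illustration of the signed binning above (value chosen arbitrarily): a variant
# 2,500 bp upstream of the TSS has tss_distance = -2500, so tss_dist_bin_unsigned
# is ceil(log10(2500)) = 4 and the signed tss_dist_bin becomes -4.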
agged = ems.group_by("tss_dist_bin",
                     "conf_gain_log10_bin").aggregate(n=hl.agg.count())
agged.export("gs://qingbowang/ems_v1_test/tmp/{0}_tssdist_vs_EMS.tsv".format(
    tissue_name))

#p value
ems = ems.annotate(
    pval_bin=hl.case().when(ems.pval_nominal < 5 * 10**-8, -1).when(
        ems.pval_nominal > 0.05, 1).default(0))
agged = ems.group_by("pval_bin",
                     "conf_gain_log10_bin").aggregate(n=hl.agg.count())
agged.export(
Example #28
def score_bin_agg(
    ht: hl.GroupedTable, fam_stats_ht: hl.Table
) -> Dict[str, hl.expr.Aggregation]:
    """
    Default aggregation function to add aggregations for min/max of score, number of ClinVar variants, number of truth
    variants (omni, mills, hapmap, and kgp_phase1), and family statistics.

    .. note::

        This function uses `ht._parent` to get the origin Table from the GroupedTable for the aggregation

    This can easily be combined with the GroupedTable returned by `compute_grouped_binned_ht`

    Example:

    .. code-block:: python

        binned_ht = create_binned_ht(...)
        grouped_binned_ht = compute_grouped_binned_ht(binned_ht)
        agg_ht = grouped_binned_ht.aggregate(score_bin_agg(**grouped_binned_ht, ...))

    .. note::

        The following annotations should be present:

        In ht:
            - score
            - singleton
            - positive_train_site
            - negative_train_site
            - ac_raw - expected that this is the raw allele count before adj filtering
            - ac - expected that this is the allele count after adj filtering
            - ac_qc_samples_unrelated_raw - allele count before adj filtering for unrelated samples passing sample QC
            - info - struct that includes QD, FS, and MQ in order to add an annotation for fail_hard_filters

        In truth_ht:
            - omni
            - mills
            - hapmap
            - kgp_phase1_hc

        In fam_stats_ht:
            - n_de_novos_adj
            - n_de_novos_raw
            - n_transmitted_raw
            - n_untransmitted_raw

    Automatic aggregations that will be done are:
        - `min_score` - minimum of score annotation per group
        - `max_score` - maximum of score annotation per group
        - `n` - count of variants per group
        - `n_ins` - count of insertions per group
        - `n_del` - count of deletions per group
        - `n_ti` - count of transitions per group
        - `n_tv` - count of transversions per group
        - `n_1bp_indel` - count of one base pair indels per group
        - `n_mod3bp_indel` - count of indels with a length divisible by three per group
        - `n_singleton` - count of singletons per group
        - `fail_hard_filters` - count of variants per group with QD < 2 | FS > 60 | MQ < 30
        - `n_vqsr_pos_train` - count of variants that were a VQSR positive train site per group
        - `n_vqsr_neg_train` - count of variants that were a VQSR negative train site per group
        - `n_clinvar` - count of clinvar variants
        - `n_de_novos_singleton_adj` - count of singleton de novo variants after adj filtration
        - `n_de_novo_singleton` - count of raw unfiltered singleton de novo variants
        - `n_de_novos_adj` - count of adj filtered de novo variants
        - `n_de_novos` - count of raw unfiltered de novo variants
        - `n_trans_singletons` - count of transmitted singletons
        - `n_untrans_singletons` - count of untransmitted singletons
        - `n_omni` - count of omni truth variants
        - `n_mills` - count of mills truth variants
        - `n_hapmap` - count of hapmap truth variants
        - `n_kgp_phase1_hc` - count of 1000 genomes phase 1 high confidence truth variants

    :param ht: Table that aggregation will be performed on
    :param fam_stats_ht: Path to family statistics HT
    :return: a dictionary containing aggregations to perform on ht
    """
    # Annotate binned table with the evaluation data
    ht = ht._parent
    indel_length = hl.abs(ht.alleles[0].length() - ht.alleles[1].length())
    # Load external evaluation data
    build = get_reference_genome(ht.locus).name
    clinvar = (
        grch37_resources.reference_data.clinvar
        if build == "GRCh37"
        else grch38_resources.reference_data.clinvar
    ).ht()[ht.key]
    truth_data = (
        grch37_resources.reference_data.get_truth_ht()
        if build == "GRCh37"
        else grch38_resources.reference_data.get_truth_ht()
    )[ht.key]
    fam = fam_stats_ht[ht.key]

    return dict(
        min_score=hl.agg.min(ht.score),
        max_score=hl.agg.max(ht.score),
        n=hl.agg.count(),
        n_ins=hl.agg.count_where(hl.is_insertion(ht.alleles[0], ht.alleles[1])),
        n_del=hl.agg.count_where(hl.is_deletion(ht.alleles[0], ht.alleles[1])),
        n_ti=hl.agg.count_where(hl.is_transition(ht.alleles[0], ht.alleles[1])),
        n_tv=hl.agg.count_where(hl.is_transversion(ht.alleles[0], ht.alleles[1])),
        n_1bp_indel=hl.agg.count_where(indel_length == 1),
        n_mod3bp_indel=hl.agg.count_where((indel_length % 3) == 0),
        n_singleton=hl.agg.count_where(ht.singleton),
        fail_hard_filters=hl.agg.count_where(
            (ht.info.QD < 2) | (ht.info.FS > 60) | (ht.info.MQ < 30)
        ),
        n_pos_train=hl.agg.count_where(ht.positive_train_site),
        n_neg_train=hl.agg.count_where(ht.negative_train_site),
        n_clinvar=hl.agg.count_where(hl.is_defined(clinvar)),
        n_de_novos_singleton_adj=hl.agg.filter(
            ht.ac == 1, hl.agg.sum(fam.n_de_novos_adj)
        ),
        n_de_novo_singleton=hl.agg.filter(
            ht.ac_raw == 1, hl.agg.sum(fam.n_de_novos_raw)
        ),
        n_de_novos_adj=hl.agg.sum(fam.n_de_novos_adj),
        n_de_novo=hl.agg.sum(fam.n_de_novos_raw),
        n_trans_singletons=hl.agg.filter(
            ht.ac_raw == 2, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_untrans_singletons=hl.agg.filter(
            (ht.ac_raw < 3) & (ht.ac_qc_samples_unrelated_raw == 1),
            hl.agg.sum(fam.n_untransmitted_raw),
        ),
        n_train_trans_singletons=hl.agg.filter(
            (ht.ac_raw == 2) & ht.positive_train_site, hl.agg.sum(fam.n_transmitted_raw)
        ),
        n_omni=hl.agg.count_where(truth_data.omni),
        n_mills=hl.agg.count_where(truth_data.mills),
        n_hapmap=hl.agg.count_where(truth_data.hapmap),
        n_kgp_phase1_hc=hl.agg.count_where(truth_data.kgp_phase1_hc),
    )
def create_binned_data_initial(ht: hl.Table, data: str, data_type: str, n_bins: int) -> hl.Table:
    # Count variants for ranking
    count_expr = {x: hl.agg.filter(hl.is_defined(ht[x]), hl.agg.counter(hl.cond(hl.is_snp(
        ht.alleles[0], ht.alleles[1]), 'snv', 'indel'))) for x in ht.row if x.endswith('rank')}
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}")
    ht_truth_data = hl.read_table(
        f"{temp_dir}/ddd-elgh-ukbb/variant_qc/truthset_table.ht")
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)
    ht = ht.annotate(
        **ht_truth_data[ht.key],
        # **fam_ht[ht.key],
        # **gnomad_ht[ht.key],
        # **denovo_ht[ht.key],
        # clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length()-ht.alleles[1].length()),
        rank_bins=hl.array(
            [hl.Struct(
                rank_id=rank_name,
                bin=hl.int(hl.ceil(hl.float(ht[rank_name] + 1) / hl.floor(ht.globals.rank_variant_counts[rank_name][hl.cond(
                    hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv', 'indel')] / n_bins)))
            )
                for rank_name in rank_variant_counts]
        ),
        # lcr=hl.is_defined(lcr_intervals[ht.locus])
    )

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(
        rank_id=ht.rank_bins.rank_id,
        bin=ht.rank_bins.bin
    )
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'{tmp_dir}/gnomad_score_binning_tmp.ht', overwrite=True)

    # Create binned data
    return (
        ht
        .group_by(
            rank_id=ht.rank_id,
            contig=ht.locus.contig,
            snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
            bi_allelic=hl.is_defined(ht.biallelic_rank),
            singleton=ht.transmitted_singleton,
            trans_singletons=hl.is_defined(ht.singleton_rank),
            de_novo_high_quality=ht.de_novo_high_quality_rank,
            de_novo_medium_quality=hl.is_defined(
                ht.de_novo_medium_quality_rank),
            de_novo_synonymous=hl.is_defined(ht.de_novo_synonymous_rank),
            # release_adj=ht.ac > 0,
            bin=ht.bin
        )._set_buffer_size(20000)
        .aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(hl.is_transition(
                ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(hl.is_transversion(
                ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            # n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.transmitted_singleton),
            n_high_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.99),
            n_validated_DDD_denovos=hl.agg.count_where(
                ht.inheritance.contains("De novo")),
            n_medium_quality_de_novos=hl.agg.count_where(
                ht.de_novo_data.p_de_novo[0] > 0.5),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.de_novo_data.confidence[0] == 'HIGH'),
            n_de_novo=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.sum(
                ht.family_stats.mendel[0].errors)),
            n_high_quality_de_novos_synonymous=hl.agg.count_where(
                (ht.de_novo_data.p_de_novo[0] > 0.99) & (ht.consequence == "synonymous_variant")),
            # n_de_novo_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(ht.family_stats.unrelated_qc_callstats.AC[0][1] == 0, hl.agg.count_where(
                ht.family_stats.mendel[0].errors > 0)),
            # n_de_novo_sites_no_lcr=hl.agg.filter(~ht.lcr & (
            #    ht.family_stats.unrelated_qc_callstats.AC[1] == 0), hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_trans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].t)),
            n_untrans_singletons=hl.agg.filter((ht.ac_raw < 3) & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_untrans_singletons_synonymous=hl.agg.filter((ht.ac_raw < 3) & (ht.consequence == "synonymous_variant") & (
                ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1), hl.agg.sum(ht.family_stats.tdt[0].u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[0][1] == 1) & (ht.family_stats.tdt[0].t == 1)),
            n_omni=hl.agg.count_where(ht.omni),
            n_mills=hl.agg.count_where(ht.mills),
            n_hapmap=hl.agg.count_where(ht.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.kgp_phase1_hc),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            # n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            # n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)
        )
    )
def create_binned_data(ht: hl.Table, data: str, data_type: str,
                       n_bins: int) -> hl.Table:
    """
    Creates binned data from a rank Table grouped by rank_id (rank, biallelic, etc.), contig, snv, bi_allelic and singleton
    containing the information needed for evaluation plots.

    :param Table ht: Input rank table
    :param str data: Which data/run hash is being created
    :param str data_type: one of 'exomes' or 'genomes'
    :param int n_bins: Number of bins.
    :return: Binned Table
    :rtype: Table
    """

    # Count variants for ranking
    count_expr = {
        x: hl.agg.filter(
            hl.is_defined(ht[x]),
            hl.agg.counter(
                hl.cond(hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                        'indel')))
        for x in ht.row if x.endswith('rank')
    }
    rank_variant_counts = ht.aggregate(hl.Struct(**count_expr))
    logger.info(
        f"Found the following variant counts:\n {pformat(rank_variant_counts)}"
    )
    ht = ht.annotate_globals(rank_variant_counts=rank_variant_counts)

    # Load external evaluation data
    clinvar_ht = hl.read_table(clinvar_ht_path)
    denovo_ht = get_validated_denovos_ht()
    if data_type == 'exomes':
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_exomes.high_quality)
    else:
        denovo_ht = denovo_ht.filter(denovo_ht.gnomad_genomes.high_quality)
    denovo_ht = denovo_ht.select(
        validated_denovo=denovo_ht.validated,
        high_confidence_denovo=denovo_ht.Confidence == 'HIGH')
    ht_truth_data = hl.read_table(annotations_ht_path(data_type, 'truth_data'))
    fam_ht = hl.read_table(annotations_ht_path(data_type, 'family_stats'))
    fam_ht = fam_ht.select(family_stats=fam_ht.family_stats[0])
    gnomad_ht = get_gnomad_data(data_type).rows()
    gnomad_ht = gnomad_ht.select(
        vqsr_negative_train_site=gnomad_ht.info.NEGATIVE_TRAIN_SITE,
        vqsr_positive_train_site=gnomad_ht.info.POSITIVE_TRAIN_SITE,
        fail_hard_filters=(gnomad_ht.info.QD < 2) | (gnomad_ht.info.FS > 60) |
        (gnomad_ht.info.MQ < 30))
    lcr_intervals = hl.import_locus_intervals(lcr_intervals_path)

    ht = ht.annotate(
        **ht_truth_data[ht.key],
        **fam_ht[ht.key],
        **gnomad_ht[ht.key],
        **denovo_ht[ht.key],
        clinvar=hl.is_defined(clinvar_ht[ht.key]),
        indel_length=hl.abs(ht.alleles[0].length() - ht.alleles[1].length()),
        rank_bins=hl.array([
            hl.Struct(
                rank_id=rank_name,
                bin=hl.int(
                    hl.ceil(
                        hl.float(ht[rank_name] + 1) / hl.floor(
                            ht.globals.rank_variant_counts[rank_name][hl.cond(
                                hl.is_snp(ht.alleles[0], ht.alleles[1]), 'snv',
                                'indel')] / n_bins))))
            for rank_name in rank_variant_counts
        ]),
        lcr=hl.is_defined(lcr_intervals[ht.locus]))

    ht = ht.explode(ht.rank_bins)
    ht = ht.transmute(rank_id=ht.rank_bins.rank_id, bin=ht.rank_bins.bin)
    ht = ht.filter(hl.is_defined(ht.bin))

    ht = ht.checkpoint(
        f'gs://gnomad-tmp/gnomad_score_binning_{data_type}_tmp_{data}.ht',
        overwrite=True)

    # Create binned data
    return (ht.group_by(
        rank_id=ht.rank_id,
        contig=ht.locus.contig,
        snv=hl.is_snp(ht.alleles[0], ht.alleles[1]),
        bi_allelic=hl.is_defined(ht.biallelic_rank),
        singleton=ht.singleton,
        release_adj=ht.ac > 0,
        bin=ht.bin)._set_buffer_size(20000).aggregate(
            min_score=hl.agg.min(ht.score),
            max_score=hl.agg.max(ht.score),
            n=hl.agg.count(),
            n_ins=hl.agg.count_where(
                hl.is_insertion(ht.alleles[0], ht.alleles[1])),
            n_del=hl.agg.count_where(
                hl.is_deletion(ht.alleles[0], ht.alleles[1])),
            n_ti=hl.agg.count_where(
                hl.is_transition(ht.alleles[0], ht.alleles[1])),
            n_tv=hl.agg.count_where(
                hl.is_transversion(ht.alleles[0], ht.alleles[1])),
            n_1bp_indel=hl.agg.count_where(ht.indel_length == 1),
            n_mod3bp_indel=hl.agg.count_where((ht.indel_length % 3) == 0),
            n_clinvar=hl.agg.count_where(ht.clinvar),
            n_singleton=hl.agg.count_where(ht.singleton),
            n_validated_de_novos=hl.agg.count_where(ht.validated_denovo),
            n_high_confidence_de_novos=hl.agg.count_where(
                ht.high_confidence_denovo),
            n_de_novo=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.sum(ht.family_stats.mendel.errors)),
            n_de_novo_sites=hl.agg.filter(
                ht.family_stats.unrelated_qc_callstats.AC[1] == 0,
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_de_novo_sites_no_lcr=hl.agg.filter(
                ~ht.lcr & (ht.family_stats.unrelated_qc_callstats.AC[1] == 0),
                hl.agg.count_where(ht.family_stats.mendel.errors > 0)),
            n_trans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.t)),
            n_untrans_singletons=hl.agg.filter(
                (ht.info_ac < 3) &
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1),
                hl.agg.sum(ht.family_stats.tdt.u)),
            n_train_trans_singletons=hl.agg.count_where(
                (ht.family_stats.unrelated_qc_callstats.AC[1] == 1)
                & (ht.family_stats.tdt.t == 1)),
            n_omni=hl.agg.count_where(ht.truth_data.omni),
            n_mills=hl.agg.count_where(ht.truth_data.mills),
            n_hapmap=hl.agg.count_where(ht.truth_data.hapmap),
            n_kgp_high_conf_snvs=hl.agg.count_where(
                ht.truth_data.kgp_high_conf_snvs),
            fail_hard_filters=hl.agg.count_where(ht.fail_hard_filters),
            n_vqsr_pos_train=hl.agg.count_where(ht.vqsr_positive_train_site),
            n_vqsr_neg_train=hl.agg.count_where(ht.vqsr_negative_train_site)))
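
# A minimal usage sketch, not from the original example; the rank Table path,
# run hash, and output path below are hypothetical placeholders.
rank_ht = hl.read_table('gs://my-bucket/exomes_rank.ht')  # hypothetical path
binned_ht = create_binned_data(rank_ht, data='my_run_hash', data_type='exomes', n_bins=100)
binned_ht.write('gs://my-bucket/exomes_binned_scores.ht', overwrite=True)  # hypothetical path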
Esempio n. 31
0
    hl.set_global_seed(seed)

    # suffix for genotype simulation (empty string if using UKB data)
    gt_sim_suffix = (f'bn.npops_{n_pops}.nvars_{n_vars}.nsim_{n_sim}'
                     if sim_name[:3] == 'bn_' else '')
    param_suffix = f'{gt_sim_suffix}.h2_{h2}.pi_{pi}.K_{K}.seed_{seed}'
    betas_path = f'{smiles_wd}/betas.{param_suffix}.tsv.gz'
    phens_path = f'{smiles_wd}/phens.{param_suffix}.tsv.gz'

    if sim_name[:3] == 'bn_':
        mt = hl.balding_nichols_model(n_populations=n_pops,
                                      n_samples=n_sim,
                                      n_variants=n_vars,
                                      fst=fst)

        # remove invariant SNPs (keep rows with allele frequency strictly between 0 and 1)
        mt = mt.filter_rows(
            hl.abs(hl.agg.mean(mt.GT.n_alt_alleles()) / 2 - 0.5) < 0.5)
        mt = mt.annotate_cols(s=hl.str(mt.sample_idx))

        if hl.hadoop_is_file(betas_path) and hl.hadoop_is_file(phens_path):
            #            betas = hl.import_table(betas_path, impute=True, force=True)
            #            betas = betas.annotate(locus = hl.parse_locus(betas.locus),
            #                                   alleles = betas.alleles.replace('\[\"','').replace('\"\]','').split('\",\"'))
            #            betas = betas.key_by('locus','alleles')

            phens = hl.import_table(phens_path,
                                    key=['s'],
                                    types={'s': hl.tstr},
                                    impute=True,
                                    force=True)
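
            # Hedged continuation sketch, not from the original example: the imported
            # phenotype table would typically be joined back onto the simulated samples.
            # The `phen` field name is an assumption about the phens file schema.
            mt = mt.annotate_cols(phen=phens[mt.s].phen)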
Esempio n. 32
0
def main():
    args = parse_args()

    tables = []
    for i, path in enumerate(args.paths):

        ht = import_SJ_out_tab(path)
        ht = ht.key_by("chrom", "start_1based", "end_1based")

        if args.normalize_read_counts:
            ht = ht.annotate_globals(
                unique_reads_in_sample=ht.aggregate(hl.agg.sum(
                    ht.unique_reads)),
                multi_mapped_reads_in_sample=ht.aggregate(
                    hl.agg.sum(ht.multi_mapped_reads)),
            )

        # add 'interval' column
        #ht = ht.annotate(interval=hl.interval(
        #    hl.locus(ht.chrom, ht.start_1based, reference_genome=reference_genome),
        #    hl.locus(ht.chrom, ht.end_1based, reference_genome=reference_genome),))

        tables.append(ht)

    # compute mean
    if args.normalize_read_counts:
        mean_unique_reads_in_sample = sum(
            [hl.eval(ht.unique_reads_in_sample)
             for ht in tables]) / float(len(tables))
        mean_multi_mapped_reads_in_sample = sum(
            [hl.eval(ht.multi_mapped_reads_in_sample)
             for ht in tables]) / float(len(tables))
        print(
            f"mean_unique_reads_in_sample: {mean_unique_reads_in_sample:01f}, mean_multi_mapped_reads_in_sample: {mean_multi_mapped_reads_in_sample:01f}"
        )

    combined_ht = None
    for i, ht in enumerate(tables):
        print(f"Processing table #{i} out of {len(tables)}")

        if args.normalize_read_counts:
            unique_reads_multiplier = mean_unique_reads_in_sample / float(
                hl.eval(ht.unique_reads_in_sample))
            multi_mapped_reads_multiplier = mean_multi_mapped_reads_in_sample / float(
                hl.eval(ht.multi_mapped_reads_in_sample))
            print(
                f"unique_reads_multiplier: {unique_reads_multiplier:01f}, multi_mapped_reads_multiplier: {multi_mapped_reads_multiplier:01f}"
            )

        ht = ht.annotate(
            strand_counter=hl.or_else(
                hl.switch(ht.strand).when(1, 1).when(2, -1).or_missing(), 0),
            num_samples_with_this_junction=1,
        )

        if args.normalize_read_counts:
            ht = ht.annotate(
                unique_reads=hl.int32(ht.unique_reads *
                                      unique_reads_multiplier),
                multi_mapped_reads=hl.int32(ht.multi_mapped_reads *
                                            multi_mapped_reads_multiplier),
            )

        if combined_ht is None:
            combined_ht = ht
            continue

        print("----")
        print_stats(path, ht)

        combined_ht = combined_ht.join(ht, how="outer")
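        # the outer join renames the right table's overlapping row fields with a "_1" suffix;
        # transmute() below folds each pair back into a single combined field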
        combined_ht = combined_ht.transmute(
            strand=hl.or_else(
                combined_ht.strand, combined_ht.strand_1
            ),  ## in rare cases, the strand for the same junction may differ across samples, so use a 2-step process that assigns strand based on majority of samples
            strand_counter=hl.sum([
                combined_ht.strand_counter, combined_ht.strand_counter_1
            ]),  # samples vote on whether strand = 1 (eg. '+') or 2 (eg. '-')
            intron_motif=hl.or_else(combined_ht.intron_motif,
                                    combined_ht.intron_motif_1
                                    ),  ## double-check that left == right?
            known_splice_junction=hl.or_else(
                hl.cond((combined_ht.known_splice_junction == 1) |
                        (combined_ht.known_splice_junction_1 == 1), 1, 0),
                0),  ## double-check that left == right?
            unique_reads=hl.sum(
                [combined_ht.unique_reads, combined_ht.unique_reads_1]),
            multi_mapped_reads=hl.sum([
                combined_ht.multi_mapped_reads,
                combined_ht.multi_mapped_reads_1
            ]),
            maximum_overhang=hl.max(
                [combined_ht.maximum_overhang,
                 combined_ht.maximum_overhang_1]),
            num_samples_with_this_junction=hl.sum([
                combined_ht.num_samples_with_this_junction,
                combined_ht.num_samples_with_this_junction_1
            ]),
        )

        combined_ht = combined_ht.checkpoint(
            f"checkpoint{i % 2}.ht", overwrite=True)  #, _read_if_exists=True)

    total_junctions_count = combined_ht.count()
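    # a junction counts as a strand conflict when the net strand vote is weak:
    # |strand_counter| is less than 10% of the number of samples containing the junction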
    strand_conflicts_count = combined_ht.filter(
        hl.abs(combined_ht.strand_counter) /
        hl.float(combined_ht.num_samples_with_this_junction) < 0.1,
        keep=True).count()

    # set final strand value to 1 (e.g. '+'), 2 (e.g. '-'), or 0 (unknown), based on the strand assigned in the majority of samples
    combined_ht = combined_ht.annotate(
        strand=hl.case().when(combined_ht.strand_counter > 0, 1).when(
            combined_ht.strand_counter < 0, 2).default(0))

    combined_ht = combined_ht.annotate_globals(combined_tables=args.paths,
                                               n_combined_tables=len(
                                                   args.paths))

    if strand_conflicts_count:
        print(
            f"WARNING: Found {strand_conflicts_count} strand_conflicts out of {total_junctions_count} total_junctions"
        )

    # write as HT
    combined_ht = combined_ht.checkpoint(
        f"combined.SJ.out.ht", overwrite=True)  #, _read_if_exists=True)

    # write as TSV
    output_prefix = f"combined.{len(tables)}_samples{'.normalized_counts' if args.normalize_read_counts else ''}"
    combined_ht = combined_ht.key_by()
    combined_ht.export(f"{output_prefix}.with_header.combined.SJ.out.tab",
                       header=True)
    combined_ht = combined_ht.select(
        "chrom",
        "start_1based",
        "end_1based",
        "strand",
        "intron_motif",
        "known_splice_junction",
        "unique_reads",
        "multi_mapped_reads",
        "maximum_overhang",
    )
    combined_ht.export(f"{output_prefix}.SJ.out.tab", header=False)

    print(
        f"unique_reads_in combined table: {combined_ht.aggregate(hl.agg.sum(combined_ht.unique_reads))}"
    )
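
# Hedged usage note, not from the original example: parse_args() is defined elsewhere
# in the script. Given the attributes used above (args.paths, args.normalize_read_counts),
# it presumably takes one or more SJ.out.tab paths plus an optional flag to normalize
# read counts, e.g. (hypothetical script name and flag spelling):
#
#   python combine_sj_out_tables.py --normalize-read-counts sample1.SJ.out.tab sample2.SJ.out.tab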