Example #1
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    snp_chip = hl.read_matrix_table(SNP_CHIP).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in snp-chip data after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)
    ).select_cols()
    snp_chip = snp_chip.select_entries(snp_chip.GT).select_cols()
    snp_chip = snp_chip.key_cols_by(s=snp_chip.s + '_snp_chip')
    tob_combined = tob_wgs.union_cols(snp_chip)
    tob_combined = tob_combined.cache()
    print(tob_combined.count_rows())

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        tob_combined.GT, compute_loadings=True, k=20
    )
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
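All of these examples lean on a few names defined elsewhere in the project: input paths such as TOB_WGS and SNP_CHIP, the lgt_to_gt helper that turns sparse local genotype calls into global GT calls, and an output_path helper that prefixes results with the run's output location. A minimal sketch of plausible stand-ins follows; the paths and the output_path signature are assumptions, not taken from the source.

import hail as hl

# Hypothetical input paths; the real ones come from project configuration.
TOB_WGS = 'gs://example-bucket/tob_wgs/v1/sparse.mt'
SNP_CHIP = 'gs://example-bucket/snp_chip/v1/snp_chip.mt'


def lgt_to_gt(lgt, la):
    """Map a local genotype call (LGT) through the local-alleles array (LA)
    to a call against the global alleles; assumes diploid, unphased calls."""
    return hl.call(la[lgt[0]], la[lgt[1]])


def output_path(suffix, category='default'):
    """Hypothetical stand-in for the project's output_path helper."""
    return f'gs://example-bucket/outputs/{category}/{suffix}'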
Example #2
def query():
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    tob_wgs = hl.read_matrix_table(TOB_WGS)
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)

    # keep loci that are contained in the densified, filtered tob-wgs mt
    hgdp_1kg = hgdp_1kg.semi_join_rows(tob_wgs.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA)).select_cols()
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT).select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    # save this for population-level PCAs
    mt_path = output_path('hgdp1kg_tobwgs_joined_all_samples.mt')
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)

    # Perform PCA
    eigenvalues_path = output_path('eigenvalues.ht')
    scores_path = output_path('scores.ht')
    loadings_path = output_path('loadings.ht')
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    hl.Table.from_pandas(pd.DataFrame(eigenvalues)).export(eigenvalues_path)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
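A quick sanity check on this example's output, assuming the paths written above: read the scores back and attach the inferred population labels stored in the joined matrix table's column metadata.

scores = hl.read_table(output_path('scores.ht'))
joined = hl.read_matrix_table(
    output_path('hgdp1kg_tobwgs_joined_all_samples.mt'))
scores = scores.annotate(
    pop=joined.cols()[scores.s].hgdp_1kg_metadata.population_inference.pop)
scores.show(5)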
Example #3
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols((
            mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
                            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        mt.GT, compute_loadings=True, k=20)
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)

    # get TOB-WGS allele frequencies
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    tob_wgs = tob_wgs.annotate_rows(
        gt_stats=hl.agg.call_stats(tob_wgs.GT, tob_wgs.alleles))

    # Annotate the gnomAD loadings variants with gnomAD allele frequencies
    # (and the TOB-WGS frequency where the variant is present in TOB-WGS)
    loadings_gnomad = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')
    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    hgdp_1kg_row = hgdp_1kg.rows()[loadings_gnomad.locus,
                                   loadings_gnomad.alleles]
    tob_wgs_row = tob_wgs.rows()[loadings_gnomad.locus,
                                 loadings_gnomad.alleles]
    loadings_gnomad = loadings_gnomad.annotate(
        gnomad_AF=hgdp_1kg_row.gnomad_freq.AF,
        gnomad_popmax_AF=hgdp_1kg_row.gnomad_popmax.AF,
        TOB_WGS_AF=tob_wgs_row.gt_stats.AF,
    )
    population_af_metadata = hgdp_1kg.gnomad_freq_meta.collect()
    loadings_gnomad = loadings_gnomad.annotate_globals(
        gnomad_freq_meta=population_af_metadata)
    gnomad_variants = loadings_gnomad.drop('loadings')
    gnomad_variants_path = f'{output}/gnomad_annotated_variants.mt'
    gnomad_variants.write(gnomad_variants_path)
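These query functions are entry points, but the snippets do not show how the output and pop arguments reach them. A minimal command-line wrapper using click is one plausible wiring; the option names and defaults here are assumptions, not taken from the source.

import click


@click.command()
@click.option('--output', required=True, help='Path prefix for results, e.g. a gs:// location')
@click.option('--pop', default=None, help='Optional population label, e.g. "nfe"')
def main(output, pop):
    """Run the query."""
    query(output, pop)


if __name__ == '__main__':
    main()  # pylint: disable=no-value-for-parameter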
Example #4
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')

    # filter to loci that are contained in both matrix tables after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])

    # choose variants based off of gnomAD v3 parameters
    hgdp1kg_tobwgs_joined = hl.variant_qc(hgdp1kg_tobwgs_joined)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_rows(
        IB=hl.agg.inbreeding(hgdp1kg_tobwgs_joined.GT,
                             hgdp1kg_tobwgs_joined.variant_qc.AF[1]))
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        (hl.len(hgdp1kg_tobwgs_joined.alleles) == 2)
        & (hgdp1kg_tobwgs_joined.locus.in_autosome())
        & (hgdp1kg_tobwgs_joined.variant_qc.AF[1] > 0.01)
        & (hgdp1kg_tobwgs_joined.variant_qc.call_rate > 0.99)
        & (hgdp1kg_tobwgs_joined.IB.f_stat > -0.25))

    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.cache()
    nrows = hgdp1kg_tobwgs_joined.count_rows()
    print(f'hgdp1kg_tobwgs_joined.count_rows() = {nrows}')
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(
        NUM_ROWS_BEFORE_LD_PRUNE / nrows, seed=12345)

    pruned_variant_table = hl.ld_prune(hgdp1kg_tobwgs_joined.GT,
                                       r2=0.1,
                                       bp_window_size=500000)
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.filter_rows(
        hl.is_defined(pruned_variant_table[hgdp1kg_tobwgs_joined.row_key]))
    mt_path = f'{output}/tob_wgs_hgdp_1kg_filtered_variants.mt'
    hgdp1kg_tobwgs_joined.write(mt_path)
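This example also depends on a module-level NUM_ROWS_BEFORE_LD_PRUNE constant that is not shown; a plausible placeholder follows. Since MatrixTable.sample_rows expects a fraction between 0 and 1, the ratio used above should be capped when the joined dataset has fewer rows than the constant.

# Hypothetical value; the real constant is defined elsewhere in the project.
NUM_ROWS_BEFORE_LD_PRUNE = 200_000

# Defensive variant of the sampling step above:
#     fraction = min(NUM_ROWS_BEFORE_LD_PRUNE / nrows, 1.0)
#     hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.sample_rows(fraction, seed=12345)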
Example #5
def query(output):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    hgdp_1kg = hl.read_matrix_table(GNOMAD_HGDP_1KG_MT)
    tob_wgs = hl.read_matrix_table(TOB_WGS).key_rows_by('locus', 'alleles')
    loadings = hl.read_table(GNOMAD_LIFTOVER_LOADINGS).key_by(
        'locus', 'alleles')

    # filter to loci that are contained in both tables and the loadings after densifying
    tob_wgs = hl.experimental.densify(tob_wgs)
    hgdp_1kg = hgdp_1kg.filter_rows(
        hl.is_defined(loadings.index(hgdp_1kg['locus'], hgdp_1kg['alleles']))
        & hl.is_defined(
            tob_wgs.index_rows(hgdp_1kg['locus'], hgdp_1kg['alleles'])))
    tob_wgs = tob_wgs.semi_join_rows(hgdp_1kg.rows())

    # Entries and columns must be identical
    tob_wgs_select = tob_wgs.select_entries(
        GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    hgdp_1kg_select = hgdp_1kg.select_entries(hgdp_1kg.GT)
    hgdp_1kg_select = hgdp_1kg_select.select_cols()
    # Join datasets
    hgdp1kg_tobwgs_joined = hgdp_1kg_select.union_cols(tob_wgs_select)
    # Add in metadata information
    hgdp_1kg_metadata = hgdp_1kg.cols()
    hgdp1kg_tobwgs_joined = hgdp1kg_tobwgs_joined.annotate_cols(
        hgdp_1kg_metadata=hgdp_1kg_metadata[hgdp1kg_tobwgs_joined.s])
    mt_path = f'{output}/hgdp1kg_tobwgs_joined_all_samples.mt'
    if not hl.hadoop_exists(mt_path):
        hgdp1kg_tobwgs_joined.write(mt_path)
    hgdp1kg_tobwgs_joined = hl.read_matrix_table(mt_path)

    # Perform PCA
    eigenvalues_path = f'{output}/eigenvalues.csv'
    scores_path = f'{output}/scores.ht'
    loadings_path = f'{output}/loadings.ht'
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        hgdp1kg_tobwgs_joined.GT, compute_loadings=True, k=20)
    # save the list of eigenvalues
    eigenvalues_df = pd.DataFrame(eigenvalues)
    eigenvalues_df.to_csv(eigenvalues_path, index=False)
    # save the scores and loadings as a hail table
    scores.write(scores_path, overwrite=True)
    loadings.write(loadings_path, overwrite=True)
Example #6
def query(output, pop):  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    mt = hl.read_matrix_table(HGDP1KG_TOBWGS)
    if pop:
        # Get samples from the specified population only
        mt = mt.filter_cols((
            mt.hgdp_1kg_metadata.population_inference.pop == pop.lower())
                            | (mt.s.contains('TOB')))
    else:
        mt = mt.filter_cols(mt.s.contains('TOB'))

    mt = mt.annotate_rows(af=hl.agg.mean(mt.GT.n_alt_alleles()) / 2)
    loadings = hl.read_table(LOADINGS)
    loadings = loadings.annotate(af=mt.rows()[loadings.key].af)
    reprocessed_samples = hl.read_matrix_table(REPROCESSED_1KG)
    reprocessed_samples = hl.experimental.densify(reprocessed_samples)
    reprocessed_samples = reprocessed_samples.annotate_entries(
        GT=lgt_to_gt(reprocessed_samples.LGT, reprocessed_samples.LA))
    ht = pc_project(reprocessed_samples.GT, loadings.loadings, loadings.af)
    ht = ht.key_by(s=ht.s + '_reprocessed')
    pcs = hl.read_table(SCORES)
    union_scores = ht.union(pcs)
    union_scores = union_scores.annotate(
        original=(union_scores.s == 'HG02238')
        | (union_scores.s == 'NA12248')
        | (union_scores.s == 'NA20502'),
        reprocessed=union_scores.s.contains('reprocessed'),
    )
    expr = (
        hl.case()
        .when(union_scores.original & ~union_scores.reprocessed, 'original')
        .when(~union_scores.original & union_scores.reprocessed, 'reprocessed')
        .default('unedited')
    )
    union_scores = union_scores.annotate(cohort_sample_codes=expr)
    # get percentage of variance explained
    eigenvalues = hl.import_table(EIGENVALUES)
    eigenvalues = eigenvalues.to_pandas()
    eigenvalues.columns = ['eigenvalue']
    eigenvalues = pd.to_numeric(eigenvalues.eigenvalue)
    variance = eigenvalues.divide(float(eigenvalues.sum())) * 100
    variance = variance.round(2)

    # plot
    labels = union_scores.cohort_sample_codes
    sample_names = union_scores.s
    cohort_sample_codes = list(set(labels.collect()))
    tooltips = [('labels', '@label'), ('samples', '@samples')]
    for i in range(0, 10):
        pc1 = i
        pc2 = i + 1
        plot_filename = (f'{output}/reprocessed_sample_projection_pc' +
                         str(i + 1) + '.png')
        if not hl.hadoop_exists(plot_filename):
            plot = figure(
                title='Reprocessed Sample Projection',
                x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
                y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
                tooltips=tooltips,
            )
            source = ColumnDataSource(
                dict(
                    x=union_scores.scores[pc1].collect(),
                    y=union_scores.scores[pc2].collect(),
                    label=labels.collect(),
                    samples=sample_names.collect(),
                ))
            plot.circle(
                'x',
                'y',
                alpha=0.5,
                source=source,
                size=8,
                color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                                  cohort_sample_codes),
                legend_group='label',
            )
            plot.add_layout(plot.legend[0], 'left')
            with hl.hadoop_open(plot_filename, 'wb') as f:
                get_screenshot_as_png(plot).save(f, format='PNG')
            plot_filename_html = ('reprocessed_sample_projection_pc' +
                                  str(i + 1) + '.html')
            output_file(plot_filename_html)
            save(plot)
            subprocess.run(['gsutil', 'cp', plot_filename_html, output],
                           check=False)
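The two plotting examples (the one above and the one below) rely on Bokeh, a few standard-library modules, and the pc_project helper, none of which are imported in the snippets. A plausible import block follows, assuming pc_project is Hail's experimental implementation; Bokeh module paths may differ between versions.

import re
import subprocess

import pandas as pd
import hail as hl
from hail.experimental import pc_project  # pc_project(call_expr, loadings_expr, af_expr)

from bokeh.embed import file_html
from bokeh.io.export import get_screenshot_as_png
from bokeh.models import ColumnDataSource
from bokeh.palettes import Dark2
from bokeh.plotting import figure, output_file, save
from bokeh.resources import CDN
from bokeh.transform import factor_cmap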
Example #7
def query():  # pylint: disable=too-many-locals
    """Query script entry point."""

    hl.init(default_reference='GRCh38')

    snp_chip = hl.read_matrix_table(SNP_CHIP)
    tob_wgs = hl.read_matrix_table(TOB_WGS)
    tob_wgs = hl.experimental.densify(tob_wgs)
    tob_wgs = tob_wgs.annotate_entries(GT=lgt_to_gt(tob_wgs.LGT, tob_wgs.LA))
    snp_chip = snp_chip.semi_join_rows(tob_wgs.rows())
    snp_chip_path = output_path('snp_chip_filtered_by_tob_wgs.mt', 'tmp')
    snp_chip = snp_chip.checkpoint(snp_chip_path)

    # Perform PCA
    eigenvalues, scores, loadings = hl.hwe_normalized_pca(
        snp_chip.GT, compute_loadings=True, k=5)

    scores_path = output_path('scores.ht', 'tmp')
    loadings_path = output_path('loadings.ht', 'tmp')
    scores = scores.checkpoint(scores_path)
    loadings = loadings.checkpoint(loadings_path)

    # make tob_wgs rows equivalent to the snp_chip rows
    tob_wgs = tob_wgs.semi_join_rows(snp_chip.rows())
    tob_wgs_path = output_path('tob_wgs_filtered_by_snp_chip.mt', 'tmp')
    tob_wgs = tob_wgs.checkpoint(tob_wgs_path)
    snp_chip = snp_chip.annotate_rows(
        af=hl.agg.mean(snp_chip.GT.n_alt_alleles()) / 2)
    loadings = loadings.annotate(af=snp_chip.rows()[loadings.key].af)
    # project WGS samples onto PCA
    ht = pc_project(tob_wgs.GT, loadings.loadings, loadings.af)
    ht_path = output_path('pc_project_tob_wgs.ht', 'tmp')
    ht = ht.checkpoint(ht_path)
    scores = scores.key_by(s=scores.s + '_snp_chip')
    union_scores = ht.union(scores)
    variance = [(x / sum(eigenvalues) * 100) for x in eigenvalues]
    variance = [round(x, 2) for x in variance]

    # Get partner sample information
    sample_names = union_scores.s.collect()

    def sample_type(sample_name):
        if sample_name.endswith('snp_chip'):
            partner_name = re.sub('_snp_chip', '', sample_name)
            tech = 'snp'
        else:
            partner_name = sample_name + '_snp_chip'
            tech = 'wgs'

        if partner_name in sample_names:
            prefix = 'dual_'
        else:
            prefix = ''

        return prefix + tech

    # plot
    labels = list(map(sample_type, sample_names))
    cohort_sample_codes = list(set(labels))
    tooltips = [('labels', '@label'), ('samples', '@samples')]

    # Get number of PCs
    number_of_pcs = len(eigenvalues)

    for i in range(0, (number_of_pcs - 1)):
        pc1 = i
        pc2 = i + 1
        plot = figure(
            title='TOB-WGS + TOB SNP Chip',
            x_axis_label=f'PC{pc1 + 1} ({variance[pc1]}%)',
            y_axis_label=f'PC{pc2 + 1} ({variance[pc2]}%)',
            tooltips=tooltips,
        )
        source = ColumnDataSource(
            dict(
                x=union_scores.scores[pc1].collect(),
                y=union_scores.scores[pc2].collect(),
                label=labels,
                samples=sample_names,
            ))
        source_filename = output_path(f'source_{pc2}.ht', 'tmp')
        hl.Table.from_pandas(pd.DataFrame(source.data)).export(source_filename)
        plot.circle(
            'x',
            'y',
            alpha=0.5,
            source=source,
            size=8,
            color=factor_cmap('label', Dark2[len(cohort_sample_codes)],
                              cohort_sample_codes),
            legend_group='label',
        )
        plot.add_layout(plot.legend[0], 'left')
        plot_filename = output_path(f'pc{pc2}.png', 'web')
        with hl.hadoop_open(plot_filename, 'wb') as f:
            get_screenshot_as_png(plot).save(f, format='PNG')
        html = file_html(plot, CDN, 'my plot')
        plot_filename_html = output_path(f'pc{pc2}.html', 'web')
        with hl.hadoop_open(plot_filename_html, 'w') as f:
            f.write(html)