Example #1
def process_intersections(data_dicts,
                          save_path,
                          item_key='item',
                          is_rewrite=True):
    cpg_dicts = get_cpg_dicts(data_dicts, item_key)

    for dataset, data_dict in data_dicts.items():
        save_table_dict_xlsx(f'{save_path}/{dataset}', data_dict, is_rewrite)

    sets, sets_with_difference = get_sets(data_dicts, item_key)

    save_dicts = get_cpg_dataset_save_dicts(sets, data_dicts, cpg_dicts,
                                            item_key)
    curr_save_path = f'{save_path}/intersection_full'
    if not os.path.exists(curr_save_path):
        os.makedirs(curr_save_path)
    for key, save_dict in save_dicts.items():
        save_table_dict_xlsx(f'{curr_save_path}/{key}', save_dict, is_rewrite)

    save_dicts_with_diff = get_cpg_dataset_save_dicts(sets_with_difference,
                                                      data_dicts, cpg_dicts,
                                                      item_key)
    curr_save_path = f'{save_path}/intersection_diff'
    if not os.path.exists(curr_save_path):
        os.makedirs(curr_save_path)
    venn_labels = []
    for key, save_dict in save_dicts_with_diff.items():
        save_table_dict_xlsx(f'{curr_save_path}/{key}', save_dict, is_rewrite)
        curr_labels = key.split('_') + [str(len(sets_with_difference[key]))]
        venn_labels.append('<br>'.join(curr_labels))

    if len(data_dicts) == 4:
        layout = get_layout_4()
        trace = get_trace_4(venn_labels)
    elif len(data_dicts) == 3:
        layout = get_layout_3()
        trace = get_trace_3(venn_labels)
    elif len(data_dicts) == 2:
        layout = get_layout_2()
        trace = get_trace_2(venn_labels)
    else:
        raise ValueError(f'Venn diagram for {len(data_dicts)} datasets is not supported')

    fig = {
        'data': [trace],
        'layout': layout,
    }

    save_figure(f'{save_path}/venn', fig)

    return save_dicts, save_dicts_with_diff
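Note: get_cpg_dicts, get_sets, get_cpg_dataset_save_dicts and the get_layout_*/get_trace_* builders are project helpers that are not shown in this example. As a rough, self-contained sketch of the set arithmetic get_sets presumably performs (shared items per dataset combination, plus the "exclusive" variant used for the Venn labels), assuming each data_dict carries a plain list of identifiers under item_key:

import itertools

def sketch_get_sets(data_dicts, item_key='item'):
    # data_dicts is assumed to look like {dataset: {item_key: [ids, ...], ...}}
    per_dataset = {name: set(d[item_key]) for name, d in data_dicts.items()}
    names = list(per_dataset)
    sets, sets_with_difference = {}, {}
    for r in range(1, len(names) + 1):
        for combo in itertools.combinations(names, r):
            key = '_'.join(combo)
            common = set.intersection(*(per_dataset[n] for n in combo))
            rest = set().union(*(per_dataset[n] for n in names if n not in combo))
            sets[key] = common                          # shared by every dataset in combo
            sets_with_difference[key] = common - rest   # shared by combo and nothing else
    return sets, sets_with_difference

The '_'-joined keys would then match the key.split('_') calls used above to build the Venn labels.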
Example #2
from paper.routines.infrastructure.load.table import load_table_dict_xlsx, load_table_dict_pkl
from paper.routines.infrastructure.save.table import save_table_dict_xlsx, save_table_dict_pkl
from statsmodels.stats.multitest import multipletests

path = 'E:/YandexDisk/Work/pydnameth/unn_epic/bop/table/manova/3c48cd40ad58b06cc3b1f27e3c72554c'
fn = 'ABC'
target_metrics = ['Sample_Group_p_value_roy_3c48cd40']
limit = 0.05

table = {}
table['Number of'] = ['BoPs', 'Genes']

curr_fn = f'{path}/{fn}.xlsx'
data = load_table_dict_xlsx(curr_fn)

for metric in target_metrics:

    reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
        data[metric], limit, method='fdr_bh')
    data[f'{metric}_fdr_bh'] = pvals_corr

    reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
        data[metric], limit, method='bonferroni')
    data[f'{metric}_bonferroni'] = pvals_corr

save_table_dict_xlsx(f'{path}/{fn}_mod', data)
save_table_dict_pkl(f'{path}/{fn}_mod', data)
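For reference, multipletests returns a 4-tuple (reject flags, corrected p-values, Sidak alpha, Bonferroni alpha); only the corrected p-values are kept above. A minimal standalone illustration with made-up p-values:

import numpy as np
from statsmodels.stats.multitest import multipletests

pvals = np.array([0.001, 0.02, 0.04, 0.30, 0.75])   # toy p-values
_, pvals_bh, _, _ = multipletests(pvals, alpha=0.05, method='fdr_bh')
_, pvals_bonf, _, _ = multipletests(pvals, alpha=0.05, method='bonferroni')
print(pvals_bh)     # Benjamini-Hochberg (FDR) adjusted p-values
print(pvals_bonf)   # Bonferroni adjusted p-values, i.e. min(p * n, 1)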
Example #3
papers_keys = ['inoshita', 'singmann', 'yousefi']

path = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni'

data_dicts_passed = {}
cpgs_dicts_passed = {}
R2s = {}
R2_percentiles = {}

data_dict = load_table_dict_xlsx(f'{path}/{name}.xlsx')

for key in annotations_keys:
    data_dict[key] = []
for key in papers_keys:
    data_dict[key] = []

annotations_dict = load_annotations_dict()
papers_dict = load_papers_dict()

for cpg in tqdm(data_dict[cpg_key], desc='intersection processing'):
    for key in annotations_keys:
        data_dict[key].append(annotations_dict[key][cpg])

    for paper_key in papers_keys:
        if cpg in papers_dict[paper_key]:
            data_dict[paper_key].append(1)
        else:
            data_dict[paper_key].append(0)

save_table_dict_xlsx(f'{path}/{name}_with_added_info', data_dict)
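The loop above attaches annotation columns and a 0/1 membership flag per paper to every CpG (name, cpg_key and annotations_keys are defined outside this excerpt). The same indicator pattern in a self-contained form, with invented CpG ids standing in for the real annotation and paper dictionaries:

# Hypothetical stand-ins for load_annotations_dict() / load_papers_dict()
data_dict = {'item': ['cg0001', 'cg0002', 'cg0003']}
papers_dict = {
    'inoshita': {'cg0001'},
    'singmann': {'cg0002', 'cg0003'},
    'yousefi': set(),
}

for paper_key in papers_dict:
    data_dict[paper_key] = [1 if cpg in papers_dict[paper_key] else 0
                            for cpg in data_dict['item']]

print(data_dict['singmann'])   # [0, 1, 1]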
Example #4
def get_human_plasma_proteome_dicts(save_path):

    lehallier_data_path = f'{get_data_path()}/human_plasma_proteome'
    fn = lehallier_data_path + '/' + 'proteins_genes.xlsx'
    proteins_genes_data_dict = load_table_dict_xlsx(fn)

    id_gene = {}
    gene_id = {}
    suspect_rows = []
    suspect_ids = []
    for row_id in tqdm(range(0, len(proteins_genes_data_dict['ID']))):
        id = proteins_genes_data_dict['ID'][row_id]
        gene = proteins_genes_data_dict['EntrezGeneSymbol'][row_id]

        if gene in gene_id:
            gene_id[gene].append(id)
        else:
            gene_id[gene] = [id]

        if id in id_gene:
            suspect_rows.append(row_id)
            suspect_ids.append(id)
        if isinstance(gene, str):
            id_gene[id] = gene
        else:
            suspect_rows.append(row_id)
            suspect_ids.append(id)
    # shift to 1-based Excel row numbers (assuming one header row in the source sheet)
    suspect_rows = [x + 2 for x in suspect_rows]
    np.savetxt(f'{save_path}/suspect_rows.txt', suspect_rows, fmt='%d')
    np.savetxt(f'{save_path}/suspect_ids.txt', suspect_ids, fmt='%s')

    fn = lehallier_data_path + '/' + 'age_sex.xlsx'
    age_sex_data_dict = load_table_dict_xlsx(fn)
    id_age_q = {}
    id_sex_q = {}

    for row_id in range(0, len(age_sex_data_dict['ID'])):
        id = age_sex_data_dict['ID'][row_id]
        age_q = age_sex_data_dict['q.Age'][row_id]
        sex_q = age_sex_data_dict['q.Sex'][row_id]

        id_age_q[id] = age_q
        id_sex_q[id] = sex_q

    ar_genes_lehallier = []
    ss_genes_lehallier = []
    ssar_genes_lehallier = []
    for id, gene in id_gene.items():
        if id_age_q[id] < 0.05:
            ar_genes_lehallier.append(gene)
        if id_sex_q[id] < 0.05:
            ss_genes_lehallier.append(gene)
        if id_age_q[id] < 0.05 and id_sex_q[id] < 0.05:
            ssar_genes_lehallier.append(gene)

    print(
        f'Number of ss genes in Lehallier et al.: {len(ss_genes_lehallier)}')
    print(
        f'Number of UNIQUE ss genes in Lehallier et al.: {len(set(ss_genes_lehallier))}'
    )
    genes_duplicates = [
        item
        for item, count in collections.Counter(ss_genes_lehallier).items()
        if count > 1
    ]
    genes_duplicates_str = {'id': [], 'gene': []}
    for gene in genes_duplicates:
        ids = gene_id[gene]
        for id in ids:
            genes_duplicates_str['id'].append(id)
            genes_duplicates_str['gene'].append(gene)
    save_table_dict_xlsx(f'{save_path}/duplicates_ss', genes_duplicates_str)

    print(
        f'Number of ar genes in Lehallier et al.: {len(ar_genes_lehallier)}')
    print(
        f'Number of UNIQUE ar genes in Lehallier et al.: {len(set(ar_genes_lehallier))}'
    )
    genes_duplicates = [
        item
        for item, count in collections.Counter(ar_genes_lehallier).items()
        if count > 1
    ]
    genes_duplicates_str = {'id': [], 'gene': []}
    for gene in genes_duplicates:
        ids = gene_id[gene]
        for id in ids:
            genes_duplicates_str['id'].append(id)
            genes_duplicates_str['gene'].append(gene)
    save_table_dict_xlsx(f'{save_path}/duplicates_ar', genes_duplicates_str)

    print(
        f'Number of ssar genes in Lehallier et al.: {len(ssar_genes_lehallier)}'
    )
    print(
        f'Number of UNIQUE ssar genes in Lehallier et al.: {len(set(ssar_genes_lehallier))}'
    )
    genes_duplicates = [
        item
        for item, count in collections.Counter(ssar_genes_lehallier).items()
        if count > 1
    ]
    genes_duplicates_str = {'id': [], 'gene': []}
    for gene in genes_duplicates:
        ids = gene_id[gene]
        for id in ids:
            genes_duplicates_str['id'].append(id)
            genes_duplicates_str['gene'].append(gene)
    save_table_dict_xlsx(f'{save_path}/duplicates_ssar', genes_duplicates_str)

    return ss_genes_lehallier, ar_genes_lehallier, ssar_genes_lehallier
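The three duplicate-report blocks above differ only in the input gene list and the output file name; a hedged refactoring of that pattern into one helper (gene_id maps a gene symbol to the list of ids it was seen with, as built at the top of the function):

import collections

def report_duplicates(genes, gene_id):
    # Two-column table of genes that occur more than once, with all of their ids
    duplicates = [g for g, n in collections.Counter(genes).items() if n > 1]
    table = {'id': [], 'gene': []}
    for gene in duplicates:
        for some_id in gene_id[gene]:
            table['id'].append(some_id)
            table['gene'].append(gene)
    return table

# e.g. save_table_dict_xlsx(f'{save_path}/duplicates_ss',
#                           report_duplicates(ss_genes_lehallier, gene_id))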
Example #5
datasets = ['GSE40279', 'GSE87571', 'EPIC', 'GSE55763']

for dataset in datasets:

    print(dataset)

    source_fn = f'E:/YandexDisk/Work/pydnameth/approaches/ancova/Treatment/{dataset}.xlsx'
    source_keys = ['x:category_pval', 'x:category']

    target_fn = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni/ssDMPs_ext.xlsx'
    target_main_key = 'MarkerName'
    target_keys = [
        f'interaction p-value ({dataset})', f'interaction coeff ({dataset})'
    ]

    save_fn = 'E:/YandexDisk/Work/pydnameth/draft/fixes/materials_and_methods/update_4_bonferroni/ssDMPs_ext'

    source_dict = load_table_dict_by_key_xlsx(source_fn, 'item')
    target_dict = load_table_dict_xlsx(target_fn)

    for key in target_keys:
        target_dict[key] = []
    for item in target_dict[target_main_key]:
        for key_id, key in enumerate(target_keys):
            if item in source_dict[source_keys[key_id]]:
                target_dict[key].append(source_dict[source_keys[key_id]][item])
            else:
                target_dict[key].append('NA')

    save_table_dict_xlsx(save_fn, target_dict)
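load_table_dict_by_key_xlsx presumably returns each column as a dict keyed by the 'item' column, which is why the membership test and lookup above both go through source_dict[column][item]. A toy version of that merge, with fabricated values and a hypothetical column name:

source_dict = {'x:category_pval': {'cg0001': 0.01, 'cg0002': 0.20}}
target_dict = {'MarkerName': ['cg0001', 'cg0002', 'cg0003']}

column = 'interaction p-value (GSE40279)'
target_dict[column] = [
    source_dict['x:category_pval'].get(item, 'NA')
    for item in target_dict['MarkerName']
]
print(target_dict[column])   # [0.01, 0.2, 'NA']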
Example #6
def process_human_plasma_proteome(target_dict,
                                  proteomic_genes,
                                  save_path,
                                  aux_key='aux'):

    fn_exp = 'E:/YandexDisk/Work/pydnameth/human_plasma_proteome/GTEx'
    exp_dict = load_table_dict(fn_exp)
    for tissue in exp_dict:
        if tissue not in ['Name', 'Description']:
            exp_dict[tissue] = np.log10(np.asarray(exp_dict[tissue]))

    genes = {}
    for dataset in target_dict:
        for key in target_dict[dataset]:
            if 'aux_' in key:
                aux_key = key
                break
        genes[dataset] = {'gene': get_genes(target_dict[dataset], aux_key)}

    genes['Proteomic'] = {'gene': proteomic_genes}

    for dataset in genes:
        tmp_key = 'gene'
        print(f'num genes in {dataset}: {len(genes[dataset][tmp_key])}')

    sets, sets_with_difference = get_sets(genes, item_key='gene')

    curr_save_path = f'{save_path}/intersection_full'
    if not os.path.exists(curr_save_path):
        os.makedirs(curr_save_path)
    for set_key in sets:
        save_dict = {}
        for metrics_key in ['gene']:
            save_dict[metrics_key] = []
        for i in sets[set_key]:
            save_dict['gene'].append(i)
        save_table_dict_xlsx(f'{curr_save_path}/{set_key}', save_dict)
        gtex_processing(exp_dict, sets[set_key], set_key, curr_save_path)

    curr_save_path = f'{save_path}/intersection_diff'
    if not os.path.exists(curr_save_path):
        os.makedirs(curr_save_path)
    venn_labels = []
    for set_key in sets_with_difference:
        save_dict = {}
        for metrics_key in ['gene']:
            save_dict[metrics_key] = []
        for i in sets_with_difference[set_key]:
            save_dict['gene'].append(i)
        save_table_dict_xlsx(f'{curr_save_path}/{set_key}', save_dict)
        curr_labels = set_key.split('_') + [
            str(len(sets_with_difference[set_key]))
        ]
        venn_labels.append('<br>'.join(curr_labels))

    if len(genes) == 4:
        layout = get_layout_4()
        trace = get_trace_4(venn_labels)
    elif len(genes) == 3:
        layout = get_layout_3()
        trace = get_trace_3(venn_labels)
    elif len(genes) == 2:
        layout = get_layout_2()
        trace = get_trace_2(venn_labels)
    else:
        raise ValueError(f'Venn diagram for {len(genes)} datasets is not supported')

    fig = {
        'data': [trace],
        'layout': layout,
    }

    save_figure(f'{save_path}/venn', fig)
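np.log10 above will produce -inf (and a runtime warning) for any zero TPM value in the GTEx table; if zeros can occur, a defensive variant is to mask them, for example:

import numpy as np

tpm = np.asarray([0.0, 0.5, 12.0])
log_tpm = np.log10(tpm, where=tpm > 0, out=np.full_like(tpm, np.nan, dtype=float))
print(log_tpm)   # [nan, -0.301..., 1.079...]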
Example #7
def gtex_processing(exp_dict, genes, main_key, save_path, is_plot=False):

    gene_id_dict = dict(
        zip(exp_dict['Description'],
            list(range(0, len(exp_dict['Description'])))))

    result_dict = {key: [] for key in exp_dict}
    for gene in genes:
        if gene in gene_id_dict:
            row_id = gene_id_dict[gene]
            for key in result_dict:
                result_dict[key].append(exp_dict[key][row_id])

    save_table_dict_xlsx(f'{save_path}/{main_key}_expression', result_dict)

    if is_plot:

        target_keys = ['Whole Blood', 'Liver', 'Brain - Frontal Cortex (BA9)']
        plot_data = []
        for t_id, tissue in enumerate(target_keys):
            if len(result_dict[tissue]) > 0:
                xs, ys = get_pdf_x_and_y(result_dict[tissue], num_bins=50)
                color = cl.scales['8']['qual']['Set1'][t_id]
                coordinates = color[4:-1].split(',')
                color_border = 'rgba(' + ','.join(coordinates) + ',' + str(
                    0.8) + ')'
                scatter = go.Scatter(x=xs,
                                     y=ys,
                                     name=tissue,
                                     mode='lines',
                                     line=dict(width=4, color=color_border),
                                     showlegend=True)
                plot_data.append(scatter)
        layout = get_layout('$log_{10}GTEX$', 'Probability density function')
        fn = f'{save_path}/{main_key}'
        figure = go.Figure(data=plot_data, layout=layout)
        plotly.offline.plot(figure,
                            filename=f'{fn}.html',
                            auto_open=False,
                            show_link=True)
        pio.write_image(figure, f'{fn}.png')
        pio.write_image(figure, f'{fn}.pdf')

        traces = []
        base_order = []

        color_scales = [
            px.colors.sequential.Reds[2:-2], px.colors.sequential.Blues[2:-2],
            px.colors.sequential.Greens[2:-2]
        ]
        for t_id, tissue in enumerate(target_keys):
            if len(result_dict[tissue]) > 0:
                target_genes = result_dict['Description']
                target_exp = result_dict[tissue]
                if t_id == 0:
                    base_order = np.argsort(target_exp)[::-1]
                genes_sorted = list(np.array(target_genes)[base_order])
                exp_sorted = list(np.array(target_exp)[base_order])

                traces.append(
                    go.Bar(orientation='h',
                           name=tissue,
                           y=genes_sorted,
                           x=[x + 4 for x in exp_sorted],
                           base=-4,
                           marker=dict(
                               color=[x + 4 for x in exp_sorted],
                               colorscale=color_scales[t_id],
                               colorbar=dict(
                                   showticklabels=False,
                                   len=1,
                                   x=1 + 0.1 * t_id,
                                   title=dict(
                                       text=tissue.replace(' ', '<br>'),
                                       font=dict(size=12,
                                                 color=color_scales[t_id][-1]),
                                       side='right'),
                               ),
                               showscale=True)))

        layout = go.Layout(
            plot_bgcolor='rgba(233,233,233,0)',
            barmode='overlay',
            showlegend=False,
            autosize=False,
            margin=go.layout.Margin(l=10, r=10, b=10, t=10, pad=0),
            height=15 * (len(base_order) + 1),
            width=1000,
            xaxis=dict(
                gridcolor='rgb(100, 100, 100)',
                #gridwidth=0.01,
                mirror=True,
                linecolor='black',
                title='$log_{10}GTEX$',
                autorange=False,
                range=[-4, 5],
                showgrid=False,
                showline=True,
                titlefont=dict(family='Arial', size=30, color='black'),
                showticklabels=True,
                tickangle=0,
                tickfont=dict(family='Arial', size=15, color='black'),
                exponentformat='e',
                showexponent='all',
            ),
            yaxis=dict(
                gridcolor='rgb(100, 100, 100)',
                mirror=True,
                linecolor='black',
                autorange=True,
                showgrid=False,
                showline=True,
                tickangle=0,
                titlefont=dict(family='Arial', size=10, color='black'),
                showticklabels=True,
                tickfont=dict(family='Arial', size=10, color='black'),
                exponentformat='e',
                showexponent='all',
            ),
        )

        traces = traces[::-1]

        fn = f'{save_path}/{main_key}_combo'
        fig = go.Figure(data=traces, layout=layout)
        fig.update_layout(barmode='group')
        plotly.offline.plot(fig,
                            filename=fn + '.html',
                            auto_open=False,
                            show_link=True)
        pio.write_image(fig, fn + '.png')
        pio.write_image(fig, fn + '.pdf')
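The horizontal bars above encode log-expression in the range [-4, 5] by shifting every value by +4 and anchoring the bars at base=-4, so bar lengths stay positive while the axis still reads in the original log units. That trick in isolation, with toy values and the standard plotly API:

import plotly.graph_objs as go

values = [-2.5, 0.0, 3.1]                      # e.g. log10 expression
offset = 4                                     # lower bound of the plotted range
bar = go.Bar(orientation='h',
             y=['geneA', 'geneB', 'geneC'],
             x=[v + offset for v in values],   # bar lengths, all positive
             base=-offset)                     # bars start at x = -4
fig = go.Figure(data=[bar])
fig.update_xaxes(range=[-offset, 5], autorange=False)
# pio.write_image(fig, 'bars.png') additionally requires a static-image backend (orca or kaleido)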
Example #8
        data_dict = load_table_dict_xlsx(f'{path}/{dataset}.xlsx')

        data_dict_passed = {}
        for key in data_dict:
            data_dict_passed[key] = []

        num_cpgs = len(data_dict[cpg_key])

        for cpg_id in tqdm(range(0, num_cpgs), desc=f'{dataset} processing'):
            is_passed = check_condition(data_dict[area_criteria_key][cpg_id],
                                        data_dict[slope_criteria_key][cpg_id])
            if is_passed:
                for key in data_dict:
                    data_dict_passed[key].append(data_dict[key][cpg_id])

        save_table_dict_xlsx(f'{path}/{dataset}_passed', data_dict_passed)

    data_dicts_passed[dataset] = data_dict_passed
    cpgs_dicts_passed[dataset] = data_dict_passed[cpg_key]

datasets_ids = list(range(0, len(datasets)))
keys_ordered = copy.deepcopy(datasets)
sets = {}
checking = {}
for dataset in datasets:
    sets[dataset] = set(cpgs_dicts_passed[dataset])
    checking[dataset] = 0

for L in range(2, len(datasets) + 1):
    for subset in itertools.combinations(datasets_ids, L):
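The body of the combinations loop is truncated in this excerpt; judging from the sets and checking dictionaries built just above, it most likely intersects the CpG sets of every dataset combination. A self-contained sketch of that kind of loop, with toy data in place of the real CpG sets:

import itertools

datasets = ['GSE40279', 'GSE87571', 'EPIC']
sets = {'GSE40279': {'cg1', 'cg2', 'cg3'},
        'GSE87571': {'cg2', 'cg3'},
        'EPIC': {'cg3', 'cg4'}}

for L in range(2, len(datasets) + 1):
    for subset in itertools.combinations(range(len(datasets)), L):
        names = [datasets[i] for i in subset]
        common = set.intersection(*(sets[n] for n in names))
        print('_'.join(names), len(common))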
Example #9
    result['corr_coeff'].append(corr_coeff)
    result['p_value'].append(p_value)

    result['item'].append(cpg)
    aux = ''
    if cpg in config_unn.cpg_gene_dict:
        aux = ';'.join(config_unn.cpg_gene_dict[cpg])
    result['aux_unn'].append(aux)
    aux = ''
    if cpg in config_other.cpg_gene_dict:
        aux = ';'.join(config_other.cpg_gene_dict[cpg])
    result['aux_other'].append(aux)

pvals = np.asarray(result['p_value'])
reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
    pvals,
    0.05,
    method='fdr_bh'
)
result['p_value_benjamini_hochberg'] = pvals_corr

reject, pvals_corr, alphacSidak, alphacBonf = multipletests(
    pvals,
    0.05,
    method='bonferroni'
)
result['p_value_bonferroni'] = pvals_corr

save_table_dict_xlsx(f'{save_path}/pbc_vs_GSE87571', result)
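corr_coeff and p_value are computed outside this excerpt; a typical source for such a per-CpG pair would be scipy.stats.pearsonr or spearmanr, shown here only as an illustrative assumption:

import numpy as np
from scipy import stats

x = np.array([1.0, 2.0, 3.0, 4.0, 5.0])
y = np.array([1.1, 1.9, 3.2, 3.9, 5.2])
corr_coeff, p_value = stats.pearsonr(x, y)
print(corr_coeff, p_value)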
Example #10
        cpg_map_info_dict = subset['cpg_map_info_dict']

        cpg_dict = {x: 0 for x in cpg_list}

        table_dict = load_table_dict_xlsx(
            f'{path}/{dataset}/{dataset}_{data_type}.xlsx')

        filtered = {x: [] for x in table_dict.keys()}
        for cpg_id, cpg in tqdm(enumerate(table_dict['CpG'])):
            if cpg in cpg_dict:
                for key in table_dict:
                    filtered[key].append(table_dict[key][cpg_id])

        filtered = add_info_to_dict(filtered)

        save_table_dict_xlsx(
            f'{path}/{dataset}/{dataset}_{data_type}_filtered', filtered)
        save_table_dict_pkl(f'{path}/{dataset}/{dataset}_{data_type}_filtered',
                            filtered)

    all_data[dataset] = filtered

    transforms = {x: 'lin' for x in metrics}

    values[dataset] = {}
    xs[dataset] = {}
    ys[dataset] = {}
    for key in metrics:
        if 'P.Val' in key:
            values[dataset][key] = -np.log10(
                np.asarray(filtered[key])[np.nonzero(filtered[key])])
            labels[key] = f'-log({key})'
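The np.nonzero mask above drops zero p-values before the -log10 transform (log10(0) would be -inf). The same filter in a standalone form:

import numpy as np

pvals = np.asarray([0.0, 1e-8, 0.03, 0.5])
mask = np.nonzero(pvals)            # indices of the non-zero entries
neg_log_p = -np.log10(pvals[mask])
print(neg_log_p)                    # approximately [8.0, 1.52, 0.30]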
Example #11
        else:
            if opt != '':
                count_global_mod[opt] = count_global[opt]
                count_target_mod[opt] = count_target[opt]

        if opt == '':
            count_global_mod['NA'] = count_global[opt]
            count_target_mod['NA'] = count_target[opt]
            opt = 'NA'

        odds_ratio, p_value = perform_fisher(count_target_mod[opt], count_global_mod[opt], target_num, global_num)
        odds_ratios[opt] = odds_ratio
        p_values[opt] = p_value

    res_table_dict = defaultdict(list)
    for opt in orders[var]:
        res_table_dict[var].append(opt)
        res_table_dict['number of probes'].append(count_target_mod[opt])
        res_table_dict['total number of probes'].append(count_global_mod[opt])
        res_table_dict['p-value'].append(p_values[opt])
        res_table_dict['odds ratio'].append(odds_ratios[opt])

    if not os.path.exists(save_path):
        os.makedirs(save_path)
    save_table_dict_xlsx(f'{save_path}/{var}', res_table_dict)

    x_data = res_table_dict[var]
    y_data = list(map(float, res_table_dict['odds ratio']))

    odds_ratio_plot(x_data, y_data, f'{save_path}/{var}')
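perform_fisher is a project helper; the four counts it receives (target hits, global hits, target size, global size) map naturally onto a 2x2 contingency table for scipy.stats.fisher_exact. A hedged sketch of that mapping, assuming the global counts include the target subset:

from scipy.stats import fisher_exact

def sketch_perform_fisher(count_target, count_global, target_num, global_num):
    # Rows: in target set / not in target set; columns: has annotation / does not
    table = [
        [count_target, target_num - count_target],
        [count_global - count_target,
         (global_num - target_num) - (count_global - count_target)],
    ]
    odds_ratio, p_value = fisher_exact(table)
    return odds_ratio, p_value

print(sketch_perform_fisher(30, 100, 200, 10000))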
Example #12
        f'max_abs_slope_{get_polygon_hash(dataset)}',
        f'slope_{get_linreg_female_hash(dataset)}',
        f'slope_{get_linreg_male_hash(dataset)}'
    ]

save_path = f'{get_data_path()}/approaches/sex_specific_not_age_related/{type}'
if not os.path.exists(save_path):
    os.makedirs(save_path)

data_dicts = get_data_dicts(datasets, 'aggregator', keys_load, keys_save,
                            get_approach_1_hash, check_condition)

cpg_dicts = get_cpg_dicts(data_dicts)

for dataset, data_dict in data_dicts.items():
    save_table_dict_xlsx(f'{save_path}/{dataset}', data_dict)

sets, sets_with_difference = get_sets(datasets, data_dicts)

save_dicts = get_cpg_dataset_save_dicts(sets, data_dicts, cpg_dicts)
curr_save_path = f'{save_path}/intersection'
if not os.path.exists(curr_save_path):
    os.makedirs(curr_save_path)
for key, save_dict in save_dicts.items():
    save_table_dict_xlsx(f'{curr_save_path}/{key}', save_dict)

save_dicts = get_cpg_dataset_save_dicts(sets_with_difference, data_dicts,
                                        cpg_dicts)
curr_save_path = f'{save_path}/intersection_with_difference'
if not os.path.exists(curr_save_path):
    os.makedirs(curr_save_path)
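The exists-then-makedirs pattern repeated throughout these examples can be collapsed (and made race-free) with the exist_ok flag available since Python 3.2:

import os

curr_save_path = 'output/intersection_with_difference'   # any of the paths used above
os.makedirs(curr_save_path, exist_ok=True)                # no error if it already exists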