def get_dict_bop_genes(config, dict_bop_cpgs):
    """Collect the distinct genes associated with the CpGs of every BOP.

    Args:
        config: Project configuration, forwarded to ``get_dict_cpg_gene``.
        dict_bop_cpgs: Mapping from a BOP key to an iterable of CpG keys.

    Returns:
        dict: Maps every key of ``dict_bop_cpgs`` to a list of the distinct
        genes found for its CpGs. The order inside each list is unspecified
        because duplicates are removed through a ``set``.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)

    dict_bop_genes = {}
    for bop, cpgs in dict_bop_cpgs.items():
        genes = set()
        for curr_cpg in cpgs:
            # A CpG without an entry in the CpG -> genes mapping contributes
            # nothing instead of raising TypeError on ``None``.
            genes.update(dict_cpg_gene.get(curr_cpg) or ())
        dict_bop_genes[bop] = list(genes)

    return dict_bop_genes
def save_top_anova(config, num_top=500):
    """Rank CpGs by one-way ANOVA p-value and save the top CpGs and genes.

    For every CpG returned by ``load_cpg_data`` the values are split into
    groups using the index collections stored in
    ``get_attributes_dict(config)`` and ``scipy.stats.f_oneway`` is run
    across those groups. CpGs are ordered by ascending p-value. Genes are
    ranked by first appearance while walking the ordered CpGs, each gene
    keeping the p-value of the CpG that introduced it.

    Two tables named ``top.txt`` are written via ``get_result_path`` and
    ``save_features``: the CpG table with ``config`` as passed in, then the
    gene table after ``config.dt`` is switched to ``DataType.gene``.

    Args:
        config: Project configuration. It is mutated: ``approach_gd`` is set
            to ``GeneDataType.from_cpg`` and ``dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes_dict = get_attributes_dict(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    pvals = []
    for curr_vals in vals:
        curr_vals = np.asarray(curr_vals)
        groups = [
            list(curr_vals[attributes_dict[key_age]])
            for key_age in attributes_dict
        ]
        pvals.append(stats.f_oneway(*groups).pvalue)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])

    genes_sorted = []
    pvals_genes = []
    # Set membership keeps de-duplication linear; the parallel lists keep
    # the ranking order.
    seen_genes = set()
    for cpg, pval in zip(cpgs_sorted, pvals_sorted):
        # load_cpg_data only returns CpGs that are keys of the mapping.
        for gene in dict_cpg_gene.get(cpg):
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            pvals_genes.append(pval)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], pvals_sorted[0:num_top]])

    # The gene table goes to the gene-specific result path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted[0:num_top], pvals_genes[0:num_top]])
    finally:
        # Put the data type back even when saving fails.
        config.dt = DataType.cpg
def get_raw_dict(config):
    """Group CpG values by gene, ordered by map info inside every gene.

    Args:
        config: Project configuration, forwarded to the loader helpers.

    Returns:
        dict: Maps a gene to a list with one entry per attribute position
        (``len(get_attributes(config))`` entries). Each entry is a list
        holding, for that position, the values of all CpGs assigned to the
        gene, ordered by ascending ``int`` map info of the CpGs.
    """
    cpg_to_genes = get_dict_cpg_gene(config)
    cpg_to_map = get_dict_cpg_map_info(config)
    num_positions = len(get_attributes(config))
    cpgs, vals = load_cpg_data(config)
    excluded_cpgs = get_non_inc_cpgs(config)

    gene_raw_dict = {}
    gene_positions = {}

    for row_id, curr_cpg in enumerate(cpgs):
        curr_vals = vals[row_id]

        if curr_cpg in excluded_cpgs:
            continue

        genes = cpg_to_genes.get(curr_cpg)
        map_info = cpg_to_map.get(curr_cpg)
        if genes is None:
            continue

        for gene in genes:
            per_position = gene_raw_dict.get(gene)
            if per_position is None:
                # First CpG of this gene: open one value list per position.
                gene_raw_dict[gene] = [
                    [curr_vals[pos]] for pos in range(num_positions)
                ]
                gene_positions[gene] = [int(map_info)]
            else:
                for pos in range(num_positions):
                    per_position[pos].append(curr_vals[pos])
                gene_positions[gene].append(int(map_info))

    # Reorder every per-position list by the map info of its CpGs.
    for gene, unsorted_records in gene_raw_dict.items():
        order = np.argsort(gene_positions[gene])
        gene_raw_dict[gene] = [
            list(np.array(record)[order]) for record in unsorted_records
        ]

    return gene_raw_dict
def save_top_spearman(config, num_top=500):
    """Rank CpGs by absolute Spearman rho and save the top CpGs and genes.

    ``scipy.stats.spearmanr`` is evaluated between ``get_attributes(config)``
    and the values of every CpG returned by ``load_cpg_data``. CpGs are
    ordered by descending ``abs(rho)``. Genes are ranked by first appearance
    while walking the ordered CpGs, each gene keeping the rho of the CpG
    that introduced it. A gene is listed once, as in the sibling
    ``save_top_*`` functions.

    Two tables named ``top.txt`` are written via ``get_result_path`` and
    ``save_features``: the CpG table with ``config`` as passed in, then the
    gene table after ``config.dt`` is switched to ``DataType.gene``.

    Args:
        config: Project configuration. It is mutated: ``approach_gd`` is set
            to ``GeneDataType.from_cpg`` and ``dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for curr_vals in vals:
        rho, _ = stats.spearmanr(attributes, curr_vals)
        rhos.append(rho)

    # NOTE(review): argsort places NaN last, so after the reversal a NaN rho
    # would be ranked first - confirm whether such rows should be dropped.
    order = np.argsort(np.abs(rhos))[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    genes_sorted = []
    rhos_genes = []
    # Without this a gene shared by several strong CpGs would occupy
    # several of the num_top slots.
    seen_genes = set()
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        # load_cpg_data only returns CpGs that are keys of the mapping.
        for gene in dict_cpg_gene.get(cpg):
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted[0:num_top], rhos_sorted[0:num_top]])

    # The gene table goes to the gene-specific result path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted[0:num_top], rhos_genes[0:num_top]])
    finally:
        # Put the data type back even when saving fails.
        config.dt = DataType.cpg
def load_cpg_data(config):
    """Load the CpG rows of ``average_beta.txt`` selected by ``config``.

    The file is resolved with ``get_path``. The first
    ``config.num_skip_lines`` lines are skipped, and every remaining line is
    split with ``line_proc``. A row is dropped when any column equals
    ``config.miss_tag``. Otherwise the first column is the CpG name and the
    remaining columns are parsed as floats and subset by ``config.indexes``.
    A row is kept only when its CpG is not in ``get_non_inc_cpgs(config)``
    and is a key of ``get_dict_cpg_gene(config)``.

    A progress line is printed every ``config.print_rate`` lines read.

    Args:
        config: Project configuration providing ``indexes``,
            ``num_skip_lines``, ``miss_tag`` and ``print_rate``.

    Returns:
        tuple: ``(cpgs_passed, vals_passed)``, two parallel lists holding
        the CpG names and, per CpG, the list of selected values.
    """
    indexes = config.indexes
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpg_non_inc = get_non_inc_cpgs(config)
    full_path = get_path(config, 'average_beta.txt')

    cpgs_passed = []
    vals_passed = []

    # ``with`` closes the file even when a row fails to parse.
    with open(full_path) as f:
        for _ in range(config.num_skip_lines):
            f.readline()

        for num_lines, line in enumerate(f, start=1):
            col_vals = line_proc(config, line)

            if config.miss_tag not in col_vals:
                cpg = col_vals[0]
                vals = list(map(float, col_vals[1::]))
                vals = list(np.array(vals)[indexes])

                if cpg not in cpg_non_inc and cpg in dict_cpg_gene:
                    vals_passed.append(vals)
                    cpgs_passed.append(cpg)

            if num_lines % config.print_rate == 0:
                print('num_lines: ' + str(num_lines))

    return cpgs_passed, vals_passed
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often CpGs reach the ElasticNet top list and save rankings.

    An ``ElasticNet`` (``alpha`` and ``l1_ratio`` read from
    ``load_params_dict(config)``) is fitted on ``num_bootstrap_runs``
    ``ShuffleSplit`` training subsets, with the CpG values from
    ``load_cpg_data`` as features and ``get_attributes(config)`` as target.
    In every run the ``num_top`` CpGs with the largest absolute coefficient
    get one vote. CpGs are then ordered by descending vote count. Genes are
    ranked by first appearance while walking the ordered CpGs, each gene
    keeping the count of the CpG that introduced it.

    Two tables named ``top.txt`` are written via ``get_result_path`` and
    ``save_features``: the CpG table with ``config`` as passed in, then the
    gene table after ``config.dt`` is switched to ``DataType.gene``. The
    saved tables are not truncated to ``num_top``.

    Args:
        config: Project configuration providing ``test_part``. It is
            mutated: ``approach_gd`` is set to ``GeneDataType.from_cpg`` and
            ``dt`` is left as ``DataType.cpg`` on exit.
        num_bootstrap_runs: Number of shuffle-split repetitions.
        num_top: Number of CpGs that receive a vote in every repetition.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: test_size/train_size are keyword-only in current
    # scikit-learn releases.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = np.linspace(0, len(attributes) - 1, len(attributes),
                          dtype=int).tolist()

    # Built once: rows become samples, columns become CpGs.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _) in enumerate(rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))

        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing copes with fewer than num_top CpGs being available.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    # Set membership keeps de-duplication linear; the parallel lists keep
    # the ranking order.
    seen_genes = set()
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        # load_cpg_data only returns CpGs that are keys of the mapping.
        for gene in dict_cpg_gene.get(cpg):
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            counts_genes.append(count)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [cpgs_sorted, counts_sorted])

    # The gene table goes to the gene-specific result path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [genes_sorted, counts_genes])
    finally:
        # Put the data type back even when saving fails.
        config.dt = DataType.cpg
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save top CpGs and genes.

    ``scipy.stats.linregress`` is run for every CpG returned by
    ``load_cpg_data`` against ``get_attributes(config)``. CpGs are ordered
    by ascending p-value. Genes are ranked by first appearance while walking
    the ordered CpGs, each gene keeping the statistics of the CpG that
    introduced it.

    Two tables named ``top.txt`` are written via ``get_result_path`` and
    ``save_features``: the CpG table with ``config`` as passed in, then the
    gene table after ``config.dt`` is switched to ``DataType.gene``. Column
    order in both is: name, p-value, r-value, slope, intercept.

    Args:
        config: Project configuration. It is mutated: ``approach_gd`` is set
            to ``GeneDataType.from_cpg`` and ``dt`` is left as
            ``DataType.cpg`` on exit.
        num_top: Maximum number of rows kept in each saved table.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for curr_vals in vals:
        # NOTE(review): the CpG values are passed as x and the attributes
        # as y, while save_bend_linreg passes the attributes first; slope
        # and intercept depend on that order - confirm it is intended.
        slope, intercept, r_value, p_value, _ = stats.linregress(
            curr_vals, attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)
    cpgs_sorted = list(np.array(cpgs)[order])
    pvals_sorted = list(np.array(pvals)[order])
    slopes_sorted = list(np.array(slopes)[order])
    intercepts_sorted = list(np.array(intercepts)[order])
    rvals_sorted = list(np.array(rvals)[order])

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    # Set membership keeps de-duplication linear; the parallel lists keep
    # the ranking order.
    seen_genes = set()
    ranked = zip(cpgs_sorted, pvals_sorted, slopes_sorted,
                 intercepts_sorted, rvals_sorted)
    for cpg, pval, slope, intercept, rval in ranked:
        # load_cpg_data only returns CpGs that are keys of the mapping.
        for gene in dict_cpg_gene.get(cpg):
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            pvals_genes.append(pval)
            slopes_genes.append(slope)
            intercepts_genes.append(intercept)
            rvals_genes.append(rval)

    fn = get_result_path(config, 'top.txt')
    save_features(fn, [
        cpgs_sorted[0:num_top],
        pvals_sorted[0:num_top],
        rvals_sorted[0:num_top],
        slopes_sorted[0:num_top],
        intercepts_sorted[0:num_top],
    ])

    # The gene table goes to the gene-specific result path.
    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        fn = get_result_path(config, 'top.txt')
        save_features(fn, [
            genes_sorted[0:num_top],
            pvals_genes[0:num_top],
            rvals_genes[0:num_top],
            slopes_genes[0:num_top],
            intercepts_genes[0:num_top],
        ])
    finally:
        # Put the data type back even when saving fails.
        config.dt = DataType.cpg
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Save the CpGs whose regression slope changes most across ``limit``.

    Two copies of ``config`` are prepared with ``age_less`` and ``age_more``
    (presumably restricting the subjects to either side of ``limit`` -
    confirm against those helpers). For every CpG,
    ``scipy.stats.linregress`` of the CpG values on the attributes is
    computed separately on both copies. A CpG is kept when the larger of
    its two p-values is below ``pval``, and kept CpGs are ranked by
    descending ``abs(slope_less - slope_more)``.

    Two files are written to paths obtained from ``get_result_path``:

    * ``bend_<limit>.txt`` via ``save_features`` with the columns: CpG,
      genes joined by ``;`` (``nan`` when there is none), slope difference,
      then slope, intercept, r-value, p-value and standard error for the
      "less" fit followed by the same five for the "more" fit.
    * ``bend_data_<limit>.txt`` via ``np.savetxt``: one line per selected
      CpG holding its name and its values loaded with a ``Gender.any``
      configuration, formatted as ``0.8e``.

    Args:
        config: Project configuration; only deep copies are modified here.
        limit: Split point handed to ``age_less`` / ``age_more``.
        pval: Upper bound (exclusive) for the larger of the two p-values.
        num_opt: Maximum number of CpGs written.

    Raises:
        ValueError: If a CpG of the "less" data is missing from the "more"
            data, or a selected CpG is missing from the raw data.
    """

    def first_position(names):
        # name -> index of its first occurrence, i.e. what list.index()
        # would return, but built once instead of scanned per lookup.
        positions = {}
        for pos, name in enumerate(names):
            positions.setdefault(name, pos)
        return positions

    def lookup(positions, name):
        if name not in positions:
            raise ValueError(repr(name) + ' is not in list')
        return positions[name]

    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)
    positions_m = first_position(cpg_names_m)

    # One tuple per accepted CpG, in the column order of the output file.
    num_columns = 13
    rows = []
    for num_cpgs, cpg_name in enumerate(cpg_names_l, start=1):
        vals_l = cpg_vals_l[num_cpgs - 1]
        vals_m = cpg_vals_m[lookup(positions_m, cpg_name)]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)

        if max(p_value_l, p_value_m) < pval:
            # A missing or empty gene list, or one starting with an empty
            # name, is reported as 'nan'.
            genes = cpg_gene_dict.get(cpg_name) or []
            if genes and genes[0] != '':
                genes_str = ";".join(genes)
            else:
                genes_str = 'nan'

            rows.append((
                cpg_name, genes_str, abs(slope_l - slope_m),
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    if rows:
        columns = [list(column) for column in zip(*rows)]
    else:
        columns = [[] for _ in range(num_columns)]

    angles = columns[2]
    order = np.argsort(angles)[::-1][0:num_opt]
    columns_opt = [list(np.array(column)[order]) for column in columns]
    cpgs_opt = columns_opt[0]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)

    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)
    positions_raw = first_position(cpg_name_raw)

    cpg_str_list = []
    for cpg in cpgs_opt:
        cpg_vals = cpg_vals_raw[lookup(positions_raw, cpg)]
        fields = [cpg] + [format(value, '0.8e') for value in cpg_vals]
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")