def save_simple_linreg(config, num_top=100):
    """Fit a per-gene linear regression against the attributes and save it.

    The top ``num_top`` genes are loaded with ``config.scenario`` set to
    ``Scenario.approach``; the scenario is then switched to
    ``Scenario.validation`` and left there.  Each gene's values are the x
    variable of ``stats.linregress`` and the attributes are the y variable.
    Genes are ordered by decreasing absolute r-value and the columns
    (names, p-values, r-values, slopes, intercepts) are handed to
    ``save_features`` for ``metrics.txt`` under the result path.
    """
    attributes = get_attributes(config)

    config.scenario = Scenario.approach
    gene_names, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    slopes, intercepts, r_values, p_values = [], [], [], []
    for gene_idx, _ in enumerate(gene_names):
        slope, intercept, r_value, p_value, _std_err = stats.linregress(
            gene_vals[gene_idx], attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        r_values.append(r_value)
        p_values.append(p_value)

    # Strongest correlation (largest |r|) first.
    order = np.argsort([abs(r) for r in r_values])[::-1]

    def by_strength(column):
        return list(np.array(column)[order])

    columns = [
        by_strength(column)
        for column in (gene_names, p_values, r_values, slopes, intercepts)
    ]
    save_features(get_result_path(config, 'metrics.txt'), columns)
def save_plane_linreg(config,
                      num_top=100,
                      gd_type_x=GeneDataType.mean,
                      gd_type_y=GeneDataType.mean):
    """Save, per top gene, the r-values obtained with two gene data types.

    Gene names are loaded with ``config.scenario`` set to
    ``Scenario.approach``; the scenario is then set to
    ``Scenario.validation``.  Gene values are loaded twice, once with
    ``config.approach_gd = gd_type_x`` and once with ``gd_type_y`` (the
    config is left on ``gd_type_y``).  For each gene both value sets are
    regressed against the attributes with ``stats.linregress`` and only the
    r-values are kept.  ``[gene_names, r_values_x, r_values_y]`` is passed
    to ``save_features`` for ``plane.txt`` under the result path.
    """
    attributes = get_attributes(config)

    config.scenario = Scenario.approach
    gene_names = load_top_gene_names(config, num_top)
    config.scenario = Scenario.validation

    config.approach_gd = gd_type_x
    gene_vals_x = load_top_gene_vals(config, gene_names)
    config.approach_gd = gd_type_y
    gene_vals_y = load_top_gene_vals(config, gene_names)

    r_values_x = []
    r_values_y = []
    for gene_idx, _ in enumerate(gene_names):
        # Only the correlation coefficient of each fit is written out.
        _, _, r_main, _, _ = stats.linregress(gene_vals_x[gene_idx],
                                              attributes)
        r_values_x.append(r_main)
        _, _, r_aux, _, _ = stats.linregress(gene_vals_y[gene_idx],
                                             attributes)
        r_values_y.append(r_aux)

    save_features(get_result_path(config, 'plane.txt'),
                  [gene_names, r_values_x, r_values_y])
def save_params_enet(config, num_folds=10):
    """Compute parameters via ``get_enet_params`` on CpG data and save them.

    The attributes and the CpG values are loaded for ``config``, passed with
    ``num_folds`` to ``get_enet_params``, and the returned parameter names
    and values are handed to ``save_features`` for ``params.txt`` under the
    parameter path.
    """
    target = get_attributes(config)
    _cpg_names, cpg_vals = load_cpg_data(config)
    # NOTE(review): the gene-level save_params_enet in this file passes
    # (values, attributes) in the opposite order -- confirm which order the
    # get_enet_params in scope here expects.
    names, values = get_enet_params(target, cpg_vals, num_folds)
    save_features(get_param_path(config, 'params.txt'), [names, values])
def save_error_from_age(config, num_top=100):
    """Save the per-age prediction error of a multi-gene linear model.

    The top ``num_top`` genes are loaded with ``config.scenario`` set to
    ``Scenario.approach``; the scenario is then set to
    ``Scenario.validation`` and left there.  A model is built with
    ``linreg_mult(attributes, vals)`` and, for every integer age in
    ``0..149`` that occurs in the attributes (matched with ``==``), the
    samples of that age are predicted.

    Outputs:
      * ``error_from_age.txt`` -- ``[ages, maes]`` via ``save_features``,
        where each MAE is the mean absolute difference between predicted
        and actual age within that age group;
      * ``errors.txt`` -- one line per age: the age followed by every
        prediction formatted as ``0.8e``;
      * the ``stats.linregress`` fit of signed error versus age, printed
        to stdout.

    Attribute values that are not equal to an integer in ``0..149`` are
    not part of any group and therefore do not contribute.
    """
    attributes = get_attributes(config)

    config.scenario = Scenario.approach
    _names, vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    model = linreg_mult(attributes, vals)

    # Loop-invariant: transpose once so rows can be picked by sample index
    # (previously rebuilt from `vals` for every age that had samples).
    vals_by_sample = np.array(vals).T

    ages = []
    maes = []
    str_list = []
    x_all = []
    y_all = []
    for age in range(0, 150):
        indexes = [i for i, x in enumerate(attributes) if x == age]
        if not indexes:
            continue
        ages.append(age)

        X_test = vals_by_sample[indexes].tolist()
        y_test_pred = model.get_prediction(X_test).predicted_mean

        abs_err_sum = 0
        fields = [str(age)]
        for pred_age in y_test_pred:
            abs_err_sum += abs(pred_age - age)
            fields.append(format(pred_age, '0.8e'))
            x_all.append(age)
            y_all.append(pred_age - age)

        str_list.append(' '.join(fields))
        maes.append(abs_err_sum / len(indexes))

    save_features(get_result_path(config, 'error_from_age.txt'),
                  [ages, maes])
    np.savetxt(get_result_path(config, 'errors.txt'), str_list, fmt="%s")

    # Trend of the signed error (prediction - age) as a function of age.
    slope, intercept, r_value, p_value, std_err = stats.linregress(
        x_all, y_all)
    print('slope: ' + str(slope))
    print('intercept: ' + str(intercept))
    print('r_value: ' + str(r_value))
    print('p_value: ' + str(p_value))
    print('std_err: ' + str(std_err))
def save_params_enet(config, num_folds=10):
    """Compute parameters via ``get_enet_params`` on gene data and save them.

    The attributes and the gene values are loaded for ``config``; the gene
    values, the attributes and ``num_folds`` are passed (in that order) to
    ``get_enet_params`` and the returned parameter names and values are
    handed to ``save_features`` for ``params.txt`` under the parameter path.
    """
    target = get_attributes(config)
    _gene_names, gene_vals = load_gene_data(config)
    names, values = get_enet_params(gene_vals, target, num_folds)
    save_features(get_param_path(config, 'params.txt'), [names, values])
def get_attributes_group(config, attribute=Attribute.age):
    """Return, for every attribute value, the index of its bin.

    Bins are ``config.shift`` wide and start at the smallest attribute
    value rounded (via ``int`` truncation) to a multiple of
    ``config.shift``.  The result has one integer bin index per attribute,
    in the same order as ``get_attributes(config, attribute)``.
    """
    attributes = get_attributes(config, attribute)
    shift = config.shift
    lower_edge = int(min(attributes) / shift) * shift
    return [int((value - lower_edge) / shift) for value in attributes]
def get_raw_dict(config):
    """Collect, per gene, the values of its CpGs for every attribute index.

    CpGs listed in ``get_non_inc_cpgs(config)`` and CpGs without an entry
    in the CpG->gene dictionary are ignored.  For every remaining CpG its
    values are appended to each gene it maps to, together with
    ``int(map_info)`` taken from the CpG map-info dictionary.

    Returns a dict ``gene -> list`` with one inner list per attribute
    index; each inner list holds that gene's CpG values ordered by
    ascending map-info value.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    dict_cpg_map = get_dict_cpg_map_info(config)
    attributes = get_attributes(config)
    cpgs, vals = load_cpg_data(config)
    cpg_non_inc = get_non_inc_cpgs(config)

    num_records = len(attributes)
    gene_raw_dict = {}
    map_dict = {}

    for cpg_idx, curr_cpg in enumerate(cpgs):
        curr_vals = vals[cpg_idx]
        if curr_cpg in cpg_non_inc:
            continue
        genes = dict_cpg_gene.get(curr_cpg)
        map_info = dict_cpg_map.get(curr_cpg)
        if genes is None:
            continue
        for gene in genes:
            if gene not in gene_raw_dict:
                # First CpG seen for this gene: one empty record per
                # attribute index plus an empty position list.
                gene_raw_dict[gene] = [[] for _ in range(num_records)]
                map_dict[gene] = []
            records = gene_raw_dict[gene]
            for record_idx in range(num_records):
                records[record_idx].append(curr_vals[record_idx])
            map_dict[gene].append(int(map_info))

    # Reorder every record of a gene by the map-info value of its CpGs.
    for gene in gene_raw_dict:
        order = np.argsort(map_dict[gene])
        gene_raw_dict[gene] = [
            list(np.array(record)[order]) for record in gene_raw_dict[gene]
        ]

    return gene_raw_dict
def get_attributes_dict(config):
    """Group attribute positions by ``config.shift``-wide bins.

    Bins start at the smallest attribute value rounded (via ``int``
    truncation) to a multiple of ``config.shift``.  Returns a dict mapping
    each occupied bin index to the list of positions (indices into
    ``get_attributes(config)``) that fall into it, in ascending position
    order.
    """
    attributes = get_attributes(config)
    shift = config.shift
    lower_edge = int(min(attributes) / shift) * shift

    attributes_dict = {}
    for position, value in enumerate(attributes):
        bin_key = int((value - lower_edge) / shift)
        attributes_dict.setdefault(bin_key, []).append(position)
    return attributes_dict
def save_top_spearman(config, num_top=500):
    """Rank CpGs by |Spearman rho| against the attributes and save the top.

    For every CpG ``stats.spearmanr(attributes, values)`` is computed and
    CpGs are ordered by decreasing absolute rho.  Two tables are passed to
    ``save_features``, both for ``top.txt`` under the result path:

      * ``[cpgs, rhos]`` -- the first ``num_top`` CpGs;
      * ``[genes, rhos]`` -- the first ``num_top`` (gene, rho) pairs
        obtained by expanding each CpG, in rank order, through the
        CpG->gene dictionary.  Genes are not de-duplicated.  The path for
        this table is resolved after setting
        ``config.approach_gd = GeneDataType.from_cpg`` and
        ``config.dt = DataType.gene``.

    Side effects on ``config``: ``approach_gd`` stays
    ``GeneDataType.from_cpg``; ``dt`` is set back to ``DataType.cpg`` even
    if saving the gene table fails.

    CpGs whose rho is NaN are ranked last, and CpGs without an entry in the
    CpG->gene dictionary contribute no genes.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    rhos = []
    for _cpg, curr_vals in zip(cpgs, vals):
        rho, _pval = stats.spearmanr(attributes, curr_vals)
        rhos.append(rho)

    # argsort places NaN last, so reversing a plain argsort would put
    # undefined correlations at the very top of the ranking.  Map NaN to
    # -inf so they end up at the bottom instead.
    strength = np.abs(np.asarray(rhos, dtype=float))
    strength[np.isnan(strength)] = -np.inf
    order = np.argsort(strength)[::-1]

    cpgs_sorted = list(np.array(cpgs)[order])
    rhos_sorted = list(np.array(rhos)[order])

    genes_sorted = []
    rhos_genes = []
    for cpg, rho in zip(cpgs_sorted, rhos_sorted):
        # A CpG may be absent from the dictionary (.get -> None).
        for gene in dict_cpg_gene.get(cpg) or ():
            genes_sorted.append(gene)
            rhos_genes.append(rho)

    cpgs_sorted = cpgs_sorted[0:num_top]
    rhos_sorted = rhos_sorted[0:num_top]
    genes_sorted = genes_sorted[0:num_top]
    rhos_genes = rhos_genes[0:num_top]

    save_features(get_result_path(config, 'top.txt'),
                  [cpgs_sorted, rhos_sorted])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        save_features(get_result_path(config, 'top.txt'),
                      [genes_sorted, rhos_genes])
    finally:
        config.dt = DataType.cpg
def save_simple_linreg_mult(config, num_bootstrap_runs=500, num_top=100):
    """Save R2-vs-count and validation metrics for the top genes.

    The top ``num_top`` genes are loaded with ``config.scenario`` set to
    ``Scenario.approach``; the scenario is then set to
    ``Scenario.validation`` and left there.

    Outputs (both under the result path, via ``save_features``):
      * ``R2s_<num_top>.txt`` -- the two sequences returned by
        ``R2_from_count(gene_vals, attributes)``;
      * ``metrics_<num_top>.txt`` -- the names and values returned by
        ``validation_metrics`` for a test size of
        ``int(len(attributes) * config.test_part)`` and the remaining
        samples as train size.

    Finally the ``summary()`` of ``linreg_mult_with_const`` is printed.
    """
    attributes = get_attributes(config)

    config.scenario = Scenario.approach
    _gene_names, gene_vals = load_top_gene_data(config, num_top)
    config.scenario = Scenario.validation

    suffix = str(num_top) + '.txt'

    counts, R2s = R2_from_count(gene_vals, attributes)
    save_features(get_result_path(config, 'R2s_' + suffix), [counts, R2s])

    num_samples = len(attributes)
    test_size = int(num_samples * config.test_part)
    train_size = num_samples - test_size
    metrics_names, metrics_vals = validation_metrics(
        gene_vals, attributes, test_size, train_size, num_bootstrap_runs)
    save_features(get_result_path(config, 'metrics_' + suffix),
                  [metrics_names, metrics_vals])

    print(linreg_mult_with_const(attributes, gene_vals).summary())
def save_top_linreg(config):
    """Rank genes by |r| of a linear fit on the attributes and save them.

    For every gene ``stats.linregress(attributes, gene_values)`` is run
    (attributes are the x variable).  Genes are ordered by decreasing
    absolute r-value; the sorted |r| column is then clustered with
    ``MeanShift`` (bandwidth from ``estimate_bandwidth``) and with
    ``AffinityPropagation``, and both label lists are passed through
    ``clustering_order``.  The columns (genes, mean-shift clusters,
    affinity-propagation clusters, r, p, slope, intercept) are handed to
    ``save_features`` for ``top.txt`` under the result path.
    """
    attributes = get_attributes(config)
    genes, vals = load_gene_data(config)

    slopes, intercepts, r_values, p_values = [], [], [], []
    for gene_idx, _ in enumerate(genes):
        slope, intercept, r_value, p_value, _std_err = stats.linregress(
            attributes, vals[gene_idx])
        slopes.append(slope)
        intercepts.append(intercept)
        r_values.append(r_value)
        p_values.append(p_value)

    order_mean = np.argsort([abs(r) for r in r_values])[::-1]

    def ranked(column):
        return list(np.array(column)[order_mean])

    p_values_sorted = ranked(p_values)
    r_values_sorted = ranked(r_values)
    slopes_sorted = ranked(slopes)
    intercepts_sorted = ranked(intercepts)
    genes_sorted = ranked(genes)

    # Single-feature matrix (one row per gene) for the clustering calls.
    abs_r_column = np.asarray(
        [abs(r) for r in r_values_sorted]).reshape(-1, 1)

    mean_shift = MeanShift(bandwidth=estimate_bandwidth(abs_r_column),
                           bin_seeding=True)
    mean_shift.fit(abs_r_column)
    clusters_mean_shift = clustering_order(list(mean_shift.labels_))

    affinity = AffinityPropagation().fit(abs_r_column)
    clusters_affinity_prop = clustering_order(list(affinity.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        r_values_sorted,
        p_values_sorted,
        slopes_sorted,
        intercepts_sorted,
    ])
def save_top_enet(config, num_bootstrap_runs=10, num_top=500):
    """Count how often each CpG is among the top ElasticNet coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` ``ShuffleSplit`` splits (test size
    ``int(len(attributes) * config.test_part)``, the rest for training) an
    ``ElasticNet`` is fitted on the training samples, and the ``num_top``
    CpGs with the largest absolute coefficients each get one count.  If
    fewer than ``num_top`` CpGs are available, all of them are counted.

    Two tables are passed to ``save_features``, both for ``top.txt`` under
    the result path:

      * ``[cpgs, counts]`` -- CpGs ordered by decreasing count;
      * ``[genes, counts]`` -- genes in order of first appearance when the
        ranked CpGs are expanded through the CpG->gene dictionary, each
        with the count of the CpG that introduced it.  The path for this
        table is resolved after setting
        ``config.approach_gd = GeneDataType.from_cpg`` and
        ``config.dt = DataType.gene``.

    Side effects on ``config``: ``approach_gd`` stays
    ``GeneDataType.from_cpg``; ``dt`` is set back to ``DataType.cpg`` even
    if saving the gene table fails.  CpGs without an entry in the
    CpG->gene dictionary contribute no genes.
    """
    dict_cpg_gene = get_dict_cpg_gene(config)
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    cpgs_passed, vals_passed = load_cpg_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: recent scikit-learn releases reject positional
    # test_size / train_size.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Converted once, outside the bootstrap loop.  Rows are samples and
    # columns follow the order of `cpgs_passed`.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    cpg_names = np.array(cpgs_passed)

    cpg_top_dict = {}
    for bootstrap_id, (train_index, _test_index) in enumerate(
            rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (instead of indexing 0..num_top-1) tolerates data sets
        # with fewer than num_top CpGs.
        for cpg in cpg_names[order][0:num_top]:
            cpg_top_dict[cpg] = cpg_top_dict.get(cpg, 0) + 1

    cpgs = list(cpg_top_dict.keys())
    counts = list(cpg_top_dict.values())
    order = np.argsort(counts)[::-1]
    cpgs_sorted = list(np.array(cpgs)[order])
    counts_sorted = list(np.array(counts)[order])

    genes_sorted = []
    counts_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, count in zip(cpgs_sorted, counts_sorted):
        # A CpG may be absent from the dictionary (.get -> None).
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene not in seen_genes:
                seen_genes.add(gene)
                genes_sorted.append(gene)
                counts_genes.append(count)

    save_features(get_result_path(config, 'top.txt'),
                  [cpgs_sorted, counts_sorted])

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        save_features(get_result_path(config, 'top.txt'),
                      [genes_sorted, counts_genes])
    finally:
        config.dt = DataType.cpg
def save_top_linreg(config, num_top=500):
    """Rank CpGs by linear-regression p-value and save the top entries.

    For every CpG ``stats.linregress(cpg_values, attributes)`` is run (the
    CpG values are the x variable) and CpGs are ordered by ascending
    p-value.  Two tables are passed to ``save_features``, both for
    ``top.txt`` under the result path, with columns (name, p-value,
    r-value, slope, intercept):

      * the first ``num_top`` CpGs;
      * the first ``num_top`` genes, in order of first appearance when the
        ranked CpGs are expanded through the CpG->gene dictionary; each
        gene carries the statistics of the CpG that introduced it.  The
        path for this table is resolved after setting
        ``config.approach_gd = GeneDataType.from_cpg`` and
        ``config.dt = DataType.gene``.

    Side effects on ``config``: ``approach_gd`` stays
    ``GeneDataType.from_cpg``; ``dt`` is set back to ``DataType.cpg`` even
    if saving the gene table fails.  CpGs without an entry in the
    CpG->gene dictionary contribute no genes.
    """
    attributes = get_attributes(config)
    dict_cpg_gene = get_dict_cpg_gene(config)
    cpgs, vals = load_cpg_data(config)

    slopes = []
    intercepts = []
    rvals = []
    pvals = []
    for _cpg, curr_vals in zip(cpgs, vals):
        slope, intercept, r_value, p_value, _std_err = stats.linregress(
            curr_vals, attributes)
        slopes.append(slope)
        intercepts.append(intercept)
        rvals.append(r_value)
        pvals.append(p_value)

    order = np.argsort(pvals)

    def ranked(column):
        return list(np.array(column)[order])

    cpgs_sorted = ranked(cpgs)
    pvals_sorted = ranked(pvals)
    slopes_sorted = ranked(slopes)
    intercepts_sorted = ranked(intercepts)
    rvals_sorted = ranked(rvals)

    genes_sorted = []
    pvals_genes = []
    slopes_genes = []
    intercepts_genes = []
    rvals_genes = []
    seen_genes = set()  # O(1) membership instead of scanning genes_sorted
    for cpg, pval, slope, intercept, rval in zip(
            cpgs_sorted, pvals_sorted, slopes_sorted, intercepts_sorted,
            rvals_sorted):
        # A CpG may be absent from the dictionary (.get -> None).
        for gene in dict_cpg_gene.get(cpg) or ():
            if gene in seen_genes:
                continue
            seen_genes.add(gene)
            genes_sorted.append(gene)
            pvals_genes.append(pval)
            slopes_genes.append(slope)
            intercepts_genes.append(intercept)
            rvals_genes.append(rval)

    cpg_table = [
        column[0:num_top]
        for column in (cpgs_sorted, pvals_sorted, rvals_sorted,
                       slopes_sorted, intercepts_sorted)
    ]
    gene_table = [
        column[0:num_top]
        for column in (genes_sorted, pvals_genes, rvals_genes,
                       slopes_genes, intercepts_genes)
    ]

    save_features(get_result_path(config, 'top.txt'), cpg_table)

    config.approach_gd = GeneDataType.from_cpg
    config.dt = DataType.gene
    try:
        save_features(get_result_path(config, 'top.txt'), gene_table)
    finally:
        config.dt = DataType.cpg
def save_top_enet(config, num_bootstrap_runs=100, num_top=500):
    """Count how often each gene is among the top ElasticNet coefficients.

    ``alpha`` and ``l1_ratio`` are read from ``load_params_dict(config)``.
    For each of ``num_bootstrap_runs`` ``ShuffleSplit`` splits (test size
    ``int(len(attributes) * config.test_part)``, the rest for training) an
    ``ElasticNet`` is fitted on the training samples, and the ``num_top``
    genes with the largest absolute coefficients each get one count.  If
    fewer than ``num_top`` genes are available, all of them are counted.

    Genes are ordered by decreasing count; the count column is clustered
    with ``MeanShift`` (bandwidth from ``estimate_bandwidth``) and with
    ``AffinityPropagation``, both label lists are passed through
    ``clustering_order``, and the columns (genes, mean-shift clusters,
    affinity-propagation clusters, counts) are handed to ``save_features``
    for ``top.txt`` under the result path.
    """
    params_dict = load_params_dict(config)
    alpha = params_dict.get('alpha')
    l1_ratio = params_dict.get('l1_ratio')

    attributes = get_attributes(config)
    genes_passed, vals_passed = load_gene_data(config)

    test_size = int(len(attributes) * config.test_part)
    train_size = len(attributes) - test_size
    # Keyword arguments: recent scikit-learn releases reject positional
    # test_size / train_size.
    rs = ShuffleSplit(n_splits=num_bootstrap_runs,
                      test_size=test_size,
                      train_size=train_size)
    indexes = list(range(len(attributes)))

    # Converted once, outside the bootstrap loop.  Rows are samples and
    # columns follow the order of `genes_passed`.
    enet_X = np.array(vals_passed).T
    enet_y = np.array(attributes)
    gene_names = np.array(genes_passed)

    gene_top_dict = {}
    for bootstrap_id, (train_index, _test_index) in enumerate(
            rs.split(indexes)):
        print('bootstrap_id: ' + str(bootstrap_id))
        enet = ElasticNet(alpha=alpha, l1_ratio=l1_ratio)
        enet.fit(enet_X[train_index], enet_y[train_index])

        order = np.argsort(np.abs(enet.coef_))[::-1]
        # Slicing (instead of indexing 0..num_top-1) tolerates data sets
        # with fewer than num_top genes.
        for gene in gene_names[order][0:num_top]:
            gene_top_dict[gene] = gene_top_dict.get(gene, 0) + 1

    genes = list(gene_top_dict.keys())
    counts = list(gene_top_dict.values())
    order = np.argsort(counts)[::-1]
    genes_sorted = list(np.array(genes)[order])
    counts_sorted = list(np.array(counts)[order])

    # Single-feature matrix (one row per gene) for the clustering calls.
    metrics_sorted_np = np.asarray(counts_sorted).reshape(-1, 1)

    bandwidth = estimate_bandwidth(metrics_sorted_np)
    ms = MeanShift(bandwidth=bandwidth, bin_seeding=True)
    ms.fit(metrics_sorted_np)
    clusters_mean_shift = clustering_order(list(ms.labels_))

    af = AffinityPropagation().fit(metrics_sorted_np)
    clusters_affinity_prop = clustering_order(list(af.labels_))

    save_features(get_result_path(config, 'top.txt'), [
        genes_sorted,
        clusters_mean_shift,
        clusters_affinity_prop,
        counts_sorted,
    ])
def save_bend_linreg(config, limit, pval):
    """Save genes whose regression slope differs between two age subsets.

    Two deep copies of ``config`` are narrowed with
    ``age_less(copy, limit)`` and ``age_more(copy, limit)`` (presumably
    selecting samples below / above ``limit`` -- confirm against those
    helpers).  For each gene of the "less" subset the matching gene of the
    "more" subset is looked up by name and ``stats.linregress(attributes,
    gene_values)`` is run on both subsets.  A gene is kept when the larger
    of its two p-values is below ``pval``; kept genes are ordered by
    decreasing ``abs(slope_less - slope_more)``.

    The columns (gene, slope difference, then slope / intercept / r /
    p / std_err for the "less" fit followed by the same five for the
    "more" fit) are handed to ``save_features`` for ``bend_<limit>.txt``
    under the result path of the original ``config``.

    Raises:
        ValueError: if a gene of the "less" subset is missing from the
            "more" subset.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    g_names_l, g_vals_l = load_gene_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    g_names_m, g_vals_m = load_gene_data(config_more)

    # Name -> first position, replacing a linear list.index() scan per gene.
    # setdefault keeps the first occurrence, as list.index() would.
    index_m = {}
    for position, name in enumerate(g_names_m):
        index_m.setdefault(name, position)

    # One row per gene that passes the p-value filter.
    rows = []
    for g_id_l, name in enumerate(g_names_l):
        if name not in index_m:
            raise ValueError(
                str(name) + ' is missing from the "more" data set')
        vals_l = g_vals_l[g_id_l]
        vals_m = g_vals_m[index_m[name]]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)

        if max(p_value_l, p_value_m) < pval:
            rows.append((
                name, abs(slope_l - slope_m),
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

    num_columns = 12
    columns = [[row[col] for row in rows] for col in range(num_columns)]

    # Column 1 holds the slope differences; largest first.
    order = np.argsort(columns[1])[::-1]
    columns_opt = [list(np.array(column)[order]) for column in columns]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)
def save_bend_linreg(config, limit, pval, num_opt=1000):
    """Save CpGs whose regression slope differs between two age subsets.

    Two deep copies of ``config`` are narrowed with
    ``age_less(copy, limit)`` and ``age_more(copy, limit)`` (presumably
    selecting samples below / above ``limit`` -- confirm against those
    helpers).  For each CpG of the "less" subset the matching CpG of the
    "more" subset is looked up by name and ``stats.linregress(attributes,
    cpg_values)`` is run on both subsets.  A CpG is kept when the larger of
    its two p-values is below ``pval``; the ``num_opt`` kept CpGs with the
    largest ``abs(slope_less - slope_more)`` are written out.

    Outputs (under the result path of the original ``config``):
      * ``bend_<limit>.txt`` via ``save_features`` -- columns: CpG, genes
        (joined with ``;`` or the string ``nan`` when the CpG has no gene,
        an empty first gene, or no dictionary entry), slope difference,
        then slope / intercept / r / p / std_err for the "less" fit
        followed by the same five for the "more" fit;
      * ``bend_data_<limit>.txt`` via ``np.savetxt`` -- one line per
        selected CpG: its name followed by its values (formatted
        ``0.8e``) loaded with a ``Config`` that copies db, dt, approach,
        scenario and approach_method and uses ``Gender.any``.

    Progress is printed every ``config.print_rate`` CpGs.

    Raises:
        ValueError: if a CpG of the "less" subset is missing from the
            "more" subset, or a selected CpG is missing from the raw data.
    """
    config_less = deepcopy(config)
    age_less(config_less, limit)
    atr_l = get_attributes(config_less)
    cpg_names_l, cpg_vals_l = load_cpg_data(config_less)

    config_more = deepcopy(config)
    age_more(config_more, limit)
    atr_m = get_attributes(config_more)
    cpg_names_m, cpg_vals_m = load_cpg_data(config_more)

    cpg_gene_dict = get_dict_cpg_gene(config)

    # Name -> first position, replacing a linear list.index() scan per CpG.
    # setdefault keeps the first occurrence, as list.index() would.
    index_m = {}
    for position, name in enumerate(cpg_names_m):
        index_m.setdefault(name, position)

    # One row per CpG that passes the p-value filter.
    rows = []
    for cpg_id_l, cpg_name in enumerate(cpg_names_l):
        if cpg_name not in index_m:
            raise ValueError(
                str(cpg_name) + ' is missing from the "more" data set')
        vals_l = cpg_vals_l[cpg_id_l]
        vals_m = cpg_vals_m[index_m[cpg_name]]

        slope_l, intercept_l, r_value_l, p_value_l, std_err_l = \
            stats.linregress(atr_l, vals_l)
        slope_m, intercept_m, r_value_m, p_value_m, std_err_m = \
            stats.linregress(atr_m, vals_m)

        if max(p_value_l, p_value_m) < pval:
            genes = cpg_gene_dict.get(cpg_name)
            # No dictionary entry, no genes, or an empty first gene all
            # map to the placeholder string.
            if genes is not None and len(genes) > 0 and genes[0] != '':
                gene_label = ";".join(genes)
            else:
                gene_label = 'nan'
            rows.append((
                cpg_name, gene_label, abs(slope_l - slope_m),
                slope_l, intercept_l, r_value_l, p_value_l, std_err_l,
                slope_m, intercept_m, r_value_m, p_value_m, std_err_m,
            ))

        num_cpgs = cpg_id_l + 1
        if num_cpgs % config.print_rate == 0:
            print('num_cpgs: ' + str(num_cpgs))

    num_columns = 13
    columns = [[row[col] for row in rows] for col in range(num_columns)]

    # Column 2 holds the slope differences; keep the num_opt largest.
    order = np.argsort(columns[2])[::-1][0:num_opt]
    columns_opt = [list(np.array(column)[order]) for column in columns]
    cpgs_opt = columns_opt[0]

    fn = get_result_path(config, 'bend_' + str(limit) + '.txt')
    save_features(fn, columns_opt)

    raw_config = Config(db=config.db,
                        dt=config.dt,
                        approach=config.approach,
                        scenario=config.scenario,
                        approach_method=config.approach_method,
                        gender=Gender.any)
    cpg_name_raw, cpg_vals_raw = load_cpg_data(raw_config)

    raw_index = {}
    for position, name in enumerate(cpg_name_raw):
        raw_index.setdefault(name, position)

    cpg_str_list = []
    for cpg in cpgs_opt:
        if cpg not in raw_index:
            raise ValueError(str(cpg) + ' is missing from the raw data set')
        cpg_vals = cpg_vals_raw[raw_index[cpg]]
        fields = [str(cpg)]
        fields.extend(format(value, '0.8e') for value in cpg_vals)
        cpg_str_list.append(' '.join(fields))

    fn = get_result_path(config, 'bend_data_' + str(limit) + '.txt')
    np.savetxt(fn, cpg_str_list, fmt="%s")
cpg_condition = CpGCondition.x for gender in genders: print('\t' + gender.value) for geo in geos: print('\t\t' + geo.value) config = Config(db=db, dt=dt, approach=approach, scenario=scenario, gender=gender, geo=geo, cpg_condition=cpg_condition) attributes = get_attributes(config) cpgs, vals = load_cpg_data(config) num_int = 200 int_begin = 0 int_end = 1 int_shift = (int_end - int_begin) / num_int ints = [] pdf = np.zeros(num_int) for int_id in range(0, num_int): ints.append(int_begin + int_id * int_shift + 0.5 * int_shift) for curr_cpg_vals in vals: for beta in curr_cpg_vals: int_id = math.floor((beta - int_begin) * num_int / (int_end - int_begin + 1.0e-8))