def validate_kfolds(self):
    """Run stratified k-fold validation with nested parameter selection.

    For every outer fold the kernel and the C value are chosen by
    ``self.select_parameters`` on the fold's training part only.  An SVC
    with a precomputed kernel is then fitted and used to score the fold's
    test genes together with every gene of ``self.all_genes`` that is not
    in ``self.training_genes``.  Each test gene's score is turned into the
    fraction of all scored genes whose score it equals or exceeds, and the
    fold AUC is computed from these fractions.

    Reads ``self.all_genes``, ``self.training_genes``,
    ``self.training_labels``, ``self.kernels`` and ``self.n_folds``.

    Returns:
        list: one AUC value per fold, in fold order.
    """
    # NOTE(review): 10e-4 == 1e-3, so this grid is 1e-3, 1e-2, 1e-1, 1.0, 1,
    # 1e2, ..., 1e5 -- the value 1 appears twice and 10 is missing.  It is
    # left untouched because the index returned by select_parameters() is
    # used to index this list.
    list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

    dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

    # Genes outside the training set do not depend on the fold: compute
    # their kernel indices once, using a set for O(1) membership tests.
    training_gene_set = set(self.training_genes)
    background_indices = [
        dict_gene_idx[gene]
        for gene in self.all_genes
        if gene not in training_gene_set
    ]

    aucs = []
    skf = StratifiedKFold(n_splits=self.n_folds, shuffle=False)
    placeholder = np.zeros(len(self.training_labels))
    for train_index, test_index in skf.split(placeholder, self.training_labels):
        training_genes_left = [self.training_genes[idx] for idx in train_index]
        training_labels_left = [self.training_labels[idx] for idx in train_index]
        training_indices = [dict_gene_idx[gene] for gene in training_genes_left]

        test_indices = [
            dict_gene_idx[self.training_genes[idx]] for idx in test_index
        ]
        test_labels_left = [self.training_labels[idx] for idx in test_index]

        # Test genes come first so that their scores are scores[:n_test].
        unknown_indices = test_indices + background_indices

        kernel_idx, c_idx = self.select_parameters(
            training_genes=training_genes_left,
            training_labels=training_labels_left,
        )
        kernel = self.kernels[kernel_idx]
        training_kernel = util.extract_submatrix(
            training_indices, training_indices, kernel)
        unknown_kernel = util.extract_submatrix(
            unknown_indices, training_indices, kernel)

        clf = SVC(C=list_c[c_idx], kernel='precomputed')
        clf.fit(training_kernel, training_labels_left)
        scores = clf.decision_function(unknown_kernel)

        n_scores = len(scores)
        qscores = [
            float(sum(int(s >= value) for value in scores)) / n_scores
            for s in scores[:len(test_indices)]
        ]

        fpr, tpr, _ = metrics.roc_curve(test_labels_left, qscores, pos_label=1)
        aucs.append(metrics.auc(fpr, tpr))
    return aucs
def select_parameters(self, training_genes=None, training_labels=None):
    """Select the kernel and C value by 3-fold stratified cross-validation.

    Every (kernel, C) pair is evaluated on each fold: an SVC with a
    precomputed kernel is fitted on the fold's training part and used to
    score the fold's test genes together with every gene of
    ``self.all_genes`` that is not in ``training_genes``.  Each test
    gene's score is turned into the fraction of all scored genes whose
    score it equals or exceeds; the AUC computed from these fractions is
    summed over the folds.

    Args:
        training_genes: genes available for training; each must be present
            in ``self.all_genes``.
        training_labels: labels aligned with ``training_genes``.

    Returns:
        tuple: ``(kernel_idx, c_idx)`` with the highest summed AUC, where
        ``kernel_idx`` indexes ``self.kernels`` and ``c_idx`` indexes the
        C grid defined below.  On ties the pair encountered first when
        iterating the score dictionary wins.
    """
    # NOTE(review): 10e-4 == 1e-3, so this grid is 1e-3, 1e-2, 1e-1, 1.0, 1,
    # 1e2, ..., 1e5 -- the value 1 appears twice and 10 is missing.  It is
    # left untouched because callers index their own copy of this list with
    # the c_idx returned here.
    list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

    dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

    # Filled kernel-major / C-minor so that tie-breaking in max() follows
    # the same order as the grid.
    dict_paras_auc = {}
    for kernel_idx in range(len(self.kernels)):
        for c_idx in range(len(list_c)):
            dict_paras_auc[(kernel_idx, c_idx)] = 0

    # Genes outside the supplied training set do not depend on the fold:
    # compute their kernel indices once, using a set for O(1) membership.
    training_gene_set = set(training_genes)
    background_indices = [
        dict_gene_idx[gene]
        for gene in self.all_genes
        if gene not in training_gene_set
    ]

    skf = StratifiedKFold(n_splits=3, shuffle=False)
    placeholder = np.zeros(len(training_labels))
    for train_index, test_index in skf.split(placeholder, training_labels):
        training_indices = [
            dict_gene_idx[training_genes[idx]] for idx in train_index
        ]
        training_labels_left = [training_labels[idx] for idx in train_index]

        test_indices = [
            dict_gene_idx[training_genes[idx]] for idx in test_index
        ]
        test_labels_left = [training_labels[idx] for idx in test_index]

        # Test genes come first so that their scores are scores[:n_test].
        unknown_indices = test_indices + background_indices
        n_test = len(test_indices)

        for kernel_idx, kernel in enumerate(self.kernels):
            training_kernel = util.extract_submatrix(
                training_indices, training_indices, kernel)
            unknown_kernel = util.extract_submatrix(
                unknown_indices, training_indices, kernel)

            for c_idx, c in enumerate(list_c):
                clf = SVC(C=c, kernel='precomputed')
                clf.fit(training_kernel, training_labels_left)
                scores = clf.decision_function(unknown_kernel)

                n_scores = len(scores)
                qscores = [
                    float(sum(int(s >= value) for value in scores)) / n_scores
                    for s in scores[:n_test]
                ]

                fpr, tpr, _ = metrics.roc_curve(
                    test_labels_left, qscores, pos_label=1)
                dict_paras_auc[(kernel_idx, c_idx)] += metrics.auc(fpr, tpr)

    return max(dict_paras_auc, key=dict_paras_auc.get)
def validate_leave_one_out(self):
    """Run leave-one-out validation with nested parameter selection.

    Each training gene is held out in turn.  The kernel and the C value
    are chosen by ``self.select_parameters`` on the remaining training
    genes, an SVC with a precomputed kernel is fitted on them, and the
    held-out gene is scored together with every gene of ``self.all_genes``
    that is not in ``self.training_genes``.  The held-out gene's score is
    turned into the fraction of all scored genes whose score it equals or
    exceeds.  A single AUC is computed from these fractions against
    ``self.training_labels``.

    Reads ``self.all_genes``, ``self.training_genes``,
    ``self.training_labels`` and ``self.kernels``; prints a progress line
    per held-out gene.

    Returns:
        The AUC over all held-out genes.
    """
    # NOTE(review): 10e-4 == 1e-3, so this grid is 1e-3, 1e-2, 1e-1, 1.0, 1,
    # 1e2, ..., 1e5 -- the value 1 appears twice and 10 is missing.  It is
    # left untouched because the index returned by select_parameters() is
    # used to index this list.
    list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

    dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

    # Genes outside the training set are the same for every held-out gene:
    # compute their kernel indices once, using a set for O(1) membership.
    training_gene_set = set(self.training_genes)
    background_indices = [
        dict_gene_idx[gene]
        for gene in self.all_genes
        if gene not in training_gene_set
    ]

    all_qscores = []
    for train_g_idx, train_g in enumerate(self.training_genes):
        print('processing gene ', train_g_idx)

        # Copies of the training data without the held-out position.
        training_genes_left = (
            self.training_genes[:train_g_idx]
            + self.training_genes[train_g_idx + 1:]
        )
        training_labels_left = (
            self.training_labels[:train_g_idx]
            + self.training_labels[train_g_idx + 1:]
        )
        training_indices = [dict_gene_idx[gene] for gene in training_genes_left]

        # The held-out gene comes first so that its score is scores[0].
        unknown_indices = [dict_gene_idx[train_g]] + background_indices

        kernel_idx, c_idx = self.select_parameters(
            training_genes=training_genes_left,
            training_labels=training_labels_left,
        )
        kernel = self.kernels[kernel_idx]
        training_kernel = util.extract_submatrix(
            training_indices, training_indices, kernel)
        unknown_kernel = util.extract_submatrix(
            unknown_indices, training_indices, kernel)

        clf = SVC(C=list_c[c_idx], kernel='precomputed')
        clf.fit(training_kernel, training_labels_left)
        scores = clf.decision_function(unknown_kernel)

        held_out_score = scores[0]
        qscore = float(
            sum(int(held_out_score >= value) for value in scores)
        ) / len(scores)
        all_qscores.append(qscore)

    fpr, tpr, _ = metrics.roc_curve(
        self.training_labels, all_qscores, pos_label=1)
    return metrics.auc(fpr, tpr)
def model_selection(list_kernels=None, svm_paras=None, list_labels=None, n_folds=None):
    """Select the kernel and SVM C value with the best k-fold accuracy.

    Every (kernel, C) pair is evaluated on each fold produced by
    ``cross_validation.KFold``: an SVC with a precomputed kernel is fitted
    on the fold's training rows and its prediction accuracy on the fold's
    test rows is added to the pair's running total.

    Args:
        list_kernels: kernel matrices indexed by sample position.
        svm_paras: candidate C values.
        list_labels: labels aligned with the rows of each kernel.
        n_folds: number of cross-validation folds.

    Returns:
        tuple: ``(kernel_idx, svm_idx)`` with the highest summed accuracy,
        indexing ``list_kernels`` and ``svm_paras`` respectively.  On ties
        the pair encountered first when iterating the score dictionary
        wins.
    """
    kf = cross_validation.KFold(len(list_labels), n_folds=n_folds)

    # Filled kernel-major / C-minor so that tie-breaking in max() follows
    # the same order as the grid.
    dict_paras = {}
    for kernel_idx in range(len(list_kernels)):
        for svm_idx in range(len(svm_paras)):
            dict_paras[(kernel_idx, svm_idx)] = 0

    for train_list_index, test_list_index in kf:
        # The labels come from the ``list_labels`` argument; the previous
        # code read an undefined name ``labels`` here.
        labels_train = [list_labels[idx] for idx in train_list_index]
        labels_test = [list_labels[idx] for idx in test_list_index]

        for kernel_idx, kernel in enumerate(list_kernels):
            M_tr = util.extract_submatrix(
                train_list_index, train_list_index, kernel)
            M_te = util.extract_submatrix(
                test_list_index, train_list_index, kernel)

            for svm_idx, svm_para in enumerate(svm_paras):
                clf = svm.SVC(C=svm_para, kernel='precomputed')
                clf.fit(M_tr, labels_train)
                y_predict = clf.predict(M_te)
                dict_paras[(kernel_idx, svm_idx)] += accuracy_score(
                    labels_test, y_predict)

    return max(dict_paras, key=dict_paras.get)
def create_blur_image(image, radius, weight):
    """Recolour every pixel of an image from its neighbourhood and save it.

    The image is opened with ``open_image`` and its pixel data is fetched
    twice with ``get_image_data``: one grid is read from, the other is
    written to.  For each ``(x, y)`` position, ``extract_submatrix`` is
    given a copy of the source grid and returns a sub-matrix with its
    dimensions; ``calculate_new_color`` turns that into the colour stored
    at ``[y][x]`` of the output grid.  The output grid is passed to
    ``save_new_image`` under a file name built from ``radius`` and
    ``weight``, and a confirmation message is printed.

    Args:
        image: forwarded to ``open_image``.
        radius: forwarded to ``extract_submatrix``; also part of the output
            file name.
        weight: forwarded to ``extract_submatrix`` and
            ``calculate_new_color``; also part of the output file name.
    """
    img, width, height = open_image(image)
    source_pixels = get_image_data(img)
    output_pixels = get_image_data(img)

    for col in range(width):
        for row in range(height):
            # A fresh copy of the source grid is handed over for every pixel.
            neighbourhood = extract_submatrix(
                col, row, height, width, source_pixels.copy(), radius, weight)
            window, window_height, window_width = neighbourhood
            output_pixels[row][col] = calculate_new_color(
                window, weight, window_height, window_width)

    output_name = f'test-image-blur-radius-{radius}-weight-{weight}.png'
    save_new_image(output_pixels, output_name)
    print('Image successfully changed.')
"""Cross validation""" all_avg_accs = [] f = open(save_file, 'w') for ran_idx in range(10): print "Random ", ran_idx shuffle_indices = range(len(pre_labels)) random.shuffle(shuffle_indices) labels = [pre_labels[idx] for idx in shuffle_indices] list_kernels = [] for kernel in pre_list_kernels: list_kernels.append( util.extract_submatrix(shuffle_indices, shuffle_indices, kernel)) kf = cross_validation.KFold(len(labels), n_folds=n_folds) list_train_fold_index = [] list_test_fold_index = [] for train_index, test_index in kf: list_train_fold_index.append(train_index) list_test_fold_index.append(test_index) list_accs = [] for fold_idx in range(n_folds): train_list_index = list_train_fold_index[fold_idx] test_list_index = list_test_fold_index[fold_idx]
def evaluate(adjacency_path=None, node_label_folder=None, all_gene_path=None,
             train_gene_folder=None, train_label_folder=None, n_iters=None,
             n_hops=None, n_clusters=None, svm_paras=None, save_folder=None,
             n_diseases=12):
    """Leave-one-out evaluation over a grid of kernel and SVM parameters.

    For every combination of ``n_cluster``, ``n_iter`` and ``n_hop`` a
    matrix ``G`` is obtained from ``util.deepwl`` and used as a
    precomputed SVM kernel.  For every disease each training gene is held
    out in turn; an SVC is fitted on the remaining training genes for each
    value in ``svm_paras`` and used to score the held-out gene together
    with all genes that are not in the disease's training list.  The
    held-out gene's score is turned into the fraction of all scored genes
    that it strictly exceeds, and one AUC per SVM parameter is computed
    from these fractions.

    Results are written to ``save_folder + str(disease_idx)``, one line per
    SVM parameter in the form
    ``<n_cluster>_<n_iter>_<n_hop>_<svm index>:\\t<auc>``.  The first write
    to a file during a call truncates it and later parameter combinations
    are appended, so results of every combination are kept.  (The file
    used to be reopened in ``'w'`` mode for every combination, leaving
    only the last combination's lines.)

    Args:
        adjacency_path: passed to ``util.create_graph``.
        node_label_folder: prefix; ``str(n_cluster)`` is appended to form
            the label path given to ``util.node_labeling``.
        all_gene_path: file listing all genes; a gene's position in this
            list is used as its row/column index in ``G``.
        train_gene_folder: prefix; ``str(disease_idx)`` is appended to form
            the path of a disease's training gene list.
        train_label_folder: prefix; ``str(disease_idx)`` is appended to
            form the path of a disease's label list (values are converted
            with ``int``).
        n_iters: iterable of ``r`` values for ``WLVectorizer``.
        n_hops: iterable of ``n_hop`` values for ``util.deepwl``.
        n_clusters: iterable of cluster counts (node-label file suffixes).
        svm_paras: sequence of SVM ``C`` values.
        save_folder: prefix of the per-disease output files.
        n_diseases: number of diseases; indices ``0 .. n_diseases - 1``
            are processed.  Defaults to 12, the previously fixed value.
    """
    all_genes = util.load_list_from_file(all_gene_path)
    number_svm_parameters = len(svm_paras)
    dict_gene_idx = {gene: idx for idx, gene in enumerate(all_genes)}
    n_all_genes = len(all_genes)

    graph = util.create_graph(adjacency_path=adjacency_path)

    # Output files already written during this call; they are appended to
    # instead of being truncated again.
    written_paths = set()

    for n_cluster in n_clusters:
        util.node_labeling(g=graph, label_path=node_label_folder + str(n_cluster))

        for n_iter in n_iters:
            WLvect = WLVectorizer(r=n_iter)
            iters_features = WLvect.transform([graph])
            # Sum the feature matrices of iterations 0 .. n_iter.
            M = iters_features[0][0]
            for iter_id in range(1, n_iter + 1):
                M = M + iters_features[iter_id][0]
            print('Done WL compuation')
            sys.stdout.flush()

            for n_hop in n_hops:
                print('Begining DWL compuation')
                sys.stdout.flush()
                G = util.deepwl(graph=graph, feature_matrix=M, n_hop=n_hop)
                # Formatted so the output is the same under Python 2 and 3.
                print("Size of G %s" % (G.shape,))
                print('Done DWL compuation')
                sys.stdout.flush()

                for disease_idx in range(n_diseases):
                    list_training_genes = util.load_list_from_file(
                        train_gene_folder + str(disease_idx))
                    list_training_labels = [
                        int(e) for e in util.load_list_from_file(
                            train_label_folder + str(disease_idx))
                    ]

                    all_training_idx = [
                        dict_gene_idx[g] for g in list_training_genes
                    ]
                    # Indices outside the training list are the same for
                    # every held-out gene (remaining training indices plus
                    # the held-out index always form this set), so build
                    # the list once per disease.
                    training_idx_set = set(all_training_idx)
                    background_idx = [
                        idx for idx in range(n_all_genes)
                        if idx not in training_idx_set
                    ]

                    list_qscores = [[] for _ in range(number_svm_parameters)]
                    for pos, held_out_idx in enumerate(all_training_idx):
                        training_genes_idx = (
                            all_training_idx[:pos] + all_training_idx[pos + 1:]
                        )
                        list_training_labels_del = (
                            list_training_labels[:pos]
                            + list_training_labels[pos + 1:]
                        )
                        # The held-out gene comes first: its score is scores[0].
                        unknown_genes_idx = [held_out_idx] + background_idx

                        Mtr = util.extract_submatrix(
                            training_genes_idx, training_genes_idx, G)
                        M_unknown = util.extract_submatrix(
                            unknown_genes_idx, training_genes_idx, G)

                        for idx_svm, svm_para in enumerate(svm_paras):
                            clf = svm.SVC(C=svm_para, kernel='precomputed')
                            clf.fit(Mtr, list_training_labels_del)
                            scores = clf.decision_function(M_unknown)
                            held_out_score = scores[0]
                            qscore = float(
                                sum(int(held_out_score > val) for val in scores)
                            ) / len(scores)
                            list_qscores[idx_svm].append(qscore)

                    # computing auc
                    save_lines = []
                    for qscores_idx, qscores in enumerate(list_qscores):
                        fpr, tpr, _ = metrics.roc_curve(
                            list_training_labels, qscores, pos_label=1)
                        auc = metrics.auc(fpr, tpr)
                        save_lines.append(
                            str(n_cluster) + "_" + str(n_iter) + "_"
                            + str(n_hop) + "_" + str(qscores_idx)
                            + ":\t" + str(auc) + "\n")

                    save_path = save_folder + str(disease_idx)
                    mode = 'a' if save_path in written_paths else 'w'
                    with open(save_path, mode) as f:
                        f.writelines(save_lines)
                    written_paths.add(save_path)