コード例 #1
0
ファイル: evaluation.py プロジェクト: dinhinfotech/DiGI
    def validate_kfolds(self):
        """Evaluate gene ranking with nested stratified k-fold cross-validation.

        For each of ``self.n_folds`` outer folds the kernel and SVM ``C`` are
        chosen by ``self.select_parameters`` using only the training part of
        the fold.  An SVM is then fitted on that training part and used to
        score the held-out genes together with every gene that has no
        training label.  Each held-out gene receives a q-score (the fraction
        of scored genes whose decision value is not larger than its own), and
        the fold AUC is computed from those q-scores.

        Returns:
            list: one AUC value per outer fold, in fold order.
        """
        # NOTE: ``10e-4`` equals 1e-3 (not 1e-4), so this grid is
        # 1e-3 .. 1e5, contains 1.0 twice (``10e-1`` and ``1``) and never
        # tries C=10.  Only the *index* into the grid is exchanged with
        # select_parameters, so the two grids must stay identical.
        list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

        dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

        # Genes without a training label are the same for every fold, so
        # collect them once; the set makes each membership test O(1).
        labelled_genes = set(self.training_genes)
        unlabelled_indices = [dict_gene_idx[gene]
                              for gene in self.all_genes
                              if gene not in labelled_genes]

        aucs = []
        skf = StratifiedKFold(n_splits=self.n_folds, shuffle=False)
        # The feature matrix passed to split() is a placeholder: only the
        # labels are used for stratification.
        for train_index, test_index in skf.split(np.zeros(len(self.training_labels)), self.training_labels):
            training_genes_left = [self.training_genes[idx] for idx in train_index]
            training_indices = [dict_gene_idx[gene] for gene in training_genes_left]
            training_labels_left = [self.training_labels[idx] for idx in train_index]

            test_indices = [dict_gene_idx[self.training_genes[idx]] for idx in test_index]
            test_labels_left = [self.training_labels[idx] for idx in test_index]

            # Held-out genes come first so their scores can be sliced off
            # the front of the decision-function output below.
            unknown_indices = test_indices + unlabelled_indices

            (kernel_idx, c_idx) = self.select_parameters(training_genes=training_genes_left,
                                                         training_labels=training_labels_left)

            training_kernel = util.extract_submatrix(training_indices, training_indices, self.kernels[kernel_idx])
            unknown_kernel = util.extract_submatrix(unknown_indices, training_indices, self.kernels[kernel_idx])

            clf = SVC(C=list_c[c_idx], kernel='precomputed')
            clf.fit(training_kernel, training_labels_left)

            scores = clf.decision_function(unknown_kernel)
            n_scores = len(scores)

            # q-score: share of all scored genes ranked at or below this one.
            qscores = [float(sum(int(s >= value) for value in scores)) / n_scores
                       for s in scores[:len(test_indices)]]

            fpr, tpr, thresholds = metrics.roc_curve(test_labels_left, qscores, pos_label=1)
            aucs.append(metrics.auc(fpr, tpr))

        return aucs
コード例 #2
0
ファイル: evaluation.py プロジェクト: dinhinfotech/DiGI
    def select_parameters(self, training_genes=None, training_labels=None):
        """Select the kernel and SVM ``C`` with the best 3-fold summed AUC.

        Every combination of a kernel from ``self.kernels`` and a value of
        the ``C`` grid is evaluated with a 3-fold stratified split of the
        given training data.  In each fold the held-out genes are scored
        together with all genes that are not in ``training_genes``, turned
        into q-scores, and the resulting AUC is added to the combination's
        running total.

        Args:
            training_genes: genes available for model selection; each must
                occur in ``self.all_genes``.
            training_labels: labels aligned with ``training_genes``.

        Returns:
            tuple: ``(kernel_idx, c_idx)`` of the combination with the
            largest summed AUC; ``c_idx`` indexes the ``C`` grid below.
        """
        # NOTE: ``10e-4`` equals 1e-3 (not 1e-4), so this grid is
        # 1e-3 .. 1e5, contains 1.0 twice (``10e-1`` and ``1``) and never
        # tries C=10.  Callers receive an index into this grid, so their
        # own copy of the grid must stay identical to this one.
        list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

        dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

        dict_paras_auc = {}
        for kernel_idx in range(len(self.kernels)):
            for c_idx in range(len(list_c)):
                dict_paras_auc[(kernel_idx, c_idx)] = 0

        # Genes outside the training set do not depend on the fold, so
        # collect them once; the set makes each membership test O(1).
        labelled_genes = set(training_genes)
        unlabelled_indices = [dict_gene_idx[gene]
                              for gene in self.all_genes
                              if gene not in labelled_genes]

        skf = StratifiedKFold(n_splits=3, shuffle=False)
        # The feature matrix passed to split() is a placeholder: only the
        # labels are used for stratification.
        for train_index, test_index in skf.split(np.zeros(len(training_labels)), training_labels):
            training_indices = [dict_gene_idx[training_genes[idx]] for idx in train_index]
            training_labels_left = [training_labels[idx] for idx in train_index]

            test_indices = [dict_gene_idx[training_genes[idx]] for idx in test_index]
            test_labels_left = [training_labels[idx] for idx in test_index]

            # Held-out genes come first so their scores can be sliced off
            # the front of the decision-function output below.
            unknown_indices = test_indices + unlabelled_indices
            n_test = len(test_indices)

            for kernel_idx, kernel in enumerate(self.kernels):
                training_kernel = util.extract_submatrix(training_indices, training_indices, kernel)
                unknown_kernel = util.extract_submatrix(unknown_indices, training_indices, kernel)

                for c_idx, c in enumerate(list_c):
                    clf = SVC(C=c, kernel='precomputed')
                    clf.fit(training_kernel, training_labels_left)

                    scores = clf.decision_function(unknown_kernel)
                    n_scores = len(scores)

                    # q-score: share of all scored genes ranked at or
                    # below this held-out gene.
                    qscores = [float(sum(int(s >= value) for value in scores)) / n_scores
                               for s in scores[:n_test]]

                    fpr, tpr, thresholds = metrics.roc_curve(test_labels_left, qscores, pos_label=1)
                    dict_paras_auc[(kernel_idx, c_idx)] += metrics.auc(fpr, tpr)

        return max(dict_paras_auc, key=dict_paras_auc.get)
コード例 #3
0
ファイル: evaluation.py プロジェクト: dinhinfotech/DiGI
    def validate_leave_one_out(self):
        """Evaluate gene ranking with leave-one-out cross-validation.

        Each training gene is held out in turn.  The kernel and SVM ``C``
        are chosen by ``self.select_parameters`` on the remaining training
        genes, an SVM is fitted on them, and the held-out gene is scored
        together with every gene that has no training label.  Its q-score is
        the fraction of those scored genes whose decision value is not
        larger than its own.

        Returns:
            The AUC computed from the q-scores of all held-out genes against
            ``self.training_labels``.
        """
        # NOTE: ``10e-4`` equals 1e-3 (not 1e-4), so this grid is
        # 1e-3 .. 1e5, contains 1.0 twice (``10e-1`` and ``1``) and never
        # tries C=10.  Only the *index* into the grid is exchanged with
        # select_parameters, so the two grids must stay identical.
        list_c = [10e-4, 10e-3, 10e-2, 10e-1, 1, 10e+1, 10e+2, 10e+3, 10e+4]

        dict_gene_idx = {gene: idx for idx, gene in enumerate(self.all_genes)}

        # Genes without a training label are the same for every held-out
        # gene, so collect them once; the set makes each test O(1).
        labelled_genes = set(self.training_genes)
        unlabelled_indices = [dict_gene_idx[gene]
                              for gene in self.all_genes
                              if gene not in labelled_genes]

        all_qscores = []
        for train_g_idx, train_g in enumerate(self.training_genes):
            print('processing gene ', train_g_idx)

            # Work on copies so the instance's training data is untouched.
            training_genes_left = self.training_genes[:]
            del training_genes_left[train_g_idx]
            training_labels_left = self.training_labels[:]
            del training_labels_left[train_g_idx]

            training_indices = [dict_gene_idx[gene] for gene in training_genes_left]

            # The held-out gene goes first so its score is scores[0].
            unknown_indices = [dict_gene_idx[train_g]] + unlabelled_indices

            (kernel_idx, c_idx) = self.select_parameters(training_genes=training_genes_left,
                                                         training_labels=training_labels_left)

            training_kernel = util.extract_submatrix(training_indices, training_indices, self.kernels[kernel_idx])
            unknown_kernel = util.extract_submatrix(unknown_indices, training_indices, self.kernels[kernel_idx])

            clf = SVC(C=list_c[c_idx], kernel='precomputed')
            clf.fit(training_kernel, training_labels_left)

            scores = clf.decision_function(unknown_kernel)
            held_out_score = scores[0]
            qscore = float(sum(int(held_out_score >= value) for value in scores)) / len(scores)
            all_qscores.append(qscore)

        fpr, tpr, thresholds = metrics.roc_curve(self.training_labels, all_qscores, pos_label=1)
        auc = metrics.auc(fpr, tpr)

        return auc
コード例 #4
0
def model_selection(list_kernels=None,
                    svm_paras=None,
                    list_labels=None,
                    n_folds=None):
    """Choose the kernel / SVM ``C`` pair with the best cross-validated accuracy.

    Every combination of a precomputed kernel matrix from ``list_kernels``
    and a ``C`` value from ``svm_paras`` is evaluated with ``n_folds``-fold
    cross-validation; the accuracies of the folds are summed per
    combination.

    Args:
        list_kernels: precomputed kernel matrices whose rows/columns are
            aligned with ``list_labels``.
        svm_paras: candidate values for the SVM ``C`` parameter.
        list_labels: class labels, one per kernel row.
        n_folds: number of cross-validation folds.

    Returns:
        tuple: ``(kernel_idx, svm_idx)`` of the combination with the
        largest summed accuracy.
    """
    kf = cross_validation.KFold(len(list_labels), n_folds=n_folds)

    dict_paras = {}
    for kernel_idx in range(len(list_kernels)):
        for svm_idx in range(len(svm_paras)):
            dict_paras[(kernel_idx, svm_idx)] = 0

    for train_list_index, test_list_index in kf:
        # Index the labels that were passed in.  The previous code read a
        # module-level ``labels`` here, which ignored ``list_labels`` and
        # failed (or silently used the wrong labels) unless a matching
        # global happened to exist.
        labels_train = [list_labels[idx] for idx in train_list_index]
        labels_test = [list_labels[idx] for idx in test_list_index]

        for kernel_idx, kernel in enumerate(list_kernels):
            # Rows select the samples to score, columns the training
            # samples, as required for a precomputed-kernel SVM.
            M_tr = util.extract_submatrix(train_list_index, train_list_index,
                                          kernel)
            M_te = util.extract_submatrix(test_list_index, train_list_index,
                                          kernel)

            for svm_idx, svm_para in enumerate(svm_paras):
                clf = svm.SVC(C=svm_para, kernel='precomputed')
                clf.fit(M_tr, labels_train)

                y_predict = clf.predict(M_te)
                dict_paras[(kernel_idx, svm_idx)] += accuracy_score(
                    labels_test, y_predict)

    return max(dict_paras, key=dict_paras.get)
コード例 #5
0
def create_blur_image(image, radius, weight):
    """Blur ``image`` pixel by pixel and save the result as a PNG file.

    For every pixel a neighbourhood is extracted from the source pixel data
    and reduced to a single colour, which is written to the output pixel
    data.  The output is saved as
    ``test-image-blur-radius-<radius>-weight-<weight>.png``.

    Args:
        image: passed to ``open_image``.
        radius: forwarded to ``extract_submatrix``.
        weight: forwarded to ``extract_submatrix`` and
            ``calculate_new_color``.
    """
    img, width, height = open_image(image)

    # Source pixels are read from one buffer and results written to a
    # second one (presumably independent objects -- confirm against
    # get_image_data).
    source_pixels = get_image_data(img)
    blurred_pixels = get_image_data(img)

    for col in range(width):
        for row in range(height):
            neighbourhood = extract_submatrix(col, row, height, width,
                                              source_pixels.copy(), radius, weight)
            window, window_height, window_width = neighbourhood

            blurred_pixels[row][col] = calculate_new_color(
                window, weight, window_height, window_width)

    output_name = f'test-image-blur-radius-{radius}-weight-{weight}.png'
    save_new_image(blurred_pixels, output_name)
    print('Image successfully changed.')
コード例 #6
0
"""Cross validation"""

# Accumulator declared for the whole experiment; nothing is appended to it
# within the visible part of this script.
all_avg_accs = []
# Output file for the results; it is not written to or closed within the
# visible part of this script.
f = open(save_file, 'w')
# Repeat the experiment 10 times, each time on a fresh random permutation
# of the samples.
for ran_idx in range(10):
    print "Random ", ran_idx

    # NOTE: relies on Python 2, where range() returns a list that
    # random.shuffle can permute in place.
    shuffle_indices = range(len(pre_labels))
    random.shuffle(shuffle_indices)

    # Apply the same permutation to the labels and to the rows and columns
    # of every kernel matrix so that they stay aligned.
    labels = [pre_labels[idx] for idx in shuffle_indices]
    list_kernels = []
    for kernel in pre_list_kernels:
        list_kernels.append(
            util.extract_submatrix(shuffle_indices, shuffle_indices, kernel))

    kf = cross_validation.KFold(len(labels), n_folds=n_folds)

    # Materialise the train/test index arrays of every fold so they can be
    # looked up by fold number below.
    list_train_fold_index = []
    list_test_fold_index = []

    for train_index, test_index in kf:
        list_train_fold_index.append(train_index)
        list_test_fold_index.append(test_index)
    list_accs = []

    for fold_idx in range(n_folds):
        train_list_index = list_train_fold_index[fold_idx]
        test_list_index = list_test_fold_index[fold_idx]
コード例 #7
0
def evaluate(adjacency_path=None,
             node_label_folder=None,
             all_gene_path=None,
             train_gene_folder=None,
             train_label_folder=None,
             n_iters=None,
             n_hops=None,
             n_clusters=None,
             svm_paras=None,
             save_folder=None,
             n_diseases=12):
    """Run leave-one-out evaluation over a grid of kernel parameters.

    For every combination of ``n_clusters`` x ``n_iters`` x ``n_hops`` a
    kernel matrix ``G`` is built from the graph.  For each disease every
    training gene is held out in turn, an SVM is fitted on the remaining
    training genes for each value in ``svm_paras``, and the held-out gene
    is scored together with all non-training genes.  One AUC per SVM
    parameter is written to ``save_folder + str(disease_idx)``.

    Args:
        adjacency_path: passed to ``util.create_graph``.
        node_label_folder: path prefix; ``str(n_cluster)`` is appended to
            obtain the node-label file.
        all_gene_path: file listing all genes; a gene's position in the
            list is used as its row/column index in the kernel matrix.
        train_gene_folder: path prefix; the disease index is appended to
            obtain the file of training genes.
        train_label_folder: path prefix; the disease index is appended to
            obtain the file of training labels (parsed with ``int``).
        n_iters: iterable of ``r`` values for ``WLVectorizer``.
        n_hops: iterable of ``n_hop`` values for ``util.deepwl``.
        n_clusters: iterable of cluster counts selecting the label file.
        svm_paras: sequence of candidate SVM ``C`` values.
        save_folder: path prefix of the per-disease result files.
        n_diseases: number of diseases; files ``0 .. n_diseases - 1`` are
            read and written.  Defaults to 12, the previously hard-coded
            value.
    """
    all_genes = util.load_list_from_file(all_gene_path)
    number_svm_parameters = len(svm_paras)
    n_genes = len(all_genes)

    dict_gene_idx = {}
    for idx, gene in enumerate(all_genes):
        dict_gene_idx[gene] = idx

    graph = util.create_graph(adjacency_path=adjacency_path)

    for n_cluster in n_clusters:
        util.node_labeling(g=graph,
                           label_path=node_label_folder + str(n_cluster))
        for n_iter in n_iters:

            WLvect = WLVectorizer(r=n_iter)
            iters_features = WLvect.transform([graph])
            # Sum the feature matrices of iterations 0 .. n_iter.
            M = iters_features[0][0]
            for iter_id in range(1, n_iter + 1):
                M = M + iters_features[iter_id][0]
            # Single-argument parenthesised prints produce the same output
            # under the Python 2 print statement and Python 3 print().
            print('Done WL compuation')
            sys.stdout.flush()

            for n_hop in n_hops:
                print('Begining DWL compuation')
                sys.stdout.flush()
                G = util.deepwl(graph=graph, feature_matrix=M, n_hop=n_hop)
                print("Size of G %s" % (G.shape,))

                print('Done DWL compuation')
                sys.stdout.flush()

                for disease_idx in range(n_diseases):
                    list_training_genes = util.load_list_from_file(
                        train_gene_folder + str(disease_idx))
                    list_training_labels = [
                        int(e) for e in util.load_list_from_file(
                            train_label_folder + str(disease_idx))
                    ]
                    # One q-score list per SVM parameter, filled in
                    # training-gene order.
                    list_qscores = [[] for _ in range(number_svm_parameters)]

                    for gene_idx, gene in enumerate(list_training_genes):
                        held_out_idx = dict_gene_idx[gene]

                        list_training_genes_del = list_training_genes[:]
                        del list_training_genes_del[gene_idx]
                        training_genes_idx = [
                            dict_gene_idx[g] for g in list_training_genes_del
                        ]
                        # Set copy for O(1) membership tests in the scan
                        # over all genes below.
                        training_idx_set = set(training_genes_idx)

                        list_training_labels_del = list_training_labels[:]
                        del list_training_labels_del[gene_idx]

                        # Held-out gene first (its score is scores[0]),
                        # followed by every gene not used for training.
                        unknown_genes_idx = [held_out_idx]
                        unknown_genes_idx.extend(
                            idx for idx in range(n_genes)
                            if idx not in training_idx_set
                            and idx != held_out_idx)

                        Mtr = util.extract_submatrix(training_genes_idx,
                                                     training_genes_idx, G)
                        M_unknown = util.extract_submatrix(
                            unknown_genes_idx, training_genes_idx, G)

                        for idx_svm, svm_para in enumerate(svm_paras):
                            clf = svm.SVC(C=svm_para, kernel='precomputed')
                            clf.fit(Mtr, list_training_labels_del)
                            scores = clf.decision_function(M_unknown)
                            # Strict comparison: the fraction of scored
                            # genes ranked strictly below the held-out
                            # gene (it does not count itself).
                            qscore = float(
                                sum(int(scores[0] > val)
                                    for val in scores)) / len(scores)
                            list_qscores[idx_svm].append(qscore)

                    # computing auc
                    save_lines = []
                    for qscores_idx, qscores in enumerate(list_qscores):
                        fpr, tpr, thresholds = metrics.roc_curve(
                            list_training_labels, qscores, pos_label=1)
                        auc = metrics.auc(fpr, tpr)

                        prefix = "_".join(
                            str(part) for part in (n_cluster, n_iter, n_hop,
                                                   qscores_idx))
                        save_lines.append(prefix + ":\t" + str(auc) + "\n")

                    # NOTE(review): mode 'w' truncates the file, so for each
                    # disease only the results of the last
                    # (n_cluster, n_iter, n_hop) combination survive even
                    # though every line carries the combination as a prefix
                    # -- confirm whether appending was intended.
                    with open(save_folder + str(disease_idx), 'w') as f:
                        f.writelines(save_lines)