def metrics_helper(human_scores, system_scores):
    """
    This is a helper function that computes some basic metrics
    for the system_scores against the human_scores.
    """

    # compute the kappas
    unweighted_kappa = kappa(human_scores, system_scores)
    # use np.round so this also works when system_scores is a plain array
    quadratic_weighted_kappa = kappa(human_scores,
                                     np.round(system_scores),
                                     weights='quadratic')

    # compute the agreement statistics
    human_system_agreement = agreement(human_scores, system_scores)
    human_system_adjacent_agreement = agreement(human_scores,
                                                system_scores,
                                                tolerance=1)

    # compute the Pearson correlation after removing
    # any cases where either of the scores are NaNs
    df = pd.DataFrame({'human': human_scores,
                       'system': system_scores}).dropna(how='any')
    correlations = pearsonr(df['human'], df['system'])[0]

    # compute the min/max/mean/std. dev. for the system and human scores
    min_system_score = np.min(system_scores)
    min_human_score = np.min(human_scores)

    max_system_score = np.max(system_scores)
    max_human_score = np.max(human_scores)

    mean_system_score = np.mean(system_scores)
    mean_human_score = np.mean(human_scores)

    system_score_sd = np.std(system_scores, ddof=1)
    human_score_sd = np.std(human_scores, ddof=1)

    # compute standardized mean difference as recommended
    # by Williamson et al (2012)
    numerator = mean_system_score - mean_human_score
    denominator = np.sqrt((system_score_sd**2 + human_score_sd**2) / 2)
    SMD = numerator / denominator

    # return everything as a series
    return pd.Series({'kappa': unweighted_kappa,
                      'wtkappa': quadratic_weighted_kappa,
                      'exact_agr': human_system_agreement,
                      'adj_agr': human_system_adjacent_agreement,
                      'SMD': SMD,
                      'corr': correlations,
                      'sys_min': min_system_score,
                      'sys_max': max_system_score,
                      'sys_mean': mean_system_score,
                      'sys_sd': system_score_sd,
                      'h_min': min_human_score,
                      'h_max': max_human_score,
                      'h_mean': mean_human_score,
                      'h_sd': human_score_sd,
                      'N': len(system_scores)})
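# --- Usage sketch (not from the original source) ----------------------------
# A minimal, hypothetical example of calling metrics_helper above. It assumes
# the same imports the function relies on (pandas as pd, numpy as np,
# scipy.stats.pearsonr, skll.metrics.kappa, and an `agreement` helper defined
# elsewhere in the module) are available.
import numpy as np

human = np.array([1, 2, 3, 2, 4, 3])
system = np.array([1, 2, 3, 3, 4, 4])

summary = metrics_helper(human, system)
print(summary[['kappa', 'wtkappa', 'corr', 'SMD', 'N']])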
def stats(list1, list2):
    print "Predictions:"
    print list1
    print list(reversed(list2))  # COMPARABLE ORDER
    print
    list1fl = [class2float(i) for i in list1]
    list2fl = [class2float(i) for i in list(reversed(list2))]
    print list1fl
    print list2fl
    print
    print kappa(list1fl, list2fl)  # http://skll.readthedocs.org/en/latest/_modules/skll/metrics.html
    print
    print list2
def train_model(train, folds):
    y = train.median_relevance.values
    x = train.drop(["median_relevance", "doc_id"], 1).values

    clf = Pipeline([
        ('scl', StandardScaler(copy=True, with_mean=True, with_std=True)),
        ('svm', SVC(C=10.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0,
                    shrinking=True, probability=False, tol=0.001,
                    cache_size=200, class_weight=None, verbose=False,
                    max_iter=-1, random_state=None))
    ])

    scores = []
    for train_index, test_index in cross_validation.StratifiedKFold(
            y=y, n_folds=int(folds), shuffle=True, random_state=42):
        x_train, x_test = x[train_index], x[test_index]
        y_train, y_test = y[train_index], y[test_index]

        clf.fit(x_train, y_train)
        predicted = transform_regression(clf.predict(x_test))

        s = kappa(y_test, predicted, weights="quadratic")
        print s
        scores.append(s)

    warn("cv scores:")
    warn(scores)
    warn(np.mean(scores))
    warn(np.std(scores))

    clf.fit(x, y)
    return clf
def agreementtest(path1, path2):
    # 1. import the labels
    from utils import loadLabels
    label_human = loadLabels(path1, 0, 2)
    label_machine = loadLabels(path2, 0, 2)

    # 2. transfer them into lists
    y = []
    y_pred = []
    for key in label_human:
        y += [label_human[key]]
        y_pred += [label_machine[key]]
    print len(y), len(y_pred)

    # 3. get the raw agreement
    from pandas import DataFrame
    from pandas import crosstab
    result = DataFrame({'y_pred': y_pred, 'y_human': y})
    crosstable = crosstab(result['y_pred'], result['y_human'])
    print crosstable
    acc = float(crosstable['1']['1'] + crosstable['0']['0']) / len(y_pred)
    prec = float(crosstable['1']['1']) / (crosstable['1']['1'] + crosstable['0']['1'])
    recall = float(crosstable['1']['1']) / (crosstable['1']['1'] + crosstable['1']['0'])
    F1_hand = 2 * prec * recall / (prec + recall)

    # 4. use skll to get the kappa
    from skll import metrics
    kappa = metrics.kappa(y, y_pred)
    return crosstable, acc, recall, prec, F1_hand, kappa
def testing(file):
    """
    Test whether the quadratic weighted kappa function is working properly.
    """
    f = open(file, 'r')
    f.readline()
    labels, estimate = [], []
    for row in f:
        label = row.strip().split("\t")[6]
        if random() > 0.5:
            estimate.append(int(4 * int(label) * random()))
        else:
            estimate.append(int(int(label) * random()))
        labels.append(int(label))
    # compare the noisy estimates against the gold labels
    print kappa(labels, estimate, weights='quadratic')
def runSVM(self, y_test, y_train, x_train, x_test):
    clf = svm.LinearSVC(class_weight="auto")
    clf.fit(x_train, y_train)
    direction = clf.coef_.tolist()[0]
    y_pred = clf.predict(x_test)
    y_pred = y_pred.tolist()
    kappa_score = kappa(y_test, y_pred)
    return kappa_score, direction
def print_kappa(self, method, one_off=False):
    mean_kappa_same = []
    mean_kappa_diff = []
    for i in range(0, 50):
        checked_pairs = []
        checked_pairs_same = []
        checked_pairs_diff = []
        kappas_same = []
        kappas_diff = []
        # calculating agreement for pairs from the same batches and different batches
        while len(checked_pairs_same) < 20 or len(checked_pairs_diff) < 20:
            id1 = random.choice(self.ids)
            id2 = random.choice(self.ids)
            pair = sorted([id1, id2])
            if pair not in checked_pairs and id1 != id2:
                values_first = self.get_rating_values(id1)
                values_second = self.get_rating_values(id2)
                if len(values_first) != len(values_second) or len(values_first) == 0:
                    continue
                if method == 'standard':
                    kappa = metrics.kappa(values_first, values_second)
                else:
                    kappa = metrics.kappa(values_first, values_second, method, one_off)
                if self.batch_hash[id1] == self.batch_hash[id2]:
                    kappas_same.append(kappa)
                    checked_pairs_same.append(pair)
                else:
                    kappas_diff.append(kappa)
                    checked_pairs_diff.append(pair)
                checked_pairs.append(pair)
        mean_kappa_same.append(numpy.mean(kappas_same))
        mean_kappa_diff.append(numpy.mean(kappas_diff))
    print("Kappa same group: " + str(numpy.mean(mean_kappa_same)) +
          " different groups: " + str(numpy.mean(mean_kappa_diff)))
    print("Confidence same: " +
          str(stats.norm.interval(0.999, loc=numpy.mean(mean_kappa_same),
                                  scale=numpy.std(mean_kappa_same) / math.sqrt(50))) +
          " different: " +
          str(stats.norm.interval(0.999, loc=numpy.mean(mean_kappa_diff),
                                  scale=numpy.std(mean_kappa_diff) / math.sqrt(50))))
def eval(self):
    sys.stderr.write('Evaluating\n')
    folds = StratifiedKFold(y=self.y_train, n_folds=self.folds,
                            shuffle=True, random_state=1337)
    scores = []
    for train_index, test_index in folds:
        self.fit(train_index)
        predicted, y_test = self.predict(test_index)
        k = kappa(y_test, transform(predicted), weights='quadratic')
        print(k)
        scores.append(k)
    print(scores)
    print(np.mean(scores))
    print(np.std(scores))
def evalerror(preds, dtrain):
    labels = dtrain.get_label()
    # round predictions to whole scores before computing kappa
    preds = np.round(preds, 0)
    # return a pair (metric_name, result);
    # preds are margins (before the logistic transformation, cutoff at 0)
    return 'kappa', 1.0 - kappa(labels, preds, weights='quadratic')
def kNNClass(train_idx, test_idx, n_neighbors):
    training_data = input_kmers_counts.loc[train_idx]
    testing_data = input_kmers_counts.loc[test_idx]
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
    clf.fit(training_data[kmer_colums], training_data["class"])
    # print "predicting"
    predicted_classes = clf.predict(testing_data[kmer_colums])
    # compute kappa stat
    confusion_matrix(testing_data["class"], predicted_classes)
    # make a mapping
    class_map = dict(zip(set(testing_data["class"]), range(0, 4)))
    kapp = kappa([class_map[x] for x in testing_data["class"]],
                 [class_map[x] for x in predicted_classes])
    cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes),
                               robjects.FactorVector(testing_data["class"]))
    return kapp, cm
def get_average_kappa(arr_act, arr_pred):
    """
    Calculates the average quadratic kappa over the entire essay set
    """
    assert(len(arr_act) == len(arr_pred))
    total = len(arr_act)
    kappa_val = 0
    for i in xrange(0, total):
        kappa_val += met.kappa([arr_act[i]], [arr_pred[i]], 'quadratic')
        # print arr_act[i], '-', arr_pred[i]
    kappa_val = float(kappa_val) / float(total)
    return kappa_val
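# --- Comparison sketch (not from the original source) -----------------------
# get_average_kappa above averages kappa over one-element pairs, which is a
# degenerate use of the statistic; for comparison, this hedged sketch computes
# a single corpus-level quadratic-weighted kappa over the full arrays using
# skll.metrics.kappa.
from skll.metrics import kappa as skll_kappa

actual_scores = [2, 3, 4, 3, 2]
predicted_scores = [2, 3, 3, 3, 1]

print(skll_kappa(actual_scores, predicted_scores, weights='quadratic'))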
def kNNClass(train_idx, test_idx, n_neighbors, k_mer_subset):
    logger.info('computing for %s' % (k_mer_subset))
    training_subset = normalized_counts.loc[train_idx][np.append(k_mer_subset, "class")]
    testing_subset = normalized_counts.loc[test_idx][np.append(k_mer_subset, "class")]
    clf = neighbors.KNeighborsClassifier(n_neighbors, weights="uniform")
    clf.fit(training_subset[k_mer_subset], training_subset["class"])
    # print "predicting"
    predicted_classes = clf.predict(testing_subset[k_mer_subset])
    # compute kappa stat
    confusion_matrix(testing_subset["class"], predicted_classes)
    # make a mapping
    class_map = dict(zip(set(testing_subset["class"]), range(0, 4)))
    kapp = kappa([class_map[x] for x in testing_subset["class"]],
                 [class_map[x] for x in predicted_classes])
    cm = caret.confusionMatrix(robjects.FactorVector(predicted_classes),
                               robjects.FactorVector(testing_subset["class"]))
    logger.info("Finished for %s with kappa==%f" % (k_mer_subset, kapp))
    return kapp, cm
def accuracy_stats(Ypred, Ytest):
    stats = {}
    statkeys = ['AA', 'AP', 'f1', 'recall', 'kappa']
    for key in statkeys:
        stats[key] = []
    for ypred, ytest in zip(Ypred, Ytest):
        stats['AA'].append(accuracy_score(ytest.ravel(), ypred.ravel()))
        stats['AP'].append(precision_score(ytest.ravel(), ypred.ravel()))
        stats['f1'].append(f1_score(ytest.ravel(), ypred.ravel()))
        stats['recall'].append(recall_score(ytest.ravel(), ypred.ravel()))
        stats['kappa'].append(kappa(ytest.ravel(), ypred.ravel()))
    return stats
def scores(X, y, y_proba, name="nan", to_plot=False):
    # print(name + ' Classifier:\n {}'.format(metrics.classification_report(X, y)))
    cm = metrics.confusion_matrix(X, y)
    print cm
    if to_plot:
        plt_cm(X, y, [-1, 1])
        auc_compute(X, y)
    auc = roc_auc_score(X, y_proba)
    print(name + ' Classifier auc: %f' % auc)
    accuracy = metrics.accuracy_score(X, y)
    print(name + ' Classifier accuracy: %f' % (accuracy))
    f1 = metrics.f1_score(X, y, pos_label=1)
    print(name + ' Classifier f1: %f' % (f1))
    precision = metrics.precision_score(X, y)
    print(name + ' Classifier precision_score: %f' % (precision))
    recall = metrics.recall_score(X, y)
    print(name + ' Classifier recall_score: %f' % (recall))
    kappa_score = kappa(X, y)
    print(name + ' Classifier kappa_score: %f' % (kappa_score))
    return [auc, f1.mean(), accuracy.mean(), precision.mean(), recall.mean(), kappa_score]
def train_model(train, folds): y = train.median_relevance.values x = train.drop(["median_relevance", "doc_id"], 1).values xg_params = { "silent": 1, "objective": "reg:linear", "nthread": 4, "bst:max_depth": 10, "bst:eta": 0.1, "bst:subsample": 0.5 } num_round = 600 scores = [] for train_index, test_index in cross_validation.StratifiedKFold( y=y, n_folds=int(folds), shuffle=True, random_state=42): x_train, x_test = x[train_index], x[test_index] y_train, y_test = y[train_index], y[test_index] xg_train = xg.DMatrix(x_train, label=y_train) xg_test = xg.DMatrix(x_test, label=y_test) watchlist = [(xg_train, "train"), (xg_test, "test")] bst = xg.train(xg_params, xg_train, num_round, watchlist, feval=evalerror) predicted = transform_regression(bst.predict(xg_test)) s = kappa(y_test, predicted, weights="quadratic") print s scores.append(s) warn("cv scores:") warn(scores) warn(np.mean(scores)) warn(np.std(scores))
def run(self):
    """ run Forrest """
    for loopcount in range(self.ntasks):
        seed = time.time()
        resultsline = []
        # do stuff
        training = random.sample(range(self.matrix.shape[0]),
                                 int(self.trainingratio * self.matrix.shape[0]))
        testing = list(set(range(self.matrix.shape[0])).difference(training))
        print self.matrix[training, :].shape, len([targ[i] for i in training]), self.matrix[testing, :].shape
        clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
        clf.fit(self.matrix[training, :].todense(), [targ[i] for i in training])
        #~ clf.fit(self.matrix[training,:], [targ[i] for i in training])
        classes = clf.predict(self.matrix[testing, :].todense())
        #~ classes = clf.predict(self.matrix[testing,:])
        # print(confusion_matrix(classes, [targ[i] for i in testing]))
        # print(kappa(classes, [targ[i] for i in testing]))
        resultsline = []
        resultsline = resultsline + info_log
        resultsline.append(seed)
        resultsline.append(kappa(classes, [targ[i] for i in testing]))
        conf = []
        for row in confusion_matrix(classes, [targ[i] for i in testing]):
            conf = conf + list(row)
        resultsline = resultsline + conf
        # store the results in the results queue once all the contigs are processed
        self.result_queue.put([str(item) for item in resultsline])
        # sys.stdout.write("Done with worker for %d tasks: %d loops done\n" % (self.ntasks, loopcount + 1))
        line = line.split()
        if str(g) in line[1]:
            batches[int(line[0])].append(line)

print "### gamma=%d ###" % g
moy = 0
for classifier in batches:
    classes = []
    predicted = []
    for contig in classifier:
        # print contig[1], contig[3], contig[4]
        classes.append(int(contig[3]))
        predicted.append(int(contig[4]))
    kappas.append(kappa(classes, predicted))

print "all classifiers:"
print "mean kappa = %f" % np.mean(kappas)
print "standard deviation = %f" % np.std(kappas)

best_batchid = kappas.index(max(kappas))
print "best kappa = %f" % kappas[best_batchid]

print "loading best classifier..."
with open(clffile, "r") as clffh:
    bestclf = pickle.load(clffh)[best_batchid]

# Re-run the prediction with the best classifier
print "loading data..."
def test(self, samples, test_labels, label_names=None):
    # test each using held-out data
    test = samples
    # if test_labels is None:
    #     return self.predict(test_samples)
    label_test = test_labels

    print("\nTesting...")
    print "Test Samples:", len(test)

    classes = []
    p_count = 0
    avg_class_err = []

    avg_err = self.test_network(test, label_test)
    predictions = self.predict_network(test)

    for i in range(0, len(label_test)):
        p_count += 1
        classes.append(label_test[i].tolist())

    predictions = np.round(predictions, 3).tolist()

    actual = []
    pred = []
    cor = []

    # get the percent correct for the predictions
    # how often the prediction is right when it is made
    for i in range(0, len(predictions)):
        c = classes[i].index(max(classes[i]))
        actual.append(c)
        p = predictions[i].index(max(predictions[i]))
        pred.append(p)
        cor.append(int(c == p))

    # calculate a naive unfair baseline using averages
    avg_class_pred = np.mean(label_test, 0)
    print "Predicting:", avg_class_pred, "for baseline*"
    for i in range(0, len(label_test)):
        res = FFNNet.AverageCrossEntropy(np.array(avg_class_pred), np.array(classes[i]))
        avg_class_err.append(res)
        # res = RNN_GRU.AverageCrossEntropy(np.array(predictions_GRU[i]), np.array(classes[i]))
        # avg_err_GRU.append(res)
    print "*This is calculated from the TEST labels"

    from sklearn.metrics import roc_auc_score, f1_score
    from skll.metrics import kappa

    kpa = []
    auc = []
    f1s = []

    t_pred = du.transpose(predictions)
    t_lab = du.transpose(label_test)
    for i in range(0, len(t_lab)):
        # if i == 0 or i == 3:
        #     t_pred[i] = du.normalize(t_pred[i], method='max')
        kpa.append(kappa(t_lab[i], t_pred[i]))
        auc.append(roc_auc_score(t_lab[i], t_pred[i]))
        temp_p = [round(j) for j in t_pred[i]]
        if np.nanmax(temp_p) == 0:
            f1s.append(0)
        else:
            f1s.append(f1_score(t_lab[i], temp_p))

    print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_class_err))
    print "\nNetwork Performance:"
    print "Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_err))
    print "AUC:", "{0:.4f}".format(np.nanmean(auc))
    print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
    print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
    print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor) * 100)

    print "\n{:<15}".format(" Label"), \
        "{:<9}".format(" AUC"), \
        "{:<9}".format(" Kappa"), \
        "{:<9}".format(" F Stat"), \
        "\n=============================================="
    if label_names is None or len(label_names) != len(t_lab):
        label_names = []
        for i in range(0, len(t_lab)):
            label_names.append("Label " + str(i + 1))
    for i in range(0, len(t_lab)):
        print "{:<15}".format(label_names[i]), \
            "{:<9}".format(" {0:.4f}".format(auc[i])), \
            "{:<9}".format(" {0:.4f}".format(kpa[i])), \
            "{:<9}".format(" {0:.4f}".format(f1s[i]))
    print "\n=============================================="

    actual = []
    predicted = []
    for i in range(0, len(predictions)):
        actual.append(label_test[i].tolist().index(max(label_test[i])))
        predicted.append(predictions[i].index(max(predictions[i])))

    from sklearn.metrics import confusion_matrix
    print confusion_matrix(actual, predicted)

    return predictions
input_kmers_counts["class"]=all_classes input_kmers_counts["species"]=all_species normalized_counts["class"]=all_classes normalized_counts["species"]=all_species # PCA normalized_counts[kmer_colums]=scale(normalized_counts[kmer_colums]) normalized_counts[kmer_colums].apply(scipy.mean,0) normalized_counts[kmer_colums].apply(scipy.std,0) non_zero=(normalized_counts[kmer_colums]!=0).apply(scipy.sum,0) too_abundant_kmers=list(non_zero.order()[-10:].index) kmer_colums_filt=list(set(kmer_colums).difference(too_abundant_kmers)) pca_trans=PCA(n_components=160) pca_fitted=pca_trans.fit(normalized_counts[kmer_colums]) pca_coord=pca_fitted.transform(normalized_counts[kmer_colums]) # SVM classification and kappa estimates X_train,X_test,Y_train,Y_test=train_test_split(pca_coord,normalized_counts["class"],test_size=0.5,random_state=421) clf=SVC(C=4.152687927300392,gamma=0.002448996894369464,kernel='rbf') clf.fit(X_train,Y_train) predictions=clf.predict(X_test) mmat=confusion_matrix(predictions,Y_test) print mmat class_map=dict(zip(set(input_kmers_counts["class"]),range(0,4))) kappa([class_map[x] for x in Y_test],[class_map[x] for x in predictions])
def test_invalid_weighted_kappa():
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=False)
    kappa([1, 2, 1], [1, 2, 1], weights='invalid', allow_off_by_one=True)
def test_invalid_lists_kappa():
    kappa(['a', 'b', 'c'], ['a', 'b', 'c'])
def quadratic_kappa(true, predicted):
    if params.REGRESSION:
        return kappa(true, predicted, weights='quadratic')
    else:
        return kappa(true, np.argmax(predicted, axis=1), weights='quadratic')
def check_kappa(y_true, y_pred, weights, allow_off_by_one, expected):
    assert_almost_equal(kappa(y_true, y_pred, weights=weights,
                              allow_off_by_one=allow_off_by_one), expected)
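# --- Illustrative call (not from the original tests) ------------------------
# A hypothetical invocation of the check_kappa helper above: two identical
# rating lists have perfect agreement, so unweighted kappa is expected to be 1.0.
check_kappa([1, 2, 3], [1, 2, 3], None, False, 1.0)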
    predictions = np.round(predictions)
    print min(predictions)
    print max(predictions)

    # Get error and best iteration
    kappa_score = kappa(test_Y, predictions, weights='quadratic')
    print "Kappa: %f" % kappa_score
    print "Confusion matrix:"
    print confusion_matrix(test_Y, predictions)
    print "Classification report:"
    print classification_report(test_Y, predictions)

    errors.append(kappa_score)
    best_iterations.append(bst.best_iteration)

# Append new grid error
grid_errors.append(np.mean(errors))
grid_best_iterations.append(list(best_iterations))

# Show results
for i in xrange(len(params_space)):
#     train_test_split(x_all, y_all, train_size=0.8)
#
# # Train the models
# for m in models.keys():
#     models[m]['cl'].fit(x_train, y_train)
#
# # Predict
# for t in models.keys():
#     models[t]['pred'] = models[t]['cl'].predict(x_test)
#
#     tmp = metrics.kappa(y_test, models['rf']['pred'])
#     print(tmp)
#     kappa.append(tmp)
# print(sum(kappa) / len(kappa))

tmp = metrics.kappa(y_test, models['rf']['pred'])
print(tmp)
# quit()

# Visualize
models_num = len(models)
fig, axes = plt.subplots(nrows=2, ncols=models_num, squeeze=False)

if True:
    # Build the confusion matrices
    for (name, cm), ax in zip([(x['title'], x['cm']) for x in models.values()],
                              axes.flat[:models_num]):
        m = ax.matshow(cm, cmap='Oranges')
        ax.set_title(name)
def quadratic_kappa(true, predicted):
    return kappa(true, predicted, weights='quadratic')
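# --- Usage sketch (not from the original source) ----------------------------
# Hypothetical inputs for quadratic_kappa above: integer gold scores against
# model output rounded to the nearest integer score.
import numpy as np

gold = [1, 2, 3, 4, 4]
raw_predictions = np.array([1.3, 2.2, 2.6, 3.8, 4.1])  # assumed regression output

print(quadratic_kappa(gold, np.round(raw_predictions).astype(int)))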
training_ratio = 0.8
training_set = indices[0:int(n_rows * training_ratio)]
# hold out the remaining rows for testing
testing_set = indices[int(n_rows * training_ratio):]

training_data = input_kmers_counts.loc[training_set]
testing_data = input_kmers_counts.loc[testing_set]

clf = neighbors.KNeighborsClassifier(15, weights="uniform")
clf.fit(training_data[count_colums], training_data["class"])
# print "predicting"
predicted_classes = clf.predict(testing_data[count_colums])

# compute kappa stat
confusion_matrix(testing_data["class"], predicted_classes)
# make a mapping
class_map = dict(zip(set(testing_data["class"]), range(0, 4)))
kappa([class_map[x] for x in testing_data["class"]],
      [class_map[x] for x in predicted_classes])

# fit a KNN on the normalized_counts
# kNNClass(training_set, testing_set, 15, count_colums)

# We focus on the ambiguous k-mers, approx 15k; basically all k-mers that appear more than once
ambiguous_kmers = all_nodes_df[all_nodes_df["degree"] > 2]
len(set(ambiguous_kmers['kmer']))
len(set(all_nodes_df['kmer']))

# We do a PCA on that
amb_kmers_counts = pandas.pivot_table(ambiguous_kmers, values="degree",
                                      index=['sequence_description'], columns=["kmer"],
X = iris.data[:, :2]  # we only take the first two features. We could avoid this
                      # ugly slicing by using a two-dim dataset
y = iris.target

training = random.sample(range(150), 100)
testing = list(set(range(150)).difference(training))

X_train = iris.data[training, :]
y_train = [y[i] for i in training]
X_test = iris.data[testing, :]
y_test = [y[i] for i in testing]

clf3 = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf3.fit(X_train, y_train)
print "kappa =", kappa(y_test, clf3.predict(X_test))

metric = LMNN(X_train, y_train)
metric.fit()
new_X_train = metric.transform()
new_X_test = metric.transform(X_test)

clf4 = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
clf4.fit(new_X_train, y_train)
print "kappa =", kappa(y_test, clf4.predict(new_X_test))

# ## Create color maps
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
def test(self, test, test_labels=None, label_names=None):
    if test_labels is None:
        return self.predict(test)

    test_cpy = list(test)
    if not du.len_deepest(test_cpy) == self.num_input:
        if self.covariates is not None:
            for a in range(0, len(test_cpy)):
                if type(test_cpy[a]) is not list:
                    test_cpy[a] = test_cpy[a].tolist()
                for e in range(0, len(test_cpy[a])):
                    c = []
                    for i in range(0, len(self.covariates)):
                        c.append(test_cpy[a][e][self.covariates[i]])
                    test_cpy[a][e] = c

    if len(self.cov_mean) == 0 or len(self.cov_stdev) == 0:
        print "Scaling factors have not been generated: calculating using test sample"
        t_tr = du.transpose(RNN.flatten_sequence(test_cpy))
        self.cov_mean = []
        self.cov_stdev = []
        for a in range(0, len(t_tr)):
            mn = np.nanmean(t_tr[a])
            sd = np.nanstd(t_tr[a])
            self.cov_mean.append(mn)
            self.cov_stdev.append(sd)

    test_samples = []
    import math
    for a in range(0, len(test_cpy)):
        sample = []
        for e in range(0, len(test_cpy[a])):
            covariates = []
            for i in range(0, len(test_cpy[a][e])):
                cov = 0
                if self.cov_stdev[i] == 0:
                    cov = 0
                else:
                    cov = (test_cpy[a][e][i] - self.cov_mean[i]) / self.cov_stdev[i]
                if math.isnan(cov) or math.isinf(cov):
                    cov = 0
                covariates.append(cov)
            sample.append(covariates)
        test_samples.append(sample)

    label_test = test_labels

    print("\nTesting...")
    print "Test Samples:", len(test_samples)

    classes = []
    p_count = 0
    avg_class_err = []
    avg_err_RNN = []

    if self.scale_output:
        print "Scaling output..."

    predictions_RNN = []
    for i in range(0, len(test_samples)):
        # get the prediction and calculate cost
        prediction_RNN = self.pred_RNN([test_samples[i]])
        # prediction_RNN += .5 - self.avg_preds
        if self.scale_output:
            prediction_RNN -= self.min_preds
            prediction_RNN /= (self.max_preds - self.min_preds)
            prediction_RNN = np.clip(prediction_RNN, 0, 1)
            prediction_RNN = [(x * [1 if c == self.majorityclass else 0.9999
                                    for c in range(0, self.num_output)])
                              if np.sum(x) == 4 else x for x in prediction_RNN]
        avg_err_RNN.append(self.compute_cost_RNN([test_samples[i]], label_test[i]))
        for j in range(0, len(label_test[i])):
            p_count += 1
            classes.append(label_test[i][j].tolist())
            predictions_RNN.append(prediction_RNN[j].tolist())

    predictions_RNN = np.round(predictions_RNN, 3).tolist()

    actual = []
    pred_RNN = []
    cor_RNN = []

    # get the percent correct for the predictions
    # how often the prediction is right when it is made
    for i in range(0, len(predictions_RNN)):
        c = classes[i].index(max(classes[i]))
        actual.append(c)
        p_RNN = predictions_RNN[i].index(max(predictions_RNN[i]))
        pred_RNN.append(p_RNN)
        cor_RNN.append(int(c == p_RNN))

    # calculate a naive baseline using averages
    flattened_label = []
    for i in range(0, len(label_test)):
        for j in range(0, len(label_test[i])):
            flattened_label.append(label_test[i][j])
    flattened_label = np.array(flattened_label)

    avg_class_pred = np.mean(flattened_label, 0)
    print "Predicting:", avg_class_pred, "for baseline*"
    for i in range(0, len(flattened_label)):
        res = RNN.AverageCrossEntropy(np.array(avg_class_pred), np.array(classes[i]))
        avg_class_err.append(res)
        # res = RNN.AverageCrossEntropy(np.array(predictions_RNN[i]), np.array(classes[i]))
        # avg_err_RNN.append(res)
    print "*This is calculated from the TEST labels"

    from sklearn.metrics import roc_auc_score, f1_score
    from skll.metrics import kappa

    kpa = []
    auc = []
    f1s = []
    apr = []

    t_pred = du.transpose(predictions_RNN)
    t_lab = du.transpose(flattened_label)
    for i in range(0, len(t_lab)):
        # if i == 0 or i == 3:
        #     t_pred[i] = du.normalize(t_pred[i], method='max')
        temp_p = [round(j) for j in t_pred[i]]
        kpa.append(kappa(t_lab[i], t_pred[i]))
        apr.append(du.Aprime(t_lab[i], t_pred[i]))
        auc.append(roc_auc_score(t_lab[i], t_pred[i]))
        if np.nanmax(temp_p) == 0:
            f1s.append(0)
        else:
            f1s.append(f1_score(t_lab[i], temp_p))

    if label_names is None or len(label_names) != len(t_lab):
        label_names = []
        for i in range(0, len(t_lab)):
            label_names.append("Label " + str(i + 1))

    RNN.print_label_distribution(label_test, label_names)

    self.eval_metrics = [np.nanmean(avg_err_RNN), np.nanmean(auc),
                         np.nanmean(kpa), np.nanmean(f1s),
                         np.nanmean(cor_RNN) * 100]

    print "\nBaseline Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_class_err))
    print "\nNetwork Performance:"
    print "Average Cross-Entropy:", "{0:.4f}".format(np.nanmean(avg_err_RNN))
    print "AUC:", "{0:.4f}".format(np.nanmean(auc))
    print "A':", "{0:.4f}".format(np.nanmean(apr))
    print "Kappa:", "{0:.4f}".format(np.nanmean(kpa))
    print "F1 Score:", "{0:.4f}".format(np.nanmean(f1s))
    print "Percent Correct:", "{0:.2f}%".format(np.nanmean(cor_RNN) * 100)

    print "\n{:<15}".format(" Label"), \
        "{:<9}".format(" AUC"), \
        "{:<9}".format(" A'"), \
        "{:<9}".format(" Kappa"), \
        "{:<9}".format(" F Stat"), \
        "\n=============================================="
    for i in range(0, len(t_lab)):
        print "{:<15}".format(label_names[i]), \
            "{:<9}".format(" {0:.4f}".format(auc[i])), \
            "{:<9}".format(" {0:.4f}".format(apr[i])), \
            "{:<9}".format(" {0:.4f}".format(kpa[i])), \
            "{:<9}".format(" {0:.4f}".format(f1s[i]))
    print "\n=============================================="

    print "Confusion Matrix:"
    actual = []
    predicted = []
    flattened_label = flattened_label.tolist()
    for i in range(0, len(predictions_RNN)):
        actual.append(flattened_label[i].index(max(flattened_label[i])))
        predicted.append(predictions_RNN[i].index(max(predictions_RNN[i])))

    from sklearn.metrics import confusion_matrix
    conf_mat = confusion_matrix(actual, predicted)
    for cm in conf_mat:
        cm_row = "\t"
        for element in cm:
            cm_row += "{:<6}".format(element)
        print cm_row
    print "\n=============================================="

    return predictions_RNN
def check_kappa(y_true, y_pred, weights, allow_off_by_one, expected):
    assert_almost_equal(
        kappa(y_true, y_pred, weights=weights, allow_off_by_one=allow_off_by_one),
        expected)
training_set = indices[0:int(n_rows * training_ratio)]
# hold out the remaining rows for testing
testing_set = indices[int(n_rows * training_ratio):]

training_data = input_kmers_counts.loc[training_set]
testing_data = input_kmers_counts.loc[testing_set]

clf = neighbors.KNeighborsClassifier(15, weights="uniform")
clf.fit(training_data[count_colums], training_data["class"])
# print "predicting"
predicted_classes = clf.predict(testing_data[count_colums])

# compute kappa stat
confusion_matrix(testing_data["class"], predicted_classes)
# make a mapping
class_map = dict(zip(set(testing_data["class"]), range(0, 4)))
kappa([class_map[x] for x in testing_data["class"]],
      [class_map[x] for x in predicted_classes])

# fit a KNN on the normalized_counts
# kNNClass(training_set, testing_set, 15, count_colums)

# We focus on the ambiguous k-mers, approx 15k; basically all k-mers that appear more than once
ambiguous_kmers = all_nodes_df[all_nodes_df["degree"] > 2]
len(set(ambiguous_kmers['kmer']))
len(set(all_nodes_df['kmer']))

# We do a PCA on that
amb_kmers_counts = pandas.pivot_table(ambiguous_kmers, values="degree",
                                      index=['sequence_description'], columns=["kmer"],
                                      fill_value=0)
kmer_colums = amb_kmers_counts.columns
    documents, classes, train_size=0.7)

classifier = NaiveBayesTextClassifier(
    categories=categories,
    min_df=1,
    lowercase=True,
    stop_words=stopwords.words('english')
)

print('> Train classifier')
classifier.train(train_docs, train_classes)

print('> Classify test data...')
predicted_classes = classifier.classify(test_docs)

print('> Complete.')
print(classification_report(test_classes, predicted_classes))
print('-' * 42)
print("{:<25}: {:>4} articles".format("Test data size", len(test_classes)))
print("{:<25}: {:>6.2f} %".format(
    "Accuracy", 100 * accuracy_score(test_classes, predicted_classes))
)
print("{:<25}: {:>6.2f} %".format(
    "Kappa statistics", 100 * kappa(
        category_to_number(test_classes, categories),
        category_to_number(predicted_classes, categories)
    )
))
print('-' * 42)
# -------------- Classify --------------- #

print("> Start classify data")
start_time = time.time()

if options.test:
    predicted_classes = classifier.classify(test_docs)

    print(classification_report(test_classes, predicted_classes))
    print('-' * 42)
    print("{:<25}: {:>6} articles".format("Test data size", len(test_classes)))
    print("{:<25}: {:>6.2f} %".format(
        "Accuracy", 100 * accuracy_score(test_classes, predicted_classes)))
    print("{:<25}: {:>6.2f} %".format(
        "Kappa statistics", 100 * kappa(test_classes, predicted_classes)))

elif options.predict:
    predicted_classes = classifier.classify(test_data.review)

    print("> Save predicted results")
    print("> {}".format(PREDICTED_DATA_FILE))

    np.savetxt(PREDICTED_DATA_FILE,
               np.concatenate((test_data.values[:, 0:1],
                               np.matrix(predicted_classes).T), axis=1),
               delimiter=',', header='id,sentiment', comments='', fmt="%s")

print('-' * 42)
# clf.fit(training_matrix, training_targets)
# print "predicting"
# classes = clf.predict(testing_matrix)
#
# print(confusion_matrix(classes, testing_targets))
#
# print(kappa(classes, testing_targets))

# print("### with normalization")
for i in range(len(length)):
    matrix[i, :] = matrix[i, :] / length[i]

training_matrix = matrix[training, :]
training_targets = [targ[i] for i in training]
testing_matrix = matrix[testing, :]
testing_targets = [targ[i] for i in testing]

# print "fitting"
clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
clf.fit(training_matrix, training_targets)
# print "predicting"
classes = clf.predict(testing_matrix)

print(confusion_matrix(classes, testing_targets))
print(kappa(classes, testing_targets))
# clf = neighbors.KNeighborsClassifier(n_neighbors, weights="distance")
# clf.fit(training_matrix, training_targets)
# print "predicting"
# classes = clf.predict(testing_matrix)
#
# print(confusion_matrix(classes, testing_targets))
#
# print(kappa(classes, testing_targets))

# print("### with normalization")
for i in range(len(length)):
    matrix[i, :] = matrix[i, :] / length[i]

training_matrix = matrix[training, :]
training_targets = [targ[i] for i in training]
testing_matrix = matrix[testing, :]
testing_targets = [targ[i] for i in testing]

# print "fitting"
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=1.0).fit(training_matrix, training_targets)
# print "predicting"
classes = rbf_svc.predict(testing_matrix)

print(confusion_matrix(classes, testing_targets))
print(kappa(classes, testing_targets))
csvfile = "nbayes-k6_arch_bact_euk_virus_3.csv" for n in nvals: print "### n=%s ###"%str(n) batches = [] for i in range(50): batches.append([]) c15 = 0 with open(csvfile, "r") as fh: for line in fh: line = line.split() if str(n) in line[1]: batches[int(line[0])].append(line) if int(line[3]) == int(line[4]): c15 += 1 moy15 = 0 for classifier in batches: classes = [] predicted = [] for contig in classifier: # print contig[1], contig[3], contig[4] classes.append(int(contig[3])) predicted.append(int(contig[4])) print "kappa =", kappa(classes,predicted) moy15 += kappa(classes,predicted) # print c15, "/", len(batches) # print confusion_matrix(classes,predicted) print "moy = ", moy15/50
features_test = np.delete(features_test, 0, 1)
features_test = np.delete(features_test, len(features_test[1]) - 1, 1)

# print features_train.shape, dummy_train.shape
# print features_test.shape, dummy_test.shape

combined_train = np.column_stack((features_train, dummy_train))
combined_test = np.column_stack((features_test, dummy_test))

# "Essay Set   Classifier   Feature Set   Accuracy"
# print "Set " + str(no)

for i, clf in enumerate(classifiers):
    clf.fit(dummy_train, labels)
    prediction = clf.predict(dummy_train)
    prediction1 = clf.predict(dummy_test)
    a[i].append(kappa(test_labels, prediction1, weights='quadratic'))
    a[i].append(kappa(labels, prediction, weights='quadratic'))
print no, "\tTest\t Stat\t ", a[0][0], '\t', a[0][1]
# print no, "\tTrain\t Stat\t ", a[1][0], '\t', a[1][1]

for i, clf in enumerate(classifiers):
    clf.fit(features_train, labels)
    prediction = clf.predict(features_train)
    prediction1 = clf.predict(features_test)
    a[i].append(kappa(test_labels, prediction1, weights='quadratic'))
    a[i].append(kappa(labels, prediction, weights='quadratic'))
print no, "\tTest\t Prompt\t ", a[0][2], '\t', a[0][3]
# print no, "\tTrain\t Prompt\t ", a[1][2], '\t', a[1][3]

for i, clf in enumerate(classifiers):
    clf.fit(combined_train, labels)