def svmClassify(X_train, y_train, X_test, y_test, iteration):
    print("******************* SVM classification *********************\n")
    svm_model = svm.SVC(C=1, gamma=0.1)

    start_train_svm = time.time()
    svm_model.fit(X_train, y_train)
    end_train_svm = time.time()
    training_time_svm = end_train_svm - start_train_svm
    print("Training SVM model %d took %.5f\n" % (iteration, training_time_svm))

    predict_train_svm = svm_model.predict(X_train)
    print("training accuracy")
    print(accuracy_score(y_train, predict_train_svm))
    print("\n")

    start_test_svm = time.time()
    predict_test_svm = svm_model.predict(X_test)
    end_test_svm = time.time()
    testing_time_svm = end_test_svm - start_test_svm
    print("Testing SVM model %d took %.5f\n" % (iteration, testing_time_svm))
    print("testing accuracy")
    print(accuracy_score(y_test, predict_test_svm))
    print("\n")

    return training_time_svm, testing_time_svm
def gaussianProcess(X_train, y_train, X_test, y_test, iteration):
    print("************ Gaussian Process Classification **************\n")
    gp_rbf_fix = GaussianProcessClassifier(kernel=76.5**2 * RBF(length_scale=179),
                                           optimizer=None)

    start_train_gp = time.time()
    gp_rbf_fix.fit(X_train, y_train)
    end_train_gp = time.time()
    training_time_gp = end_train_gp - start_train_gp
    print("Training GP model %d took %.5f\n" % (iteration, training_time_gp))

    predict_train_gp = gp_rbf_fix.predict(X_train)
    print("training accuracy")
    print(accuracy_score(y_train, predict_train_gp))
    print("\n")

    start_test_gp = time.time()
    predict_test_gp = gp_rbf_fix.predict(X_test)
    end_test_gp = time.time()
    testing_time_gp = end_test_gp - start_test_gp
    # Bug fix: report the testing time, not the training time.
    print("Testing GP model %d took %.5f\n" % (iteration, testing_time_gp))
    print("testing accuracy")
    print(accuracy_score(y_test, predict_test_gp))
    print("\n")

    return training_time_gp, testing_time_gp
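# A minimal driver sketch for the two benchmark functions above, assuming
# synthetic data from sklearn's make_classification. The data loading here is
# an illustration, not the original experiment's setup.
import time

from sklearn import svm
from sklearn.datasets import make_classification
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, n_features=10, random_state=0)
for iteration in range(3):
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.25, random_state=iteration)
    svmClassify(X_train, y_train, X_test, y_test, iteration)
    gaussianProcess(X_train, y_train, X_test, y_test, iteration)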
def main():
    samplesIMG, labels = prepare_samples()
    (trainSamples, testSamples,
     trainLabels, testLabels) = train_test_split(samplesIMG, labels,
                                                 test_size=0.25, random_state=42)

    print("TREE")
    testResults = tree(trainSamples, trainLabels, testSamples)
    accTree = accuracy_score(testLabels.argmax(axis=1), testResults.argmax(axis=1))

    print("FLAT")
    testResults = flat_network(trainSamples, trainLabels, testSamples)
    accFlat = accuracy_score(testLabels.argmax(axis=1), testResults.argmax(axis=1))

    print("CNN")
    testResults = cnn_network(trainSamples, trainLabels, testSamples)
    accCnn = accuracy_score(testLabels.argmax(axis=1), testResults.argmax(axis=1))

    print("Accuracy TREE: {}".format(accTree))
    print("Accuracy FLAT: {}".format(accFlat))
    print("Accuracy CNN: {}".format(accCnn))
    plot_accuracy((accTree, accFlat, accCnn))
def build_metrics(y_train, p_train, y_test, p_test, dataset):
    metrics = {}
    if y_train is not None and p_train is not None:
        y_train_argmax = y_train if dataset.binary else np.argmax(y_train, axis=1)
        p_train_argmax = p_train.round() if dataset.binary else np.argmax(p_train, axis=1)
        # accuracy_score expects (y_true, y_pred); the result is symmetric,
        # but keep the conventional argument order.
        metrics['final_train_accuracy'] = accuracy_score(y_train_argmax,
                                                         p_train_argmax)
    else:
        logging.getLogger(__name__).warning(
            "No training data available during report generation.")

    if y_test is not None and p_test is not None:
        y_test_argmax = y_test if dataset.binary else np.argmax(y_test, axis=1)
        p_test_argmax = p_test.round() if dataset.binary else np.argmax(p_test, axis=1)
        metrics['accuracy'] = accuracy_score(y_test_argmax, p_test_argmax)
        metrics['confusion_matrix'] = confusion_matrix(y_test_argmax, p_test_argmax)
        metrics['classes'] = []
        p, r, f1, s = precision_recall_fscore_support(y_test_argmax, p_test_argmax,
                                                      average=None)
        for (i, l) in enumerate(dataset.label_names):
            metrics['classes'].append({
                'name': l,
                'recall': r[i],
                'precision': p[i],
                'f1': f1[i],
                'support': s[i]
            })
        if dataset.binary:
            metrics['precision_recall_curve'] = precision_recall_curve(y_test, p_test)
            metrics['roc_curve'] = roc_curve(y_test, p_test)
        else:
            metrics['curves'] = []
            for (i, l) in enumerate(dataset.label_names):
                metrics['curves'].append({
                    'name': l,
                    'precision_recall_curve': precision_recall_curve(y_test[:, i],
                                                                     p_test[:, i]),
                    'roc_curve': roc_curve(y_test[:, i], p_test[:, i])
                })
    else:
        logging.getLogger(__name__).warning(
            "No test data available during report generation.")
    return metrics
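# A minimal, hypothetical call to build_metrics on a 3-class toy problem,
# assuming numpy and the sklearn metrics used above are already imported.
# The SimpleNamespace stands in for whatever dataset object the caller
# actually uses; only the .binary and .label_names attributes are needed.
import numpy as np
from types import SimpleNamespace

dataset = SimpleNamespace(binary=False, label_names=['a', 'b', 'c'])
y = np.eye(3)[[0, 1, 2, 1]]        # one-hot ground-truth labels
p = np.eye(3)[[0, 1, 1, 1]] * 0.9  # mock probability outputs
report = build_metrics(y, p, y, p, dataset)
print(report['accuracy'], [c['f1'] for c in report['classes']])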
def by_class_evaluation(attack_test_y, target_y, p, attack_test_x, labels=None):
    if labels is None:
        labels = np.unique(target_y)
    precisions = [
        precision_score(attack_test_y[target_y == c], p[target_y == c]) * 100
        for c in np.unique(target_y)
    ]
    accuracies = [
        accuracy_score(attack_test_y[target_y == c], p[target_y == c]) * 100
        for c in np.unique(target_y)
    ]
    f1_scores = [
        f1_score(attack_test_y[target_y == c], p[target_y == c]) * 100
        for c in np.unique(target_y)
    ]
    recalls = [
        recall_score(attack_test_y[target_y == c], p[target_y == c]) * 100
        for c in np.unique(target_y)
    ]
    # Per-class target-model accuracy, restricted to attack_test_y == 1
    # ("train") and attack_test_y == 0 ("test") samples respectively.
    c_train_accs = [
        accuracy_score(
            target_y[np.logical_and(target_y == c, attack_test_y == 1)],
            np.argmax(attack_test_x[np.logical_and(target_y == c,
                                                   attack_test_y == 1)],
                      axis=1)) * 100
        for c in np.unique(target_y)
    ]
    c_test_accs = [
        accuracy_score(
            target_y[np.logical_and(target_y == c, attack_test_y == 0)],
            np.argmax(attack_test_x[np.logical_and(target_y == c,
                                                   attack_test_y == 0)],
                      axis=1)) * 100
        for c in np.unique(target_y)
    ]
    x = PrettyTable()
    x.float_format = '.2'
    x.add_column("Class", labels)
    x.add_column("Target Accuracy Train", np.round(c_train_accs, 2))
    x.add_column("Target Accuracy Test", np.round(c_test_accs, 2))
    x.add_column("Attack Precision", np.round(precisions, 2))
    x.add_column("Attack Accuracy", np.round(accuracies, 2))
    x.add_column("Attack Recall", np.round(recalls, 2))
    x.add_column("Attack F-1 Score", np.round(f1_scores, 2))
    x.add_column(
        "Percentage of Data",
        np.round(np.array([
            len(target_y[target_y == c]) / len(target_y) * 100
            for c in np.unique(target_y)
        ]), 2))
    print(x.get_string(title='Per Class Evaluation'))
def metric_scores(self, estimator, testt, testlabelt):
    y_pred = estimator.predict(testt)
    training_manCV.secret_cm.append(
        metrics.confusion_matrix(testlabelt, y_pred).flatten())
    training_manCV.secret_score.append(accuracy_score(testlabelt, y_pred))
    return accuracy_score(testlabelt, y_pred)
def get_ada():
    tree = DecisionTreeClassifier(max_depth=None)
    ada = AdaBoostClassifier(base_estimator=tree, n_estimators=300,
                             learning_rate=0.1)
    ada.fit(X_train, y_train)
    y_train_pred = ada.predict(X_train)
    y_test_pred = ada.predict(X_test)
    ada_train_score = accuracy_score(y_train, y_train_pred)
    ada_test_score = accuracy_score(y_test, y_test_pred)
    print('AdaBoost train/test accuracies: %.4f/%.4f'
          % (ada_train_score, ada_test_score))
def get_rf():
    rf = RandomForestClassifier(n_estimators=200, max_depth=None,
                                random_state=1, bootstrap=True)
    rf.fit(X_train, y_train)
    y_train_pred = rf.predict(X_train)
    y_test_pred = rf.predict(X_test)
    y_train_score = accuracy_score(y_train, y_train_pred)
    y_test_score = accuracy_score(y_test, y_test_pred)
    print('Random Forest train/test accuracies: %.4f/%.4f'
          % (y_train_score, y_test_score))
def eval(self, fold, print_result=True):
    assert fold < self.loader.get_n_splits(), \
        "fold >= {}".format(self.loader.get_n_splits())
    X_train, y_train, X_test, y_test = self.loader.get_train_test_xy(fold)
    gpc = self.gpc_dict[fold]
    train_acc = accuracy_score(y_train, gpc.predict(X_train))
    test_acc = accuracy_score(y_test, gpc.predict(X_test))
    if print_result:
        print("Fold: {}, Kernel: {}".format(fold, gpc.kernel))
        print("Train Acc: {}".format(train_acc))
        print("Test Acc: {}".format(test_acc))
        print("=" * 10)
    return train_acc, test_acc
def classification_example():
    nsamples = 500
    periods = 500
    loss = np.zeros((periods,))

    df = DataFactory()
    df.create_circles(nsamples)
    # df.create_moons(nsamples)
    train_x, train_y = df.get_train_samples()

    net = create_net(2, 2)
    net_loss = layers.CrossEntropyLayer()
    opt = optims.RMSPropOptim(net.named_parameters(), lr=options["lr"],
                              weight_decay=options["weight_decay"],
                              beta=options["beta"])

    # begin to train
    for j in range(periods):
        opt.zero_grad()
        y = net.forward(train_x)
        l = net_loss(y, train_y)
        loss[j] = l.data[0, 0]
        l.backward()
        opt.step()

    # plot the training loss
    plt.plot(loss)
    plt.show()

    # report the training-set result
    with sn.no_grad():
        predict_y = net(train_x)
        l = net_loss(predict_y, train_y)
        print("train set loss:", l.item())
        print("train set accuracy score",
              accuracy_score(np.argmax(train_y.data, axis=1),
                             np.argmax(predict_y.data, axis=1)))

    # report the test-set result
    test_x, test_y = df.get_test_samples()
    with sn.no_grad():
        predict_y = net(test_x)
        l = net_loss(predict_y, test_y)
        print("\ntest set loss:", l.item())
        print("test set accuracy score",
              accuracy_score(np.argmax(test_y.data, axis=1),
                             np.argmax(predict_y.data, axis=1)))
def run():
    data = BaiduQA(conf.baiduQA_pt)
    train_ys, test_ys, train_xs, test_xs = data.split()
    print("n(train)=%d, n(test)=%d" % (train_xs.shape[0], test_xs.shape[0]))

    lr = LogisticRegression()
    print("begin training")
    lr.fit(train_xs, train_ys)

    train_predicts = lr.predict(train_xs)
    test_predicts = lr.predict(test_xs)
    train_acc = accuracy_score(train_ys, train_predicts)
    test_acc = accuracy_score(test_ys, test_predicts)
    print("train_acc=%f, test_acc=%f" % (train_acc, test_acc))
def run_experiment(clf_cls, loader, fold, print_result=True, **kwargs):
    X_train, y_train, X_test, y_test = loader.get_train_test_xy(fold)
    clf = clf_cls(**kwargs)
    clf.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, clf.predict(X_train))
    test_acc = accuracy_score(y_test, clf.predict(X_test))
    if print_result:
        print("{}, fold: {}, params: {}".format(clf_cls.__name__, fold, kwargs))
        print("Train Acc: {}".format(train_acc))
        print("Test Acc: {}".format(test_acc))
        print("=" * 10)
    return train_acc, test_acc
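# A hypothetical driver for run_experiment. The loader protocol is assumed
# from the snippet above: anything exposing get_train_test_xy(fold) works,
# so a tiny stub over random iris splits stands in for the real data loader.
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC


class StubLoader:
    def get_train_test_xy(self, fold):
        X, y = load_iris(return_X_y=True)
        X_tr, X_te, y_tr, y_te = train_test_split(X, y, random_state=fold)
        return X_tr, y_tr, X_te, y_te


loader = StubLoader()
for fold in range(3):
    run_experiment(RandomForestClassifier, loader, fold, n_estimators=100)
    run_experiment(SVC, loader, fold, C=1.0, gamma=0.1)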
def train_repeat_forest(self, seed, train, trainlabel, test, testlabel,
                        number_trees, number_features, repeat_times):
    seed_of_tree = {
        'rf': RandomForestClassifier(n_estimators=number_trees,
                                     max_features=number_features),
        'adb': AdaBoostClassifier(n_estimators=number_trees),
        'bag': BaggingClassifier(n_estimators=number_trees),
        'ext': ExtraTreesClassifier(n_estimators=number_trees,
                                    max_features=number_features),
        'gbt': GradientBoostingClassifier(n_estimators=number_trees,
                                          max_features=number_features),
        'bagging': RandomForestClassifier(n_estimators=number_trees,
                                          max_features=12)
    }
    rawforest = seed_of_tree[seed]
    score_list = []
    for i in np.arange(repeat_times):
        forest = rawforest.fit(train, trainlabel)
        outputtest = forest.predict(test)
        # test-set accuracy for this repeat (the original misleadingly
        # named this variable `accuracytrain`)
        accuracy_test = accuracy_score(testlabel, outputtest)
        score_list.append(accuracy_test)
    score = np.mean(score_list)
    return score
def agnews_bembmeans(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_agnews_data(size=sample)
    if sample:
        test_size = int(round(np.sum(2000 * df.category.value_counts().values / 32000)))
    else:
        test_size = 2000 * 4

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_sents = DataframeSentences(train_df, cols=['title', 'description'])
    vect = ClusteredEmbeddingsVectorizer(n_clusters=50000).fit(train_sents)
    train_docs = DataframeSentences(train_df, cols=['title', 'description'],
                                    flatten=True)
    test_docs = DataframeSentences(test_df, cols=['title', 'description'],
                                   flatten=True)
    X_train = vect.transform(train_docs)
    y_train = train_df.category
    X_test = vect.transform(test_docs)
    y_test = test_df.category

    model = LogisticRegression()
    grid = GridSearchCV(model,
                        {'C': [.0001, .0003, .001, .003, .01, .03, .1, .3,
                               1, 3, 10, 30, 100]},
                        n_jobs=n_procs, verbose=1, cv=5)
    grid.fit(X_train, y_train)

    print(accuracy_score(y_test, grid.best_estimator_.predict(X_test)),
          grid.best_params_)
def testforest_confu(self, test, testlabel, forest):
    outputtest = forest.predict(test)
    accuracytest = accuracy_score(testlabel, outputtest)
    print("The size of the test set is")
    print(np.shape(test))
    print("The accuracy for the test set is %r" % accuracytest,
          "and the confusion matrix is")
    # confusion_matrix expects (y_true, y_pred)
    print(confusion_matrix(testlabel, outputtest))
    # output the classification report
    print(classification_report(testlabel, outputtest))
    # generate class probabilities
    output_proba = forest.predict_proba(test)
    out_perfor = {
        'Classprob0': output_proba[:, 0],
        'Classprob1': output_proba[:, 1],
        'Classprob2': output_proba[:, 2],  # was output_proba[:, 1], a copy-paste slip
        'output': outputtest,
        'target': testlabel
    }
    outframe = DataFrame(out_perfor)
    # outframe.to_csv(r'D:\allprob.csv', header=0)  # optionally save the probabilities
    return accuracytest
def deep_belief_network_prediction(
    self,
    learning_rate,
    training_iterations,
    testing_iterations=10,
    hidden_layer_sizes_array=[10, 10],
):
    accuracy_list = []
    for x in range(testing_iterations):
        self.prepare_training_data_from_csv_data(self.csv_data)
        classifier = SupervisedDBNClassification(
            hidden_layers_structure=hidden_layer_sizes_array,
            learning_rate_rbm=learning_rate / 2,
            learning_rate=learning_rate,
            n_epochs_rbm=int(training_iterations / 10),
            n_iter_backprop=training_iterations,
            batch_size=256,
            activation_function="relu",
            dropout_p=0.2,
        )
        classifier.fit(self.x_data_training, self.y_data_training)
        y_data_prediction = classifier.predict(self.x_data_testing)
        classifier_accuracy = accuracy_score(self.y_data_testing, y_data_prediction)
        accuracy_list.append(classifier_accuracy)
    return max(accuracy_list)
def test_with_unigram_tfidf():
    train_x, train_y, test_x, test_y = get_features('dbn')
    train_x = np.array(train_x, dtype=np.float32)
    train_y = np.array(train_y, dtype=np.int32)
    test_x = np.array(test_x, dtype=np.float32)
    test_y = np.array(test_y, dtype=np.int32)
    print(test_x.shape)

    classifier = SupervisedDBNClassification(
        hidden_layers_structure=[256, 256, 256],
        learning_rate_rbm=0.05,
        learning_rate=0.1,
        n_epochs_rbm=10,
        n_iter_backprop=100,
        batch_size=32,
        activation_function='relu',
        dropout_p=0.2)
    classifier.fit(train_x, train_y)

    accuracies = []
    f_measures = []
    for i in range(1):  # a single evaluation pass; widen the range to repeat
        y_pred = classifier.predict(test_x)
        accuracy = accuracy_score(test_y, y_pred)
        f_measure = f1_score(test_y, y_pred)
        accuracies.append(accuracy)
        f_measures.append(f_measure)

    classifier.save('SentimentClassification.pkl')
    print(accuracies)
    print('Accuracy ', mean(accuracies))
    print('F-measure', mean(f_measures))
    return
def printConfusionMatrix(y_true, y_pred, class_names=None):
    """Print a confusion matrix similar to R's confusionMatrix."""
    # `classification` here is sklearn.metrics.classification, which was
    # removed in scikit-learn 0.24; on current versions use
    # `from sklearn.metrics import confusion_matrix, accuracy_score` instead.
    confMatrix = classification.confusion_matrix(y_true, y_pred)
    accuracy = classification.accuracy_score(y_true, y_pred)
    print('Confusion Matrix (Accuracy {:.4f})\n'.format(accuracy))
    _printConfusionMatrix(confMatrix, class_names)
def test_iris_benchmark():
    data = iris()
    x = add_bias(data['x'])
    y = binarize(data['y'])
    train_split = [12, 39, 23, 5, 3, 29, 49, 47, 21, 30, 34, 48, 20, 45, 31,
                   27, 17, 22, 41, 6, 40, 38, 42, 19, 26, 15, 35, 10, 46, 25,
                   0, 32, 1, 16, 4, 13, 24, 33, 43, 18, 81, 65, 62, 50, 93,
                   92, 53, 58, 87, 55, 70, 72, 83, 56, 52, 73, 78, 64, 68,
                   59, 74, 89, 67, 51, 66, 98, 90, 69, 95, 63, 82, 54, 86,
                   85, 96, 97, 79, 71, 94, 80, 142, 147, 125, 145, 119, 101,
                   141, 105, 129, 138, 122, 120, 139, 124, 134, 111, 148,
                   117, 132, 133, 104, 130, 128, 115, 127, 131, 136, 112,
                   107, 143, 149, 106, 109, 108, 102, 100, 126, 103, 146, 113]
    test_split = [2, 7, 8, 9, 11, 14, 28, 36, 37, 44, 57, 60, 61, 75, 76, 77,
                  84, 88, 91, 99, 110, 114, 116, 118, 121, 123, 135, 137,
                  140, 144]
    xTrain = x[train_split, :]
    yTrain = y[train_split, :]
    xTest = x[test_split, :]
    yTest = y[test_split, :]
    model = OVRClassifier(LogisticModel(rho=1.)).train(xTrain, yTrain,
                                                       verbose=False)
    pred = binarize(model.predict(xTest))
    assert_almost_equal(accuracy_score(yTest, pred), 0.96667, decimal=3)
def predict(self, model, X, y):
    predictions = model.predict_proba(X)
    if np.isfinite(y).all():
        self.accuracy.append(accuracy_score(y, np.argmax(predictions, axis=1)))
    return predictions
def print_test_results(true_labels, pred_labels, pred_probs):
    """Print the prediction results, including accuracy and the AUC value."""
    print('Prediction accuracy: %.2f' % accuracy_score(true_labels, pred_labels))
    print('Prediction AUC: %.4f' % roc_auc_score(true_labels, pred_probs[:, 1]))
    print()
def multi_class_measures(cls, y_true: list, y_predicted: list) -> OrderedDict:
    """Assessment measures for a classification task with multiple classes,
    i.e. a multi-label and/or multi-class task.

    Parameters
    ----------
    y_true : list
        Expected class labels in binary indicator form
    y_predicted : list
        Predicted class labels in binary indicator form

    Returns
    -------
    OrderedDict
        An ordered dictionary of assessment measures
    """
    measures = OrderedDict()
    measures['accuracy'] = accuracy_score(y_true, y_predicted)
    measures['coverage error'] = coverage_error(y_true, y_predicted)
    measures['label ranking loss'] = label_ranking_loss(y_true, y_predicted)
    b_true = np.array(y_true)
    b_pred = np.array(y_predicted)
    measures['unsupported hamming loss'] = (
        np.sum(np.not_equal(b_true, b_pred)) / float(b_true.size))
    measures['label ranking average precision'] = (
        label_ranking_average_precision_score(y_true, y_predicted))
    return measures
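# A toy invocation of multi_class_measures on 3-label indicator data. The
# method is shown above as a classmethod of some unknown evaluator class;
# since `cls` is unused, None is passed for that slot purely for illustration.
y_true = [[1, 0, 1], [0, 1, 0], [1, 1, 0]]
y_pred = [[1, 0, 0], [0, 1, 0], [1, 0, 0]]
for name, value in multi_class_measures(None, y_true, y_pred).items():
    print('%s: %.4f' % (name, value))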
def main():
    diabetes = datasets.fetch_openml('diabetes')
    y = sklearn.preprocessing.LabelEncoder().fit_transform(diabetes['target'])
    X_train, X_test, y_train, y_test = train_test_split(diabetes['data'], y)

    preds = []
    accs = []
    for x in range(0, 500):
        # bagging by hand: each tree sees a fresh random subsample
        model = DecisionTreeClassifier()
        X_train_cur, _, y_train_cur, _ = train_test_split(X_train, y_train)
        model.fit(X_train_cur, y_train_cur)
        y_hat = model.predict_proba(X_test)
        preds.append(y_hat)
        # accuracy of the ensemble averaged over all trees so far
        acc = accuracy_score(y_test, np.argmax(np.mean(preds, axis=0), axis=1))
        accs.append(acc)
        print(acc)

    plt.plot(np.arange(1, 501), accs, label='Accuracy')
    plt.savefig('../figures/ensemble.pdf')
def _validate_model(self, x: np.ndarray, y: np.ndarray,
                    validation_file_name: str = "validation.json") -> dict:
    logging.info("Creating predictions ...")
    y_predicted_categories = self._model.predict(x, batch_size=self._batch_size)
    gc.collect()

    # sklearn.metrics.classification was removed in scikit-learn 0.24;
    # import from sklearn.metrics instead.
    from sklearn.metrics import (accuracy_score, classification_report,
                                 precision_recall_fscore_support)

    y_expected_1dim = self._label_enc.max_category(y)
    y_predicted_1dim = self._label_enc.max_category(y_predicted_categories)

    logging.info("Results:")
    logging.info("{}".format(precision_recall_fscore_support(
        y_true=y_expected_1dim, y_pred=y_predicted_1dim)))
    accuracy = accuracy_score(y_true=y_expected_1dim, y_pred=y_predicted_1dim)
    logging.info("{}".format(accuracy))
    logging.info("\n{}".format(classification_report(
        y_true=y_expected_1dim, y_pred=y_predicted_1dim,
        target_names=["neg", "pos"])))

    results = classification_report(y_true=y_expected_1dim,
                                    y_pred=y_predicted_1dim,
                                    target_names=["neg", "pos"],
                                    output_dict=True)
    results["accuracy"] = accuracy
    write_text_file(file_path=self._experiment_folder / validation_file_name,
                    text=json.dumps(results))
    return results
def yelpstars_bembmeans(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_yelp_stars_data(size=sample)
    if sample:
        test_size = floor(len(df) * 1. / 14)
    else:
        test_size = 10000 * len(df.stars.unique())

    split = StratifiedShuffleSplit(df.stars, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_sents = DataframeSentences(train_df, cols=['text'])
    vect = ClusteredEmbeddingsVectorizer(n_clusters=50000).fit(train_sents)
    train_docs = DataframeSentences(train_df, cols=['text'], flatten=True)
    test_docs = DataframeSentences(test_df, cols=['text'], flatten=True)
    X_train = vect.transform(train_docs)
    y_train = train_df.stars
    X_test = vect.transform(test_docs)
    y_test = test_df.stars

    model = LogisticRegression()
    grid = GridSearchCV(model,
                        {'C': [.0001, .0003, .001, .003, .01, .03, .1, .3,
                               1, 3, 10, 30, 100]},
                        n_jobs=n_procs, verbose=1, cv=5)
    grid.fit(X_train, y_train)

    print(accuracy_score(y_test, grid.best_estimator_.predict(X_test)),
          grid.best_params_)
def debug_accuracy(classifier, x, y, examples):
    predictions = classifier.predict(x)
    errors = []
    false_positive = 0
    false_negative = 0
    for i in range(len(list(x))):
        if predictions[i] != y[i]:
            errors.append((examples[i], y[i], predictions[i]))
            if predictions[i] == 1:
                false_positive += 1
            else:
                false_negative += 1
    # Show at most the first 50 misclassified examples (the original indexed
    # blindly into `errors`, which raises IndexError with fewer than 50).
    for i in range(min(50, len(errors))):
        print("True Label: %s, Prediction: %s, Data: %s, \t Original Data: %s"
              % (errors[i][1], errors[i][2], add_features(errors[i][0]),
                 errors[i][0]))
    print("Accuracy: %f" % accuracy_score(y, predictions))
    print("False positive: %s \t False negative: %s"
          % (false_positive, false_negative))
def main():
    ks = [3, 5, 10, 20]
    mapk = 200
    train, test = load_data()
    train, test = train.values, test.values  # .as_matrix() was removed in pandas 1.0
    x = train.T
    res = np.zeros(9)  # 1 MAP@k score + precision and recall for each of the 4 k's
    for u in range(train.shape[0]):
        y = x[:, u]
        truth = test[u]
        clf = LogisticRegression(random_state=42, C=0.001, solver='lbfgs')
        clf.fit(x, y)
        pred_buy_proba = clf.predict_proba(x)[:, 1].ravel()
        # remove items already bought from the ranking
        pruned_buy_proba = pred_buy_proba - y.ravel()
        pred_order = pruned_buy_proba.argsort()[::-1]
        actual_bought = truth.nonzero()[0]
        score = apk(actual_bought, pred_order, mapk)
        tmp = [score]
        for k in ks:
            tmp.append(prec(actual_bought, pred_order, k))
            tmp.append(recall(actual_bought, pred_order, k))
        res += np.array(tmp)
        if u % 50 == 0:
            print(res / (u + 1))
            print(u, accuracy_score(y, clf.predict(x)))
    return res / (u + 1)
def Predict(self, inp, labels, classifier, folds, name, paramdesc):
    X = inp
    y = labels
    X, y = X[y != 2], y[y != 2]
    n_samples, n_features = X.shape

    ###########################################################################
    # Classification and ROC analysis

    # Run the classifier with cross-validation and plot ROC curves
    # (updated from the pre-0.18 StratifiedKFold(y, n_folds=...) API).
    cv = list(StratifiedKFold(n_splits=folds).split(X, y))
    mean_tpr = 0.0
    mean_fpr = np.linspace(0, 1, 100)

    _precision = 0.0
    _recall = 0.0
    _accuracy = 0.0
    _f1 = 0.0
    for i, (train, test) in enumerate(cv):
        probas_ = classifier.fit(X[train], y[train]).predict_proba(X[test])
        pred_ = classifier.predict(X[test])
        _precision += precision_score(y[test], pred_)
        _recall += recall_score(y[test], pred_)
        _accuracy += accuracy_score(y[test], pred_)
        _f1 += f1_score(y[test], pred_)
        # Compute the ROC curve and the area under it
        fpr, tpr, thresholds = roc_curve(y[test], probas_[:, 1])
        mean_tpr += interp(mean_fpr, fpr, tpr)
        mean_tpr[0] = 0.0
        roc_auc = auc(fpr, tpr)
        plt.plot(fpr, tpr, lw=1,
                 label='ROC fold %d (area = %0.2f)' % (i, roc_auc))

    _precision /= folds
    _recall /= folds
    _accuracy /= folds
    _f1 /= folds

    plt.plot([0, 1], [0, 1], '--', color=(0.6, 0.6, 0.6), label='Luck')

    mean_tpr /= len(cv)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    plt.plot(mean_fpr, mean_tpr, 'k--',
             label='Mean ROC (area = %0.2f)' % mean_auc, lw=2)

    plt.xlim([-0.05, 1.05])
    plt.ylim([-0.05, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic - {0}'.format(name))
    plt.legend(loc="lower right")
    plt.savefig(self.configObject['outputdir'] + '/' + name + '.png')
    plt.close()

    result = self.OutputResult(name, paramdesc, len(inp),
                               floor(labels.size / folds),
                               _precision, _recall, _accuracy, _f1)
    Announce(result)
def sogou_bwords(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_sogou_data(size=sample)
    input = [' '.join([title, content])
             for title, content in zip(df.contenttitle.values, df.content.values)]
    target = df.cat_en
    if sample:
        test_size = int(round(np.sum(12000 * df.cat_en.value_counts().values / 102000)))
    else:
        test_size = 12000 * 5

    X, X_, y, y_ = train_test_split(input, target, stratify=target,
                                    test_size=test_size)

    grid = bag_words_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
def get_cv_metrics(self, cv):
    fold_avg_p = []
    fold_avg_r = []
    fold_avg_f1 = []
    fold_accuracy = []
    fold_test_support = []
    fold_train_support = []
    for i, (train, test) in enumerate(cv):
        train_df, train_y = self.X.iloc[train], self.y.iloc[train]
        test_df, test_y = self.X.iloc[test], self.y.iloc[test]
        estimator = clone(self.pipeline)
        estimator.fit(train_df, train_y)
        y_pred = estimator.predict(test_df)
        p, r, f1, s = precision_recall_fscore_support(test_y, y_pred)
        accuracy = accuracy_score(test_y, y_pred)
        # support-weighted average of precision, recall and f1 across classes
        avg_p, avg_r, avg_f1 = (np.average(p, weights=s),
                                np.average(r, weights=s),
                                np.average(f1, weights=s))
        test_support = test_y.shape[0]
        train_support = train_y.shape[0]
        fold_avg_p.append(avg_p)
        fold_avg_r.append(avg_r)
        fold_avg_f1.append(avg_f1)
        fold_accuracy.append(accuracy)
        fold_test_support.append(test_support)
        fold_train_support.append(train_support)
    # Bug fix: average the per-fold support lists; the original averaged the
    # scalars left over from the last fold only.
    return (np.average(fold_avg_p), np.average(fold_avg_r),
            np.average(fold_avg_f1), np.average(fold_accuracy),
            np.average(fold_test_support), np.average(fold_train_support))
def evaluate_special(self, session: tf.Session, val_generator, batch_size: int,
                     classification_samples, size, emnist=True,
                     class_weights=None):
    samples_per_shot = 6200
    total_data_processed = 0.0
    correct = 0.0
    correct_avg = 0.0
    # bookkeeping for the weighted accuracy
    predictions = []
    sample_weights = []
    ground_truth = []

    for data, labels in val_generator(batch_size):
        data = data.reshape((data.shape[0], 28, 28, 1))
        print('[INFO] processing', total_data_processed, 'of', size)
        # classify a single sample
        for i in range(len(data)):
            if emnist:
                x1, y1 = classification_samples(samples_per_shot // 62)
            else:
                x1, y1 = classification_samples(samples_per_shot // 10)
            x2 = np.asarray([list(data[i])] * len(y1))
            pc = session.run([self.sigmoidal_out],
                             feed_dict={self.X1: x1, self.X2: x2})
            prediction = y1[np.argmin(pc)]
            prediction_avg = self._get_mean_prediction(np.squeeze(pc), y1,
                                                       emnist=emnist)
            if prediction == labels[i]:
                correct += 1.0
            if prediction_avg == labels[i]:
                correct_avg += 1.0
            predictions.append(prediction)
            sample_weights.append(class_weights[labels[i]])
            ground_truth.append(labels[i])
            total_data_processed += 1.0

    # keep track of loss and accuracy; normalize/sample_weight are passed by
    # keyword since recent scikit-learn versions make them keyword-only
    try:
        weighted_acc = accuracy_score(ground_truth, predictions,
                                      normalize=True,
                                      sample_weight=sample_weights)
    except Exception:
        weighted_acc = None
    print('weighted_acc:', weighted_acc)
    accuracy = correct / total_data_processed
    avg_acc = correct_avg / total_data_processed
    return accuracy, avg_acc, weighted_acc
def agnews_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_agnews_data(size=sample)
    input = [' '.join([title, descr])
             for title, descr in zip(df.title.values, df.description.values)]
    target = df.category
    if sample:
        test_size = int(round(np.sum(2000 * df.category.value_counts().values / 32000)))
    else:
        test_size = 2000 * 4

    X, X_, y, y_ = train_test_split(input, target, stratify=target,
                                    test_size=test_size)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
def train_model(base_model, X, y, minm_image_process=None, threshold_accuracy=.9,
                classes=range(10), dump_file_path=None):
    """Incremental training module; returns a new model after a partial fit
    on the given data.

    X: 128-dimensional feature vectors
    y: labels of the vectors
    minm_image_process: how many images of each label have to be trained;
        oversampling/undersampling is applied to reach this count
    classes: the set of classes this model will use, which has to be defined
        in advance for partial_fit
    """
    print("entering training module")
    [X_train, X_test, y_train, y_test] = get_stratified_sample(X, y, verbose=False)
    if minm_image_process is not None:
        [X_processed, y_processed] = process_data(X_train, y_train,
                                                  minm_num=minm_image_process)
    else:
        [X_processed, y_processed] = [X_train, y_train]
    if dump_file_path is not None:
        pickle.dump([X_processed, y_processed],
                    open(dump_file_path + '_resampled.pickle', 'wb'))
    accuracy = 0
    idx = 0
    while accuracy < threshold_accuracy:
        try:
            base_model.partial_fit(X_processed, y_processed)
        except Exception as e:
            # the first partial_fit call needs the full list of classes
            print(e)
            base_model.partial_fit(X_processed, y_processed, classes=classes)
        y_pred = base_model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        print("accuracy in iteration", idx + 1, "is =", accuracy)
        idx += 1
        if idx > 10:
            break
    print("returning from train module")
    return base_model
def get_training_results(self, database_file, dataset, cross_validation=10):
    # Connect DB
    self.apk_db.connect_db(database_file)

    results = []

    # K-Fold Cross Validation
    kf = KFold(n_splits=cross_validation, shuffle=True)
    for train, test in kf.split(dataset):
        # Get training and testing dataset
        training_dataset = [dataset[i] for i in train]
        testing_dataset = [dataset[i] for i in test]

        # Fit model
        self.fit(training_dataset)

        # Predict labels for testing samples
        testing_labels, predicted_labels = self.i_predict(testing_dataset)

        # Get scores
        result = {}
        result['accuracy'] = accuracy_score(testing_labels, predicted_labels,
                                            normalize=True)
        result['f-score'] = f1_score(testing_labels, predicted_labels)
        results.append(result)

    # Disconnect DB
    self.apk_db.disconnect_db()
    return results
def calc_fit(model, metric, train_x, train_y, test_x, test_y, p):
    # `p` is a boolean feature mask; compress() keeps the selected columns.
    # List comprehensions replace map(), which returns a lazy iterator on
    # Python 3.
    train_x = [list(compress(x, p)) for x in train_x]
    test_x = [list(compress(x, p)) for x in test_x]
    clf = model.fit(train_x, train_y)
    predictions = clf.predict(test_x)
    if metric == 'precision':
        return precision_score(test_y, predictions, labels=[0, 1])
    elif metric == 'recall':
        return recall_score(test_y, predictions, labels=[0, 1])
    elif metric == 'accuracy':
        # accuracy_score takes no labels argument; the original passed [0, 1]
        # into the `normalize` slot by mistake.
        return accuracy_score(test_y, predictions)
    return (precision_score(test_y, predictions, labels=[0, 1])
            + recall_score(test_y, predictions, labels=[0, 1])
            + accuracy_score(test_y, predictions))
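# A minimal, hypothetical call to calc_fit, assuming the function above and
# its itertools/sklearn.metrics imports are in scope: the boolean mask `p`
# keeps the first two of three features, and a decision tree is scored on a
# toy split.
from itertools import compress

from sklearn.tree import DecisionTreeClassifier

train_x = [[1, 0, 3], [2, 1, 0], [0, 2, 1], [3, 1, 2]]
train_y = [0, 1, 0, 1]
test_x = [[1, 1, 0], [2, 0, 3]]
test_y = [1, 0]
p = [True, True, False]  # drop the third feature
print(calc_fit(DecisionTreeClassifier(), 'accuracy',
               train_x, train_y, test_x, test_y, p))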
def analyse(self, inDir, fileStem=None, hidden=False, tag=None, clear=True,
            projects=None):
    meta = self._getMeta(inDir, fileStem)
    if clear:
        meta.drop("project_analysis")
    self.predictions = None
    if "prediction" in meta.db:
        self.predictions = {x["example"]: x["predicted"]
                            for x in meta.db["prediction"].all()}
    self.grouped = {}
    for example in meta.db.query("SELECT * FROM example"):
        projectCode = example["project_code"]
        if projects and projectCode not in projects:
            continue
        self._addToProject(example, example["project_code"])
        self._addToProject(example, "all projects")
    rows = []
    for project in sorted(self.grouped.keys()):
        for setName in ("train", "hidden"):
            labels = self.grouped[project][setName]["labels"]
            groups = self.grouped[project][setName]["groups"]
            predictions = self.grouped[project][setName]["predictions"]
            row = OrderedDict([("project", project), ("setName", setName),
                               ("tag", tag)])
            row["examples"] = len(labels)
            row["pos"] = len([x for x in labels if x > 0])
            row["neg"] = len([x for x in labels if x < 0])
            row["majority"] = None
            if row["pos"] > 0 or row["neg"] > 0:
                row["majority"] = max(set(labels), key=labels.count)
            row["auc_baseline"] = None
            row["auc"] = None
            row["accuracy"] = None
            row["accuracy_baseline"] = None
            if row["pos"] > 0 and row["neg"] > 0:
                majorityPredictions = getMajorityPredictions(labels, groups)
                row["auc"] = aucForPredictions(labels, predictions)
                row["auc_baseline"] = aucForPredictions(labels, majorityPredictions)
                row["accuracy"] = accuracy_score(
                    labels, [(-1.0 if x < 0 else 1.0) for x in predictions])
                row["accuracy_baseline"] = accuracy_score(
                    labels, [(-1.0 if x < 0 else 1.0) for x in majorityPredictions])
            rows.append(row)
    meta.insert_many("project_analysis", rows, True)
def dbpedia_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    X, X_, y, y_ = dbpedia_train_test_split(sample=sample)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
def test_classification(self, test, testlabel, bestmodel):
    outputtest = bestmodel.predict(test)
    accuracytest = accuracy_score(testlabel, outputtest)
    print("The accuracy for the test set is %r" % accuracytest,
          "and the confusion matrix is")
    # confusion_matrix expects (y_true, y_pred)
    print(confusion_matrix(testlabel, outputtest))
    print(classification_report(testlabel, outputtest))
    return outputtest
def estimateAccuracy(model, limit):
    asTrain, asTest = split("../data/train.csv", limit)
    model.fit(asTrain)
    testY = [x.Y for x in asTest]
    testPredictions = model.predict(asTest)
    print("%f" % (accuracy_score(testY, testPredictions)))
    print(confusion_matrix(testY, testPredictions))
def score(self, X, y, sample_weight=None):
    from commonml.skchainer.classifier import Classifier
    from commonml.skchainer.regressor import Regressor
    if isinstance(self.model, Classifier):
        # sklearn.metrics.classification/.regression were removed in
        # scikit-learn 0.24; import from sklearn.metrics instead.
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)
    elif isinstance(self.model, Regressor):
        from sklearn.metrics import r2_score
        return r2_score(y, self.predict(X), sample_weight=sample_weight,
                        multioutput='variance_weighted')
    else:
        raise ValueError('Unsupported model.')
def evaluacion_train_test():
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.02,
                                                        random_state=0)
    # Build and train the classifier
    rfc = RFC(n_estimators=100, n_jobs=-1)
    rfc.fit(X_train, y_train)
    # Predict the test labels and evaluate
    y_pred = rfc.predict(X_test)
    y_pred_proba = rfc.predict_proba(X_test)
    return accuracy_score(y_test, y_pred), rfc
def dbpedia_convgemb(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    train_docs = DataframeSentences(train_df, cols=['title', 'abstract'],
                                    flatten=True)
    vocab = Dictionary(train_docs)
    vocab.filter_extremes(keep_n=5000)
    bin = LabelBinarizer()
    x_train = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in train_docs],
        max_length=100, padding_word=0))
    y_train = bin.fit_transform(train_df.category.values)

    test_docs = DataframeSentences(test_df, cols=['title', 'abstract'],
                                   flatten=True)
    x_test = np.array(pad_sentences(
        [[vocab.token2id[tok] + 1 for tok in s if tok in vocab.token2id]
         for s in test_docs],
        max_length=100, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    emb_weights = load_w2v_weights(vocab)

    model = Sequential()
    model.add(Embedding(5001, 300, input_length=100, dropout=.2,
                        weights=[emb_weights], trainable=False))
    model.add(Convolution1D(nb_filter=50, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=model.output_shape[1]))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dropout(.2))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['accuracy'])

    model.fit(x_train, y_train)

    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
def four_algorythms(algorythm, n_est):
    if algorythm == "RandomForestClassifier":
        prediction = RandomForestClassifier(n_estimators=n_est)
    elif algorythm == "ExtraTreesClassifier":
        prediction = ExtraTreesClassifier(n_estimators=n_est)
    elif algorythm == "AdaBoostClassifier":
        prediction = AdaBoostClassifier(n_estimators=n_est)
    elif algorythm == "GradientBoostingClassifier":
        prediction = GradientBoostingClassifier(n_estimators=n_est)
    prediction = prediction.fit(train, train_y)
    validate_y = np.concatenate((np.ones(50), np.zeros(50)))
    predicted_y = prediction.predict(validate)
    return accuracy_score(validate_y, predicted_y)
def __print_and_log_results(clf, classifier, x_train, x_test, y_test,
                            out_file_name, args):
    probablistic_predictions = False
    if args.predict_proba:
        predict_proba_func = getattr(clf, "predict_proba", None)
        if predict_proba_func is not None:
            probablistic_predictions = True
            prob_predictions = clf.predict_proba(x_test)
            predictions = []
            pos_predictions = []
            for prediction in prob_predictions:
                pos_predictions.append(prediction[1])
                if prediction[1] > args.predict_threshold:
                    predictions.append(1)
                else:
                    predictions.append(-1)
            pos_predictions = np.array(pos_predictions)
            mean_confidence = np.mean(pos_predictions)
            max_confidence = max(pos_predictions)
            min_confidence = min(pos_predictions)
            print("Mean confidence: " + str(mean_confidence))
            print("Max confidence: " + str(max_confidence))
            print("Min confidence: " + str(min_confidence))
            predictions = np.array(predictions)
        else:
            predictions = clf.predict(x_test)
    else:
        predictions = clf.predict(x_test)

    precision = precision_score(y_test, predictions, labels=[-1, 1])
    recall = recall_score(y_test, predictions, labels=[-1, 1])
    auc_score = roc_auc_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print("Train/test set sizes: " + str(len(x_train)) + "/" + str(len(x_test)))
    print("Precision is: " + str(precision))
    print("Recall is: " + str(recall))
    print("AUC ROC Score is: " + str(auc_score))
    print("Accuracy is: " + str(accuracy))
    true_count = len([1 for p in predictions if p == 1])
    actual_count = len([1 for y in y_test if y == 1])
    print("True count (prediction/actual): " + str(true_count) + "/"
          + str(actual_count))

    if args.write_to_log:
        # Write out results as a table to the log file
        write_log(out_file_name=out_file_name, args=args, classifier=classifier,
                  precision=precision, recall=recall, true_count=true_count,
                  actual_count=actual_count, X_train=x_train, X_test=x_test,
                  auc=auc_score, accuracy=accuracy,
                  probablistic_prediction=probablistic_predictions,
                  prediction_threshold=args.predict_threshold)
def testforest(self, test, testlabel, forest):
    outputtest = forest.predict(test)
    accuracytest = accuracy_score(testlabel, outputtest)
    print("The size of the test set is")
    print(np.shape(test))
    print("The accuracy for the test set is %r" % accuracytest,
          "and the confusion matrix is")
    print(confusion_matrix(testlabel, outputtest))
    print(classification_report(testlabel, outputtest))
    # generate class probabilities
    outputproba = forest.predict_proba(test)
    outperfor = {'prob0': outputproba[:, 0],
                 'prob1': outputproba[:, 1],
                 'output': outputtest,
                 'target': testlabel}
    outframe = DataFrame(outperfor)
    print(outframe)
    # outframe.to_csv(r'D:\allprob.csv', header=0)
    return accuracytest, outframe
def train(self, data, target, deep):
    """This function runs 10-fold CV to train the network with an expansion
    of between 20-75%. The training algorithm is Stochastic Gradient
    Descent."""
    # 10-fold cross-validation
    folds = 10
    iters = 10
    # updated from the pre-0.18 KFold(n, n_folds=...) API
    kf = KFold(n_splits=folds)
    if deep:
        hiddenNodes = np.arange(data.shape[1], 2 * data.shape[1]) + 1
    else:
        hiddenNodes = np.arange(data.shape[1], 10 * data.shape[1]) + 1
    hiddenNodes = hiddenNodes[hiddenNodes > 0]
    Error_HNodes = []
    Nets_HNodes = []
    for j in hiddenNodes:
        self.setHiddenNodes([j])
        Mean_error_iter = []
        Mean_nets_iter = []
        for train_index, val_index in kf.split(data):
            X, Xval = data[train_index], data[val_index]
            T, Tval = target[train_index], target[val_index]
            Error_iter = []
            Nets_iter = []
            for i in np.arange(iters):
                self.initialization()  # common initialisations
                Out, H, N = self.sim(X)
                H = H[-1]
                self.Weights[-1] = np.dot(pinv(H), T)
                # Validation
                Out_val, H_val, N_val = self.sim(Xval)
                # Store the error and the network; classification score is
                # used here (mean_squared_error or f1_score are alternatives)
                Error = [accuracy_score(Tval, Out_val)]
                Networks = [self.Weights]
                Error_iter.append(np.min(Error))
                Nets_iter.append(Networks[np.argmin(Error)])
            Mean_error_iter.append(np.mean(Error_iter))
            Mean_nets_iter.append(Nets_iter[np.argmin(Error_iter)])
        Error_HNodes.append(np.mean(Mean_error_iter))
        Nets_HNodes.append(Mean_nets_iter[np.argmin(Mean_error_iter)])
    self.Weights = Nets_HNodes[np.argmin(Error_HNodes)]
    Final_Error = np.min(Error_HNodes)
    selected_Nodes = hiddenNodes[np.argmin(Error_HNodes)]
    self.setHiddenNodes([selected_Nodes])
    return Final_Error
def resh(classifier, x):
    if classifier == "RandomForestClassifier":
        pred = RandomForestClassifier(n_estimators=x)
    elif classifier == "ExtraTreesClassifier":
        pred = ExtraTreesClassifier(n_estimators=x)
    elif classifier == "AdaBoostClassifier":
        pred = AdaBoostClassifier(n_estimators=x)
    elif classifier == "GradientBoostingClassifier":
        pred = GradientBoostingClassifier(n_estimators=x)
    pred = pred.fit(train, train_y)
    validate_y = np.concatenate((np.ones(146), np.zeros(112)))
    predicted_y = pred.predict(validate)
    return accuracy_score(validate_y, predicted_y)
def trainforest(self, seed, train, trainlabel, number_trees,
                accuracy_train_calculation=False):
    seed_of_tree = {
        'rf': RandomForestClassifier(n_estimators=number_trees, max_features=8),
        'adb': AdaBoostClassifier(n_estimators=number_trees),
        'bag': BaggingClassifier(n_estimators=number_trees, max_features=8),
        'ext': ExtraTreesClassifier(n_estimators=number_trees, max_features=8),
        'gbt': GradientBoostingClassifier(n_estimators=number_trees,
                                          max_features=8)
    }
    rawforest = seed_of_tree[seed]
    forest = rawforest.fit(train, trainlabel)
    outputtrain = forest.predict(train)
    print("The size of the training set is %r , %r"
          % (np.shape(train)[0], np.shape(train)[1]))
    if accuracy_train_calculation:
        accuracytrain = accuracy_score(trainlabel, outputtrain)
        print("The accuracy for the training set is %r" % accuracytrain)
    print("The method is %r" % seed)
    print(confusion_matrix(trainlabel, outputtrain))
    return forest
def yelpstars_bngrams(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_yelp_stars_data(size=sample)
    input = df.text
    target = df.stars
    if sample:
        test_size = floor(len(df) * 1. / 14)
    else:
        test_size = 10000 * len(df.stars.unique())

    X, X_, y, y_ = train_test_split(input, target, stratify=target,
                                    test_size=test_size)

    grid = bag_ngram_grid(n_procs)
    grid.fit(X, y)

    print(accuracy_score(y_, grid.best_estimator_.predict(X_)), grid.best_params_)
def fineTuning(self, data, target):
    # Once all the weights are set, fine tuning is applied
    epoch = 0
    Error = []
    Networks = []
    while epoch <= 10:
        Out, H, N = self.sim(data)
        H = H[-1]
        pseudoinverse = pinv(H)
        beta = np.dot(pseudoinverse, target)
        self.Weights[-1] = beta
        # Validation
        Out, H, N = self.sim(data)
        Networks.append(self.Weights)
        # Classification error; mean_squared_error (regression) or f1_score
        # could be used here instead
        Error.append(accuracy_score(target, Out))
        epoch += 1
    Final_Error = np.min(Error)
    self.Weights = Networks[np.argmin(Error)]
    return Final_Error
def __print_and_log_results(clf, classifier, x_train, x_test, y_test,
                            out_file_name, args):
    predictions = clf.predict(x_test)
    precision = precision_score(y_test, predictions, labels=[-1, 1])
    recall = recall_score(y_test, predictions, labels=[-1, 1])
    auc_score = roc_auc_score(y_test, predictions)
    accuracy = accuracy_score(y_test, predictions)
    print("Train/test set sizes: " + str(len(x_train)) + "/" + str(len(x_test)))
    print("Precision is: " + str(precision))
    print("Recall is: " + str(recall))
    print("AUC ROC Score is: " + str(auc_score))
    print("Accuracy is: " + str(accuracy))
    true_count = len([1 for p in predictions if p == 1])
    actual_count = len([1 for y in y_test if y == 1])
    print("True count (prediction/actual): " + str(true_count) + "/"
          + str(actual_count))

    if args.write_to_log:
        # Write out results as a table to the log file
        write_log(out_file_name=out_file_name, args=args, classifier=classifier,
                  precision=precision, recall=recall, true_count=true_count,
                  actual_count=actual_count, X_train=x_train, X_test=x_test,
                  auc=auc_score, accuracy=accuracy)
train = np.concatenate((train_meme_dogs, train_snuffle_dogs))
validate = np.concatenate((validate_meme_dogs, validate_snuffle_dogs))
train_y = np.concatenate((np.ones(150), np.zeros(150)))

list_of_n_estimators = (10, 20, 40, 80, 100, 150, 200, 300, 400, 500, 1000)
list_of_forests = (RandomForestClassifier, ExtraTreesClassifier,
                   AdaBoostClassifier, GradientBoostingClassifier)

a = []
for j in list_of_forests:
    for i in list_of_n_estimators:
        random_forest = j(n_estimators=i)
        random_forest = random_forest.fit(train, train_y)
        validate_y = np.concatenate((np.ones(39), np.zeros(39)))
        predicted_y = random_forest.predict(validate)
        print(accuracy_score(validate_y, predicted_y))
        a.append(accuracy_score(validate_y, predicted_y))

with open('forests_table.txt', 'w') as output_trees:
    output_trees.write('Algorithm\tn=10\tn=20\tn=40\tn=80\tn=100\tn=150\t'
                       'n=200\tn=300\tn=400\tn=500\tn=1000\n')
    output_trees.write('RandomForestClassifier\t')
    for res in a[0:11]:
        output_trees.write("%s " % res + '\t')
    output_trees.write('\n')
    output_trees.write('ExtraTreesClassifier\t')
    for res in a[11:22]:
        output_trees.write("%s " % res + '\t')
    output_trees.write('\n')
    output_trees.write('AdaBoostClassifier\t')
    for res in a[22:33]:
        output_trees.write("%s " % res + '\t')
# read the training set
X_train, y_train, lengths_train = load_conll(open("../resources/train.data", "r"),
                                             features)
clf = StructuredPerceptron(decode="viterbi", lr_exponent=.05, max_iter=30)
print("Fitting model " + str(clf))
clf.fit(X_train, y_train, lengths_train)

print("\nPredictions on dev set")
# read the development set
X_dev, y_dev, lengths_dev = load_conll(open("../resources/dev.data", "r"),
                                       features)
y_pred = clf.predict(X_dev, lengths_dev)

print("Whole seq accuracy    ", whole_sequence_accuracy(y_dev, y_pred, lengths_dev))
print("Element-wise accuracy ", accuracy_score(y_dev, y_pred))
print("Mean F1-score macro   ", f1_score(y_dev, y_pred, average="macro"))

print("\nPredictions on test set")
# read the test set
X_test, _, lengths_test = load_conll(open("../resources/test.data", "r"),
                                     features)
y_pred = clf.predict(X_test, lengths_test)
print(pd.Series(y_pred).value_counts())

print("Saving predicted as a submission")
with open("submission.csv", "w") as wf:
    wf.write("id,tag\n")
    for id, tag in enumerate(list(y_pred)):
        wf.write("%d,%s\n" % (id, tag))
def dbpedia_smallcharconv(sample=None, n_procs=None):
    if not n_procs:
        n_procs = cpu_count()

    df = get_dbpedia_data(size=sample)
    if sample:
        test_size = int(round(np.sum(5000 * df.category.value_counts().values / 45000)))
    else:
        test_size = 5000 * 14

    logging.info('creating train test split ...')
    split = StratifiedShuffleSplit(df.category, test_size=test_size)
    train_split, test_split = next(iter(split))
    train_df = df.iloc[train_split]
    test_df = df.iloc[test_split]

    logging.info('preprocessing, padding and binarizing data ...')
    train_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP)
                   for c in text]
                  for text in train_df[['title', 'abstract']]
                  .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    bin = LabelBinarizer()
    x_train = np.array(pad_sentences(train_docs, max_length=1014,
                                     padding_word=CHAR_MAP.index(' ')))
    y_train = bin.fit_transform(train_df.category.values)
    test_docs = [[CHAR_MAP.index(c) if c in CHAR_MAP else len(CHAR_MAP)
                  for c in text]
                 for text in test_df[['title', 'abstract']]
                 .apply(lambda cols: u'\n'.join(cols), axis=1).values]
    x_test = np.array(pad_sentences(test_docs, max_length=1014, padding_word=0))
    y_test = bin.transform(test_df.category.values)

    logging.info('building model ...')
    model = Sequential()
    model.add(Embedding(len(CHAR_MAP) + 1, len(CHAR_MAP) + 1, input_length=1014,
                        weights=[char_embedding()], trainable=False))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu'))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=7, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(Convolution1D(nb_filter=256, filter_length=3, border_mode='valid',
                            activation='relu', subsample_length=1))
    model.add(MaxPooling1D(pool_length=3))
    model.add(Flatten())
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(1024, activation='relu'))
    model.add(Dropout(.5))
    model.add(Dense(14, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam',
                  metrics=['categorical_accuracy'])
    print(model.summary())

    model.fit(x_train, y_train, batch_size=64, nb_epoch=5,
              validation_data=[x_test, y_test])

    print(accuracy_score(np.argwhere(y_test)[:, 1], model.predict_classes(x_test)))
validate_y = np.concatenate((np.ones(38), np.zeros(38)))

results = open("scores.txt", "w")
results.write("n_estimators" + "\t")
# `accuracy` holds the list of n_estimators values to sweep; each
# algorithm's accuracy is written in the matching column.
for i in accuracy:
    results.write(str(i) + "\t")
results.write("\n")

for classifier in algorithms:
    results.write(classifier.__name__ + "\t")
    for score in accuracy:
        algorithm = classifier(n_estimators=score)
        algorithm = algorithm.fit(train, train_y)
        predicted_y = algorithm.predict(validate)
        acc_score = accuracy_score(validate_y, predicted_y)
        results.write(str(acc_score) + "\t")
        print("done for " + classifier.__name__ + " with n_estimators="
              + str(score) + " " + str(acc_score))
    results.write("\n")

# I've got the biggest accuracy score (0.8684) with AdaBoostClassifier
# with n_estimators=300
ada = AdaBoostClassifier(n_estimators=300)
ada_train = ada.fit(train, train_y)
predicted_y = ada_train.predict(validate)
acc_score = accuracy_score(validate_y, predicted_y)
print(acc_score)

unknown = get_images("unknown")
un_predicted = ada_train.predict(unknown)
X_test_txt = []  # initialised here; the original snippet was cut above this point
X_test_img = []
for idx in ids_test:
    product = json.loads(data[idx]['product'])
    X_test_img.append(data[idx]['image_emb'])
    description = product['Description']
    tokenized = word_tokenize(description)
    tfidf = np.zeros(vocab_size)
    for w, c in Counter(tokenized).items():  # .iteritems() is Python 2 only
        if w in vocab:
            tfidf[vocab_dict[w]] = idfs[w] * float(c) / len(tokenized)
    X_test_txt.append(tfidf)
X_test_txt = np.array(X_test_txt)
X_test_img = np.array(X_test_img)

# Training
for cat, (y_train, y_test) in enumerate(zip(y_trains, y_tests)):
    lr_txt = LogisticRegression()
    lr_img = LogisticRegression()
    lr_txt.fit(X_train_txt, y_train)
    lr_img.fit(X_train_img, y_train)
    classes = lr_img.classes_

    # late fusion: sum the text and image class probabilities
    p_txt = lr_txt.predict_proba(X_train_txt)
    p_img = lr_img.predict_proba(X_train_img)
    p = p_img + p_txt
    train_score = 100 * accuracy_score(y_train, classes[p.argmax(axis=1)])

    p_txt = lr_txt.predict_proba(X_test_txt)
    p_img = lr_img.predict_proba(X_test_img)
    p = p_img + p_txt
    test_score = 100 * accuracy_score(y_test, classes[p.argmax(axis=1)])

    fwrite('Category %d:\n\tTrain score = %2.1f%%\n\tTest score = %2.1f%%\n\n'
           % (cat + 1, train_score, test_score))
# Specify Gaussian Processes with fixed and optimized hyperparameters
gp_fix = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0),
                                   optimizer=None)
gp_fix.fit(X[:train_size], y[:train_size])

gp_opt = GaussianProcessClassifier(kernel=1.0 * RBF(length_scale=1.0))
gp_opt.fit(X[:train_size], y[:train_size])

print("Log Marginal Likelihood (initial): %.3f"
      % gp_fix.log_marginal_likelihood(gp_fix.kernel_.theta))
print("Log Marginal Likelihood (optimized): %.3f"
      % gp_opt.log_marginal_likelihood(gp_opt.kernel_.theta))

print("Accuracy: %.3f (initial) %.3f (optimized)"
      % (accuracy_score(y[:train_size], gp_fix.predict(X[:train_size])),
         accuracy_score(y[:train_size], gp_opt.predict(X[:train_size]))))
print("Log-loss: %.3f (initial) %.3f (optimized)"
      % (log_loss(y[:train_size], gp_fix.predict_proba(X[:train_size])[:, 1]),
         log_loss(y[:train_size], gp_opt.predict_proba(X[:train_size])[:, 1])))

# Plot posteriors
plt.figure(0)
plt.scatter(X[:train_size, 0], y[:train_size], c='k', label="Train data",
            edgecolors=(0, 0, 0))
plt.scatter(X[train_size:, 0], y[train_size:], c='g', label="Test data",
            edgecolors=(0, 0, 0))
X_ = np.linspace(0, 5, 100)
plt.plot(X_, gp_fix.predict_proba(X_[:, np.newaxis])[:, 1], 'r',
         label="Initial kernel: %s" % gp_fix.kernel_)