def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # We shouldn't forget any metrics
    assert_equal(set(SYMETRIC_METRICS).union(set(NOT_SYMETRIC_METRICS)),
                 set(ALL_METRICS))
    assert_equal(set(SYMETRIC_METRICS).intersection(set(NOT_SYMETRIC_METRICS)),
                 set([]))

    # Symmetric metric
    for name, metric in SYMETRIC_METRICS.items():
        assert_equal(metric(y_true, y_pred), metric(y_pred, y_true),
                     msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name, metric in NOT_SYMETRIC_METRICS.items():
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symmetric" % name)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # We shouldn't forget any metrics
    assert_equal(set(SYMETRIC_METRICS).union(NOT_SYMETRIC_METRICS,
                                             THRESHOLDED_METRICS),
                 set(ALL_METRICS))
    assert_equal(set(SYMETRIC_METRICS).intersection(set(NOT_SYMETRIC_METRICS)),
                 set([]))

    # Symmetric metric
    for name, metric in SYMETRIC_METRICS.items():
        assert_equal(metric(y_true, y_pred), metric(y_pred, y_true),
                     msg="%s is not symmetric" % name)

    # Not symmetric metrics
    for name, metric in NOT_SYMETRIC_METRICS.items():
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symmetric" % name)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
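# Note on the registries used above: SYMETRIC_METRICS, NOT_SYMETRIC_METRICS,
# THRESHOLDED_METRICS and ALL_METRICS are module-level dicts defined elsewhere
# in the test file, mapping metric names to callables. A minimal sketch of the
# assumed structure (membership here is illustrative, not the actual sklearn
# definitions):
#
#     SYMETRIC_METRICS = {"accuracy_score": accuracy_score,
#                         "zero_one_loss": zero_one_loss}
#     NOT_SYMETRIC_METRICS = {"precision_score": precision_score,
#                             "recall_score": recall_score}
#     THRESHOLDED_METRICS = {"auc_score": auc_score}
#     ALL_METRICS = dict(SYMETRIC_METRICS)
#     ALL_METRICS.update(NOT_SYMETRIC_METRICS)
#     ALL_METRICS.update(THRESHOLDED_METRICS)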
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            # training set performance
            assert_array_equal(np.unique(y), np.unique(y_pred))
            assert_greater(zero_one_score(y, y_pred), 0.78)
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            # training set performance
            assert_array_equal(np.unique(y), np.unique(y_pred))
            assert_greater(zero_one_score(y, y_pred), 0.78,
                           "accuracy of %s not greater than 0.78" % str(Clf))
def test_classifiers_classes():
    # test if classifiers can cope with non-consecutive classes
    classifiers = all_estimators(type_filter='classifier')
    X, y = make_blobs(random_state=12345)
    X, y = shuffle(X, y, random_state=7)
    X = StandardScaler().fit_transform(X)
    y = 2 * y + 1
    classes = np.unique(y)
    # TODO: make work with next line :)
    # y = y.astype(np.str)
    for name, Clf in classifiers:
        if Clf in dont_test:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            # training set performance
            assert_array_equal(np.unique(y), np.unique(y_pred))
            assert_greater(zero_one_score(y, y_pred), 0.78,
                           "accuracy of %s not greater than 0.78" % str(Clf))
            assert_array_equal(
                clf.classes_, classes,
                "Unexpected classes_ attribute for %r" % clf)
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True):
            clf = Clf()
            # raises error on malformed input for fit
            assert_raises(ValueError, clf.fit, X, y[:-1])
            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(zero_one_score(y, y_pred), 0.78)
            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)
            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    assert_equal(decision.shape, (n_samples, n_labels))
                    # raises error on malformed input
                    assert_raises(ValueError, clf.decision_function, X.T)
                    if not isinstance(clf, BaseLibSVM):
                        # 1on1 of LibSVM works differently
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)
                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_labels))
                    # raises error on malformed input
                    assert_raises(ValueError, clf.predict_proba, X.T)
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass
def test_classifier(classifier, trainData, trainLabel, testData, testLabel):
    classifier.fit(trainData, trainLabel)
    testPredicted = classifier.predict(testData)
    print 'Accuracy: ', metrics.zero_one_score(testLabel, testPredicted)
    print 'F1-score: ', metrics.f1_score(testLabel, testPredicted)
    print metrics.classification_report(testLabel, testPredicted)
    return classifier
def make_conf_mat(y_te, y_te_pr, type):
    conf_mat = metrics.confusion_matrix(y_te, y_te_pr)
    # cast to float so the normalization is not truncated by integer division
    conf_mat_frac = conf_mat.astype(float) / np.sum(conf_mat, axis=0)
    print type, ' Accuracy: ', metrics.zero_one_score(y_te, y_te_pr)
    np.savetxt(os.path.join(class_dir, prefix + '_conf_' + type + '.csv'),
               conf_mat, fmt='%i', delimiter=',')
    np.savetxt(os.path.join(class_dir, prefix + '_conffr_' + type + '.csv'),
               conf_mat_frac, fmt='%.6f', delimiter=',')
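# Note on the normalization above: sklearn's confusion_matrix puts true
# classes on rows and predicted classes on columns, so dividing by
# np.sum(conf_mat, axis=0) gives per-predicted-class (precision-style)
# column fractions; divide by row sums instead for per-true-class,
# recall-style fractions.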
def main(argv):
    import scipy
    from sklearn import metrics
    from sklearn.multiclass import OneVsOneClassifier
    from sklearn.naive_bayes import GaussianNB
    from sklearn.cross_validation import cross_val_score
    from sklearn.svm import SVC
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.tree import DecisionTreeClassifier
    from sklearn import preprocessing
    import similarity

    class ScaledSVC(SVC):
        def _scale(self, data):
            return preprocessing.scale(data)

        def fit(self, X, Y):
            return super(ScaledSVC, self).fit(self._scale(X), Y)

        def predict(self, X):
            return super(ScaledSVC, self).predict(self._scale(X))

    data, labels = scipy.loadtxt(argv[1]), scipy.loadtxt(argv[2])
    if len(argv) > 3:
        features = np.array([int(s) for s in argv[3].split(',')])
        data = data[:, features]

    def ovo(model, adj_strat):
        return OneVsOneClassifier(BinaryTiloClassifier(model, adj_strat))

    classifiers = [
        ('TILO/PRC/Gaussian',
         ovo(PinchRatioCutStrategy(), similarity.Gaussian())),
        ("TILO/Nearest/Gaussian",
         ovo(NearestCutStrategy(), similarity.Gaussian())),
        ("TILO/PRC/KNN", ovo(PinchRatioCutStrategy(), similarity.KNN())),
        ("TILO/Nearest/KNN", ovo(NearestCutStrategy(), similarity.KNN())),
        ("SVC", ScaledSVC()),
        ("Gaussian Naive Bayes", GaussianNB()),
        ("K Neighbors", KNeighborsClassifier()),
        ("Decision Tree", DecisionTreeClassifier())]

    format_str = '{:<30} {} {} {}'
    print '{:<30} {:<10} RAND Accuracy'.format('method', 'accuracy')
    for name, c in classifiers:
        scores = cross_val_score(c, data, labels, cv=5)
        #scores = np.array([1., 1.])
        model = c.fit(data, labels)
        guesses = model.predict(data)
        acc = metrics.zero_one_score(guesses, labels)
        rand = metrics.adjusted_rand_score(guesses, labels)
        print '{:<30} {:.4f} +/- {:.4f} {: .4f} {:.4f}'.format(
            name, scores.mean(), scores.std() / 2, rand, acc)
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 13)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            13 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        13 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 13)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 13. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        12.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), -0.04, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
def test_losses():
    """Test loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)
    n_samples = y_true.shape[0]
    n_classes = np.size(unique_labels(y_true))

    # Classification
    # --------------
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), 11)
        assert_almost_equal(zero_one(y_true, y_pred, normalize=True),
                            11 / float(n_samples), 2)

    assert_almost_equal(zero_one_loss(y_true, y_pred),
                        11 / float(n_samples), 2)
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False), 11)
    assert_almost_equal(zero_one_loss(y_true, y_true), 0.0, 2)
    assert_almost_equal(zero_one_loss(y_true, y_true, normalize=False), 0, 2)

    assert_almost_equal(hamming_loss(y_true, y_pred),
                        2 * 11. / (n_samples * n_classes), 2)

    assert_equal(accuracy_score(y_true, y_pred),
                 1 - zero_one_loss(y_true, y_pred))

    assert_equal(accuracy_score(y_true, y_pred, normalize=False),
                 n_samples - zero_one_loss(y_true, y_pred, normalize=False))

    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     1 - zero_one_loss(y_true, y_pred))

    # Regression
    # ----------
    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_squared_error(y_true, y_true), 0.00, 2)

    # mean_absolute_error and mean_squared_error are equal because
    # it is a binary problem.
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        10.999 / n_samples, 2)
    assert_almost_equal(mean_absolute_error(y_true, y_true), 0.00, 2)

    assert_almost_equal(explained_variance_score(y_true, y_pred), 0.16, 2)
    assert_almost_equal(explained_variance_score(y_true, y_true), 1.00, 2)
    assert_equal(explained_variance_score([0, 0, 0], [0, 1, 1]), 0.0)

    assert_almost_equal(r2_score(y_true, y_pred), 0.12, 2)
    assert_almost_equal(r2_score(y_true, y_true), 1.00, 2)
    assert_equal(r2_score([0, 0, 0], [0, 0, 0]), 1.0)
    assert_equal(r2_score([0, 0, 0], [0, 1, 1]), 0.0)
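# The accuracy/zero-one-loss identities exercised above hold for any pair of
# label vectors. A minimal self-contained check using only the non-deprecated
# metrics (toy data, not from the test suite):
from sklearn.metrics import accuracy_score, zero_one_loss

toy_true = [0, 1, 1, 0, 1]
toy_pred = [0, 1, 0, 0, 1]
assert accuracy_score(toy_true, toy_pred) == \
    1 - zero_one_loss(toy_true, toy_pred)
assert accuracy_score(toy_true, toy_pred, normalize=False) == 4  # hit count
assert zero_one_loss(toy_true, toy_pred, normalize=False) == 1   # miss count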
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Symmetric metric
    for metric in [accuracy_score,
                   lambda y1, y2: accuracy_score(y1, y2, normalize=False),
                   zero_one_loss,
                   lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
                   hamming_loss,
                   f1_score,
                   matthews_corrcoef,
                   mean_squared_error,
                   mean_absolute_error]:
        assert_equal(metric(y_true, y_pred), metric(y_pred, y_true),
                     msg="%s is not symmetric" % metric)

    # Not symmetric metrics
    for metric in [precision_score,
                   recall_score,
                   lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
                   lambda y1, y2: fbeta_score(y1, y2, beta=2),
                   explained_variance_score,
                   r2_score]:
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symmetric" % metric)

    # Deprecated metrics
    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X, y = iris.data, iris.target
    X, y = shuffle(X, y, random_state=7)
    n_samples, n_features = X.shape
    n_labels = len(np.unique(y))
    X = Scaler().fit_transform(X)
    for name, Clf in classifiers:
        if Clf in dont_test or Clf in meta_estimators:
            continue
        if Clf in [MultinomialNB, BernoulliNB]:
            # TODO also test these!
            continue
        # catch deprecation warnings
        with warnings.catch_warnings(record=True) as w:
            clf = Clf()
            # fit
            clf.fit(X, y)
            y_pred = clf.predict(X)
            assert_equal(y_pred.shape, (n_samples,))
            # training set performance
            assert_greater(zero_one_score(y, y_pred), 0.78)
            # raises error on malformed input for predict
            assert_raises(ValueError, clf.predict, X.T)
            if hasattr(clf, "decision_function"):
                try:
                    # decision_function agrees with predict:
                    decision = clf.decision_function(X)
                    assert_equal(decision.shape, (n_samples, n_labels))
                    if not isinstance(clf, BaseLibSVM):
                        # 1on1 of LibSVM works differently
                        assert_array_equal(np.argmax(decision, axis=1), y_pred)
                    # raises error on malformed input for decision_function
                    assert_raises(ValueError, clf.decision_function, X.T)
                except NotImplementedError:
                    pass
            if hasattr(clf, "predict_proba"):
                try:
                    # predict_proba agrees with predict:
                    y_prob = clf.predict_proba(X)
                    assert_equal(y_prob.shape, (n_samples, n_labels))
                    assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                    # raises error on malformed input for predict_proba
                    assert_raises(ValueError, clf.predict_proba, X.T)
                except NotImplementedError:
                    pass
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Symmetric metric
    for metric in [accuracy_score,
                   lambda y1, y2: accuracy_score(y1, y2, normalize=False),
                   zero_one_loss,
                   lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
                   hamming_loss,
                   f1_score,
                   matthews_corrcoef,
                   mean_squared_error,
                   mean_absolute_error]:
        assert_equal(metric(y_true, y_pred), metric(y_pred, y_true),
                     msg="%s is not symmetric" % metric)

    # Not symmetric metrics
    for metric in [precision_score,
                   recall_score,
                   lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
                   lambda y1, y2: fbeta_score(y1, y2, beta=2),
                   explained_variance_score,
                   r2_score]:
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symmetric" % metric)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # symmetric
    assert_equal(accuracy_score(y_true, y_pred),
                 accuracy_score(y_pred, y_true))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one(y_true, y_pred), zero_one(y_pred, y_true))
        assert_almost_equal(zero_one(y_true, y_pred, normalize=False),
                            zero_one(y_pred, y_true, normalize=False), 2)

    assert_equal(zero_one_loss(y_true, y_pred),
                 zero_one_loss(y_pred, y_true))
    assert_equal(zero_one_loss(y_true, y_pred, normalize=False),
                 zero_one_loss(y_pred, y_true, normalize=False))

    with warnings.catch_warnings(True):
        # Throw deprecated warning
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))

    assert_almost_equal(mean_squared_error(y_true, y_pred),
                        mean_squared_error(y_pred, y_true))
    assert_almost_equal(mean_absolute_error(y_true, y_pred),
                        mean_absolute_error(y_pred, y_true))

    # not symmetric
    assert_true(explained_variance_score(y_true, y_pred) !=
                explained_variance_score(y_pred, y_true))
    assert_true(r2_score(y_true, y_pred) != r2_score(y_pred, y_true))
def train_model():
    ### Steps to get and store the tfidf values
    # vectorizer = TfidfVectorizer(stop_words='english', min_n=1, max_n=2,
    #                              smooth_idf=True, sublinear_tf=True,
    #                              max_df=0.5)
    # train_data = vectorizer.fit_transform(generate_emails(training_filenames))
    # test_data = vectorizer.transform(generate_emails(test_filenames))
    # joblib.dump(train_data.tocsr(), 'train_data.joblib')
    # joblib.dump(test_data.tocsr(), 'test_data.joblib')
    # joblib.dump(self.train_target, 'train_target.joblib')
    # joblib.dump(self.test_target, 'test_target.joblib')
    ###
    train_data = joblib.load('train_data.joblib', mmap_mode='c')
    test_data = joblib.load('test_data.joblib', mmap_mode='c')
    train_target = joblib.load('train_target.joblib', mmap_mode='c')
    test_target = joblib.load('test_target.joblib', mmap_mode='c')

    ### Steps to select best features
    # print "Selecting K-best features by chi squared test"
    # start_time = time()
    # ch2 = SelectKBest(chi2, k=100)
    # train_data = ch2.fit_transform(train_data, train_target)
    # test_data = ch2.transform(test_data)
    # print "[Train data] n_samples: %d, n_features: %d" % train_data.shape
    # print "[Test data] n_samples: %d, n_features: %d" % test_data.shape
    # print "Done in %0.3fs" % (time() - start_time)
    ###

    if train_data.shape[0] == 0:
        print "train_data is empty. No vectors to train on."
        return None

    clf = LinearSVC()  # SGDClassifier(n_iter=10, loss='modified_huber')
    print "Training %s" % (clf),
    start_time = time()
    clf.fit(train_data, train_target)
    train_time = time() - start_time
    print "Done in %0.3fs" % train_time

    print "Testing..."
    test_start = time()
    predicted = clf.predict(test_data)
    accuracy = zero_one_score(test_target, predicted)
    error_rate = 1 - accuracy
    test_time = time() - test_start
    print "Done in %0.3fs" % test_time
    # compare against the local test_target (`self` is undefined in this
    # module-level function)
    print "Accuracy: ", numpy.mean(predicted == test_target)
    print "Z1 Accuracy: ", accuracy
def evaluate(self, test_file, encoding='UTF-8', classif_file=None):
    """\
    Evaluate on the given test data file. Return accuracy.
    If classif_file is set, save the classification results to this file.
    """
    test = DataSet()
    test.load_from_arff(test_file, encoding)
    values = self.classify(test)
    golden = self.get_classes(test, dtype=None)
    if classif_file is not None:
        classif = DataSet()
        classif.load_from_vect(test.get_attrib(self.class_attr), values)
        classif.rename_attrib(self.class_attr, self.PREDICTED)
        test.merge(classif)
        test.save_to_arff(classif_file, encoding)
    return zero_one_score(golden, values)
def roi_svc_model_0(X_train, y_train, X_test, y_test):
    """
    An instance of a multi-class classifier -- model-0.
    Return the prediction accuracy.
    """
    # data preprocessing
    y_train_bin = y_train.copy()
    y_train_bin[y_train_bin != 0] = 1
    scaler = preprocessing.Scaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)
    # train the binary-class classifier
    bin_svc = svm.SVC(C=1, kernel='rbf', cache_size=1000,
                      class_weight='auto')
    bin_svc.fit(X_train, y_train_bin)
    # train the multi-class classifier
    labeled_sample_idx = [idx for idx in range(y_train.shape[0])
                          if y_train[idx] != 0]
    X_train_mul = X_train[labeled_sample_idx, :]
    y_train_mul = y_train[labeled_sample_idx]
    mul_label_list = np.unique(y_train_mul)
    mul_label_list = mul_label_list.tolist()
    print mul_label_list
    mul_svc = svm.SVC(C=1, kernel='rbf', cache_size=1000,
                      class_weight='auto')
    mul_svc.fit(X_train_mul, y_train_mul)
    # test the classifier using an independent dataset
    y_predict_bin = bin_svc.predict(X_test)
    selected_sample_idx = [idx for idx in range(y_predict_bin.shape[0])
                           if y_predict_bin[idx] != 0]
    y_predict_mul = mul_svc.predict(X_test[selected_sample_idx, :])
    # calculate the prediction score
    y_predict = np.zeros((y_test.shape[0]))
    y_predict[selected_sample_idx] = y_predict_mul
    score = metrics.zero_one_score(y_test, y_predict)
    precision = metrics.precision_score(y_test, y_predict,
                                        labels=mul_label_list,
                                        pos_label=None)
    recall = metrics.recall_score(y_test, y_predict,
                                  labels=mul_label_list, pos_label=None)
    return score, precision, recall
def C_and_gamma_evaluation(X_tr, y_tr, X_cv, y_cv,
                           classifier_by_C_and_gamma_function,
                           error_measure_function,
                           C, idx_C, gamma, idx_gamma):
    classifier = classifier_by_C_and_gamma_function(X_tr, y_tr, C=C,
                                                    gamma=gamma)
    tr_err, cv_err = error_measure_function(classifier, X_tr, y_tr,
                                            X_cv, y_cv)
    y_pred = classifier.predict(X_cv)
    if hasattr(metrics, "accuracy_score"):
        acc = metrics.accuracy_score(y_cv, y_pred)
    else:
        assert hasattr(metrics, "zero_one_score")
        acc = metrics.zero_one_score(y_cv, y_pred)
    prec = metrics.precision_score(y_cv, y_pred)
    recall = metrics.recall_score(y_cv, y_pred)
    f1_score = metrics.f1_score(y_cv, y_pred)
    return idx_C, idx_gamma, tr_err, cv_err, acc, prec, recall, f1_score
def C_evaluation(X_tr, y_tr, X_cv, y_cv, classifier_by_C_function,
                 error_measure_function, C, idx_C):
    classifier = classifier_by_C_function(X_tr, y_tr, C=C)
    tr_err, cv_err = error_measure_function(classifier, X_tr, y_tr,
                                            X_cv, y_cv)
    # it is assumed that we are dealing with a sklearn classifier...
    y_pred = classifier.predict(X_cv)
    if hasattr(metrics, "accuracy_score"):
        acc = metrics.accuracy_score(y_cv, y_pred)
    else:
        assert hasattr(metrics, "zero_one_score")
        acc = metrics.zero_one_score(y_cv, y_pred)
    prec = metrics.precision_score(y_cv, y_pred)
    recall = metrics.recall_score(y_cv, y_pred)
    f1_score = metrics.f1_score(y_cv, y_pred)
    return idx_C, tr_err, cv_err, acc, prec, recall, f1_score
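# The hasattr() fallback above (and in several other snippets here) guards
# against the sklearn 0.13 rename of zero_one_score to accuracy_score. A
# small helper capturing the pattern once -- the name compat_accuracy is
# ours, and `metrics` is assumed to be `from sklearn import metrics` as in
# the surrounding code:
def compat_accuracy(y_true, y_pred):
    # prefer the modern name, fall back to the deprecated one on old installs
    if hasattr(metrics, "accuracy_score"):
        return metrics.accuracy_score(y_true, y_pred)
    assert hasattr(metrics, "zero_one_score")
    return metrics.zero_one_score(y_true, y_pred)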
def performance_estimation(self, X, y, kernel=SVM_RBF, C=1.0, gamma=None,
                           n_iterations=20, test_size=0.3):
    assert isinstance(C, (int, float))
    set_ripartitions = StratifiedShuffleSplit(y, n_iter=n_iterations,
                                              test_size=test_size,
                                              indices=False)
    if kernel == SVM_linear:
        classifier = LinearSVC(C=C, class_weight='auto')
    elif kernel == SVM_RBF:
        assert isinstance(gamma, (int, float))
        classifier = SVC(kernel="rbf", C=C, gamma=gamma,
                         class_weight='auto')
    elif kernel == SVM_RBF_Chi2_squared:
        classifier = SVC(kernel=chi2_kernel, C=C, class_weight='auto')
    accuracy_avg = 0.0
    precision_avg = 0.0
    recall_avg = 0.0
    f1_score_avg = 0.0
    for train, test in set_ripartitions:
        X_tr, X_cv, y_tr, y_cv = X[train], X[test], y[train], y[test]
        classifier.fit(X_tr, y_tr)
        y_pred = classifier.predict(X_cv)
        if hasattr(metrics, "accuracy_score"):
            acc = metrics.accuracy_score(y_cv, y_pred)
        else:
            assert hasattr(metrics, "zero_one_score")
            acc = metrics.zero_one_score(y_cv, y_pred)
        prec = metrics.precision_score(y_cv, y_pred)
        recall = metrics.recall_score(y_cv, y_pred)
        f1_score = metrics.f1_score(y_cv, y_pred)
        accuracy_avg = accuracy_avg + acc / n_iterations
        precision_avg = precision_avg + prec / n_iterations
        recall_avg = recall_avg + recall / n_iterations
        f1_score_avg = f1_score_avg + f1_score / n_iterations
    return accuracy_avg, precision_avg, recall_avg, f1_score_avg
def _evaluatePredictions(self, report, predictions, groundtruth):
    predicted_labels = 1 * (predictions > .5)
    fpr, tpr, thresholds = roc_curve(groundtruth, predictions)
    roc_auc = auc(fpr, tpr)
    plt.clf()
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic example')
    plt.legend(loc="lower right")

    report.title('Summary', level=2)
    accuracy = zero_one_score(groundtruth, predicted_labels)
    report.text('Global accuracy = %.1f%%' % (100.0 * accuracy))
    report.title('ROC Curve', level=2)
    report.plot()
    plt.close()
    report.title('Short report', level=2)
    report.pre(classification_report(groundtruth, predicted_labels))
    report.title('Confusion matrix', level=2)
    report.table(['Ground truth', '0', '1'])
    confusion = confusion_matrix(groundtruth, predicted_labels)
    for k in xrange(2):
        report.row([k, confusion[k][0], confusion[k][1]])
    report.close()
def classifier(test='50_categories', target='validation', validate=False,
               quick=False):
    """classifier(test='test/', target='target/', validate=False,
    verbose=False): function that attempts to classify a set of images
    based on a set of learning images, using various image features.
    If validate=False, classifier uses half the testing set as a target set,
    and outputs statistics on its performance. If validate=True, classifier
    uses the entire test directory to train on, and outputs a list of the
    target files with predicted classification. If quick=True, classifier
    uses only the first 5 images in each category to train on.
    Note on paths: training path expects a directory with subdirectories
    named with the category label, and validation path expects a directory
    of images"""
    print '%CLASSIFIER: Reading Input Files'  # status update
    test = test + '/'
    target = target + '/'
    # read in files, store data and categories
    dir = test
    subdir = os.listdir(dir)
    # eliminate non-image entries (filter rather than removing items while
    # iterating, which silently skips entries)
    subdir = [element for element in subdir if element[0] != '.']
    images = []      # store images
    categories = []  # store name of categories for each image
    catint = []      # store integer designation of category for each image
    filens = []      # store names of files
    i = -1
    for cat in subdir:  # loop over directories
        i += 1
        files = os.listdir(dir + cat)
        if quick == True:  # speed-learning: 5 images per category
            files = files[:5]
        files = [element for element in files if element[0] != '.']
        categories.append(cat)
        for item in files:  # loop over files
            catint.append(i)
            images.append(np.flipud(plt.imread(dir + cat + '/' + item)))
            filens.append(item)
    # repeat for testing set, if different from target set
    if validate == True:
        valdir = target
        valimages = []
        valfilens = os.listdir(valdir)
        valfilens = [item for item in valfilens if item[0] != '.']
        for item in valfilens:
            valimages.append(np.flipud(plt.imread(valdir + item)))
    # names of features for class Featured
    featnames = ['Total Number of Pixels', 'Aspect Ratio',
                 'Median Number of Edges', 'Fraction of Color in Red',
                 'Fraction of Color in Green', 'Fraction of Color in Blue',
                 'Fraction of Edges in Vertical Orientation',
                 'Skewness in Red Channel', 'Variation in Blue Channel',
                 'Brightness Centering',
                 'Correlation Between Red and Blue Channels',
                 'Correlation Between Red and Green Channels',
                 'Correlation Between Blue and Green Channels',
                 'Number of Connected Bright Objects',
                 'Number of Connected Dim Objects']
    nim = len(catint)       # number of images
    nfeat = len(featnames)  # number of features
    features = np.zeros((nim, nfeat))  # store feature values for each image
    statuspoints = np.linspace(0, nim, 11)  # for status update calculations
    statuspoints = [np.floor(pt) for pt in statuspoints]
    if validate == True:
        nval = len(valfilens)  # number of validation images
        valfeatures = np.zeros((nval, nfeat))  # validation feature values
    # calculate features for images in training set
    print '%CLASSIFIER: Calculating Features'  # status update
    statusn = 0
    for i in range(nim):
        # STATUS UPDATE
        if i in statuspoints:  # print status update about every 10% complete
            print '%CLASSIFIER: ' + theraven(statusn) + \
                ' [' + str(i * 100 / nim) + '%]'
            statusn += 1
        im = Featured(images[i])
        features[i, 0] = im.ncountpix()
        features[i, 1] = im.aspect()
        features[i, 2] = im.mededges()
        features[i, 3] = im.redfrac()
        features[i, 4] = im.greenfrac()
        features[i, 5] = im.bluefrac()
        features[i, 6] = im.vedges()
        features[i, 7] = im.redskew()
        features[i, 8] = im.bluevar()
        features[i, 9] = im.centered()
        features[i, 10] = im.rgcor()
        features[i, 11] = im.rbcor()
        features[i, 12] = im.bgcor()
        features[i, 13] = im.nbright()
        features[i, 14] = im.ndim()
    # calculate features for images in validation set
    if validate == True:
        print '%CLASSIFIER: Calculating Validation Set Features'
        for i in range(nval):
            im = Featured(valimages[i])
            valfeatures[i, 0] = im.ncountpix()
            valfeatures[i, 1] = im.aspect()
            valfeatures[i, 2] = im.mededges()
            valfeatures[i, 3] = im.redfrac()
            valfeatures[i, 4] = im.greenfrac()
            valfeatures[i, 5] = im.bluefrac()
            valfeatures[i, 6] = im.vedges()
            valfeatures[i, 7] = im.redskew()
            valfeatures[i, 8] = im.bluevar()
            valfeatures[i, 9] = im.centered()
            valfeatures[i, 10] = im.rgcor()
            valfeatures[i, 11] = im.rbcor()
            valfeatures[i, 12] = im.bgcor()
            valfeatures[i, 13] = im.nbright()
            valfeatures[i, 14] = im.ndim()
    # building testing and target sets
    if validate == False:
        testim = images[::2]
        targim = images[1::2]
        testfeat = features[::2, :]
        targfeat = features[1::2, :]
        targfiles = filens[1::2]
        testcat = catint[::2]
        targcat = catint[1::2]
    else:
        testim = images
        targim = valimages
        testfeat = features
        targfeat = valfeatures
        targfiles = valfilens
        testcat = catint
    # build random forest
    print '%CLASSIFIER: Building Random Forest'  # status update
    rfc = RandomForestClassifier(compute_importances=True)
    rfc = rfc.fit(testfeat, testcat)
    impt = rfc.feature_importances_
    pred = rfc.predict(targfeat)  # predicted categories for target images
    ncat = max(catint)
    if validate == False:
        randpred = []  # predictions for targcat based on random guessing
        for i in range(len(targim)):
            # randint is inclusive on both ends, so ncat is the largest
            # valid category label
            randpred.append(random.randint(0, ncat))
        score = metrics.zero_one_score(targcat, pred)  # zero-one score
        # zero-one score for random guessing
        randscore = metrics.zero_one_score(targcat, randpred)
        # outputs
        print 'Three Most Important Features:'
        for ifeat in range(3):
            maxind = np.where(impt == np.max(impt))
            impt[maxind[0]] = 0
            print str(ifeat + 1) + '. ' + featnames[maxind[0]]
        print str(int(score * 100)) + \
            '% Good Predictions from Random Forest'
        print str(int(randscore * 100)) + \
            '% Good Predictions from Random Guessing'
    else:
        print 'filename\t\tpredicted_class'
        print '-' * 50
        for i in range(len(targfiles)):
            name = targfiles[i] + ' ' * (30 - len(targfiles[i]))
            print name + '\t' + categories[pred[i]]
x = {"first": name[0], "first2": name[:2], "first3": name[:3], "last": name[-1], "last2": name[-2:], "last3": name[-3:]} for c in "abcdefghijklmnopqrstuvwzyx": x["count(%s)" % c] = name.count(c) return x dv = DictVectorizer() X = dv.fit_transform(gender_features(n) for n in names) # TODO scale/center X X = X.tocsr() print("%d samples, %d features\n" % X.shape) y = np.array([0] * len(female_names) + [1] * len(male_names)) # Instead of splitting our data into training and test sets, # we perform 10-fold cross validation. for clf in (BernoulliNB(), LinearSVC()): print("Training and testing %r" % clf) for i, (train, test) in enumerate(StratifiedKFold(y, k=10)): clf.fit(X[train], y[train]) y_pred = clf.predict(X[test]) acc = zero_one_score(y[test], y_pred) print(" Fold: %d Accuracy: %.2f%%" % (i, acc * 100))
def rfe_curves(self, X, y):
    num_samples, num_features = X.shape
    tr_err_rfe = np.zeros(num_features)
    cv_err_rfe = np.zeros(num_features)
    accuracy_rfe = np.zeros(num_features)
    recall_rfe = np.zeros(num_features)
    precision_rfe = np.zeros(num_features)
    f1_score_rfe = np.zeros(num_features)
    for i in xrange(num_features):
        mask = np.zeros(num_features)
        mask[:i + 1] = 1
        new_mask = np.tile(mask == 1, (num_samples, 1))
        extracted_X = X[new_mask]
        extracted_X = np.reshape(extracted_X, (num_samples, i + 1))
        set_ripartitions = StratifiedShuffleSplit(
            y, n_iter=self.n_iterations, test_size=self.test_size,
            indices=False)
        n_iter = len(set_ripartitions)
        for train, test in set_ripartitions:
            X_tr, X_cv, y_tr, y_cv = (extracted_X[train],
                                      extracted_X[test],
                                      y[train], y[test])
            if self.kernel == SVM_RBF:
                classifier = SVM_RBF_by_C_and_gamma_function(
                    X_tr, y_tr, C=self.C, gamma=self.gamma)
                tr_err, cv_err = misclassification_errors(
                    classifier, X_tr, y_tr, X_cv, y_cv)
            elif self.kernel == SVM_linear:
                classifier = linear_SVM_by_C_function(X_tr, y_tr, C=self.C)
                tr_err, cv_err = misclassification_errors(
                    classifier, X_tr, y_tr, X_cv, y_cv)
            elif self.kernel == SVM_RBF_Chi2_squared:
                classifier = SVM_RBF_Chi2_squared_by_C_function(
                    X_tr, y_tr, C=self.C)
                tr_err, cv_err = misclassification_errors(
                    classifier, X_tr, y_tr, X_cv, y_cv)
            y_pred = classifier.predict(X_cv)
            if hasattr(metrics, "accuracy_score"):
                acc = metrics.accuracy_score(y_cv, y_pred)
            else:
                assert hasattr(metrics, "zero_one_score")
                acc = metrics.zero_one_score(y_cv, y_pred)
            prec = metrics.precision_score(y_cv, y_pred)
            recall = metrics.recall_score(y_cv, y_pred)
            f1_score = metrics.f1_score(y_cv, y_pred)
            tr_err_rfe[i] = tr_err_rfe[i] + tr_err / n_iter
            cv_err_rfe[i] = cv_err_rfe[i] + cv_err / n_iter
            accuracy_rfe[i] = accuracy_rfe[i] + acc / n_iter
            recall_rfe[i] = recall_rfe[i] + recall / n_iter
            precision_rfe[i] = precision_rfe[i] + prec / n_iter
            f1_score_rfe[i] = f1_score_rfe[i] + f1_score / n_iter
    return (tr_err_rfe, cv_err_rfe, accuracy_rfe, recall_rfe,
            precision_rfe, f1_score_rfe)
def main():
    clf = joblib.load('svc_wordnet.pkl')
    feature_index = {}
    with open('data.p', 'rb') as fp:
        feature_index = pickle.load(fp)
    y_true = []
    y_pred = []
    X = []
    fpath = open(os.getcwd() + '/testing_set_path.txt')
    f = open('DMOZ_chi2_testing.txt', 'w')
    class_no = 0
    for line in fpath.read().split('\n'):
        print line
        path = line
        if path == '':
            break
        for file in glob.glob(os.path.join(path, '*.txt')):
            #print file
            mapping = [0] * 1408
            for word in open(file).read().split():
                if len(word) < 2:
                    continue
                index = feature_index.get(word)
                #print index
                if index is not None:
                    mapping[index] = 1
                else:
                    for ss in wn.synsets(word):
                        for l in ss.lemmas():
                            index = feature_index.get(l.name)
                            if index is not None:
                                mapping[index] = 1
                                break
                    list = []
                    for syn_set in wn.synsets(word):
                        for syn in syn_set.lemmas():
                            list.append(syn.name)
                    for w in list:
                        index = feature_index.get(w)
                        if index is not None:
                            mapping[index] = 1
                            break
            X.append(mapping)
            y_true.append(class_no)
            f.write(str(class_no))
            f.write(" ")
            for m in mapping:
                # write each feature value followed by a space; the original
                # " ".join(str(m)) only behaved correctly for single-digit
                # values
                f.write(str(m) + " ")
            f.write('\n')
            y_pred.append(int(clf.predict(mapping)))
        class_no = class_no + 1
    f.close()
    with open('testing_X.p', 'wb') as fp:
        pickle.dump(X, fp)
    with open('testing_y_true.p', 'wb') as fp:
        pickle.dump(y_true, fp)
    with open('testing_y_pred.p', 'wb') as fp:
        pickle.dump(y_pred, fp)
    #print y_true
    #print y_pred
    target_names = ['Arts', 'Business', 'Computers', 'Games', 'Health',
                    'Home', 'News', 'Recreation', 'Reference', 'Regional',
                    'Science', 'Shopping', 'Society', 'Sports']
    print(classification_report(y_true, y_pred, target_names=target_names))
    accuracy = zero_one_score(y_true, y_pred)
    print 'accuracy', accuracy
    print metrics.precision_score(y_true, y_pred, average='macro')
    print metrics.recall_score(y_true, y_pred, average='micro')
    print metrics.f1_score(y_true, y_pred, average='weighted')
    f = open('Result_class_bns.txt', 'w')
    f.writelines(classification_report(y_true, y_pred,
                                       target_names=target_names))
    f.close()
from sklearn.datasets import load_digits
from sklearn.svm import SVC
from sklearn.utils import shuffle
from sklearn.metrics import zero_one_score
import numpy as np

digits = load_digits()
X, y = shuffle(digits.data, digits.target)
X_train, X_test = X[:1000, :], X[1000:, :]
y_train, y_test = y[:1000], y[1000:]

svc = SVC(kernel='precomputed')

kernel_train = np.dot(X_train, X_train.T)  # linear kernel
svc.fit(kernel_train, y_train)

#kernel_test = np.dot(X_test, X_train[svc.support_, :].T)
kernel_test = np.dot(X_test, X_train.T)
y_pred = svc.predict(kernel_test)
print(zero_one_score(y_test, y_pred))
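# Usage note for the snippet above: with kernel='precomputed', fit() expects
# the square (n_train, n_train) Gram matrix and predict() expects the
# (n_test, n_train) kernel between test and *training* points, which is why
# kernel_test is built against all of X_train. The commented-out variant
# restricted to svc.support_ leans on the fact that only support-vector
# columns get nonzero dual coefficients; whether that narrower shape is
# accepted depends on the scikit-learn version, so the full matrix is the
# safe form.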
std = X.std(axis=0)
X = (X - mean) / std

for clf, name in (
        (SGDClassifier(n_iter=100, alpha=0.01), "plain sgd"),
        (SGDClassifier(n_iter=100, alpha=0.01, class_weight={1: 10}),
         "weighted sgd"),
        (SGDRanking(n_iter=1000, alpha=0.01, loss='roc_pairwise_ranking'),
         "pairwise sgd"),
        (RankSVM(n_iter=100, alpha=0.01, loss='hinge'), 'RankSVM'),
):
    clf.fit(X, y)
    print clf
    pred = clf.predict(X)
    print "ACC: %.4f" % metrics.zero_one_score(y, pred)
    print "AUC: %.4f" % metrics.auc_score(y, pred)
    print "CONFUSION MATRIX: "
    print metrics.confusion_matrix(y, pred)
    print "Kendall Tau: %.4f" % kendalltau(clf, X, y)
    print 80 * '='

clf = MinirankSVM(max_iter=100, alpha=0.01).fit(X, y)
print clf
scores = np.dot(X, clf.coef_)
pred = (scores > 0).astype(np.int)
print "ACC: %.4f" % metrics.zero_one_score(y, pred)
print "AUC: %.4f" % metrics.auc_score(y, pred)
print "CONFUSION MATRIX: "
print metrics.confusion_matrix(y, pred)
print "Kendall Tau: %.4f" % kendalltau(clf, X, y)
try:
    Xte, Yte, class_map, feature_names, test_image_names = pickle.load(
        open('testing_set.p', 'r'))
    print 'opening saved test set features...'
except:
    print 'calculating test set features...'
    # need to pull class_map and feature_names from training set
    Xtr, Ytr, class_map, feature_names, ignore = pickle.load(
        open('training_set.p', 'r'))
    # now, calculate all of the features for the testing set
    from feature_calc import calculate_features
    Xte, Yte, ignore, ignore, test_image_names = calculate_features(
        'validation_images', 'testing_set.p', class_map=class_map)

print 'predicting the classes of verification images...'
pred = clf.predict(Xte)

rfor_01_score = metrics.zero_one_score(Yte, pred)  # zero-one score
print "Zero-One Score: " + str(rfor_01_score)

# create and save the confusion matrix
confmat = metrics.confusion_matrix(Yte, pred)
plt.close("all")
plt.imshow(confmat, interpolation="nearest", origin="upper")
plt.savefig("confusion_matrix.pdf")
plt.close("all")

# show the feature importances
print "Summary of feature importances"
for n in range(len(feature_names)):
    print "\t", round(clf.feature_importances_[n], 4), feature_names[n]

# reverse the class_map to get the names for each category
def test_classifiers_train():
    # test if classifiers do something sensible on training set
    # also test all shapes / shape errors
    estimators = all_estimators()
    classifiers = [(name, E) for name, E in estimators
                   if issubclass(E, ClassifierMixin)]
    iris = load_iris()
    X_m, y_m = iris.data, iris.target
    X_m, y_m = shuffle(X_m, y_m, random_state=7)
    X_m = StandardScaler().fit_transform(X_m)
    # generate binary problem from multi-class one
    y_b = y_m[y_m != 2]
    X_b = X_m[y_m != 2]
    for (X, y) in [(X_m, y_m), (X_b, y_b)]:
        # do it once with binary, once with multiclass
        classes = np.unique(y)
        n_classes = len(classes)
        n_samples, n_features = X.shape
        for name, Clf in classifiers:
            if Clf in dont_test or Clf in meta_estimators:
                continue
            if Clf in [MultinomialNB, BernoulliNB]:
                # TODO also test these!
                continue
            # catch deprecation warnings
            with warnings.catch_warnings(record=True):
                clf = Clf()
                # raises error on malformed input for fit
                assert_raises(ValueError, clf.fit, X, y[:-1])
                # fit
                clf.fit(X, y)
                y_pred = clf.predict(X)
                assert_equal(y_pred.shape, (n_samples,))
                # training set performance
                assert_greater(zero_one_score(y, y_pred), 0.78)
                # raises error on malformed input for predict
                assert_raises(ValueError, clf.predict, X.T)
                if hasattr(clf, "decision_function"):
                    try:
                        # decision_function agrees with predict:
                        decision = clf.decision_function(X)
                        if n_classes == 2:
                            assert_equal(decision.ravel().shape,
                                         (n_samples,))
                            dec_pred = (decision.ravel() > 0).astype(np.int)
                            assert_array_equal(dec_pred, y_pred)
                        if n_classes == 3 and not isinstance(clf,
                                                             BaseLibSVM):
                            # 1on1 of LibSVM works differently
                            assert_equal(decision.shape,
                                         (n_samples, n_classes))
                            assert_array_equal(np.argmax(decision, axis=1),
                                               y_pred)
                        # raises error on malformed input
                        assert_raises(ValueError, clf.decision_function,
                                      X.T)
                        # raises error on malformed input for
                        # decision_function
                        assert_raises(ValueError, clf.decision_function,
                                      X.T)
                    except NotImplementedError:
                        pass
                if hasattr(clf, "predict_proba"):
                    try:
                        # predict_proba agrees with predict:
                        y_prob = clf.predict_proba(X)
                        assert_equal(y_prob.shape, (n_samples, n_classes))
                        assert_array_equal(np.argmax(y_prob, axis=1), y_pred)
                        # check that probas for all classes sum to one
                        assert_array_almost_equal(np.sum(y_prob, axis=1),
                                                  np.ones(n_samples))
                        # raises error on malformed input
                        assert_raises(ValueError, clf.predict_proba, X.T)
                        # raises error on malformed input for predict_proba
                        assert_raises(ValueError, clf.predict_proba, X.T)
                    except NotImplementedError:
                        pass
                if hasattr(clf, "classes_"):
                    if hasattr(clf, "n_outputs_"):
                        assert_equal(clf.n_outputs_, 1)
                        assert_array_equal(
                            clf.classes_, [classes],
                            "Unexpected classes_ attribute for %r" % clf)
                    else:
                        # flat classes array: XXX inconsistent
                        assert_array_equal(
                            clf.classes_, classes,
                            "Unexpected classes_ attribute for %r" % clf)
def train(self, trainset, labels, config, val_set, val_labels):
    # TODO: adaptive learning rate
    n, dim = trainset.shape
    max_iter = config.iterations
    batchsize = config.batchsize
    learning_rate = config.learning_rate
    momentum = config.momentum
    weight_decay = config.weight_decay
    learning_rates = [0.1, 0.2]
    numbatches = n / batchsize
    self.layerstates = [np.array([]) for layersize in self.layersizes]
    tiny = np.ones((batchsize, self.n_classes)) * 0.000001
    val_error = 1000
    best_val_error = 1000
    best_weights = []
    if val_set != None:
        tiny_val = np.ones((val_labels.shape[0], self.n_classes)) * 0.000001
    for iteration in range(max_iter):
        print 'Iteration ' + str(iteration + 1)
        iteration_error = 0
        t0 = time.clock()
        for j in range(numbatches):
            # get training batch and targets
            batch = trainset[j * batchsize:(j + 1) * batchsize, :]
            target = labels[j * batchsize:(j + 1) * batchsize, :]
            # forward propagation
            output = self.forward_pass(batch, config)
            batch_error = 0
            if config.error == 'cross-entropy':
                self.w_delta[-1] = output - target
                batch_error = -np.sum(np.sum(
                    np.multiply(target, np.log(output + tiny)), 1)) \
                    / batchsize
            else:
                if config.nonlinearity == 'sigmoid':
                    deriv = np.multiply(output,
                                        np.ones(output.shape) - output)
                elif config.nonlinearity == 'tanh':
                    deriv = np.ones(output.shape) - np.square(output)
                self.w_delta[-1] = np.multiply(target - output, deriv)
            iteration_error += batch_error
            # backpropagation: compute deltas
            for i in range(self.n_layers - 2, -1, -1):
                out = self.layerstates[i]
                if config.nonlinearity == 'sigmoid':
                    deriv = np.multiply(out, np.ones(out.shape) - out)
                elif config.nonlinearity == 'tanh':
                    deriv = np.ones(out.shape) - np.square(out)
                tmp = self.w_delta[i + 1] * \
                    np.transpose(self.weights[i][:-1, :])
                self.w_delta[i] = np.multiply(tmp, deriv)
            # compute derivatives from deltas
            # update weights
            for i in range(self.n_layers - 1):
                activations = np.append(self.layerstates[i],
                                        np.ones((batchsize, 1)), 1)
                w_deriv = np.transpose(activations) * self.w_delta[i + 1] \
                    / batchsize + momentum * self.w_deriv_old[i]
                regularizer = weight_decay * self.weights[i]
                # no regularization on bias
                regularizer[-1, :] = np.zeros((1,
                                               self.weights[i].shape[1]))
                self.weights[i] = self.weights[i] \
                    - learning_rates[i] * w_deriv - regularizer
                self.w_deriv_old[i] = w_deriv
        iteration_error /= numbatches
        if val_set != None:
            res = self.predict(val_set, config)
            predicted = self.label_converter.transform(res)
            groundtruth = self.label_converter.transform(val_labels)
            val_error_old = val_error
            val_error = -np.sum(np.sum(
                np.multiply(groundtruth, np.log(predicted + tiny_val)),
                1)) / groundtruth.shape[0]
            if val_error < best_val_error:
                best_val_error = val_error
                best_weights = self.weights
            print 'Accuracy on val: ' + str(zero_one_score(val_labels, res))
            if val_error > val_error_old + 0.15:
                self.weights = best_weights
                print 'Early stop: ' + str(val_error)
                break
        for i in range(self.n_layers - 1):
            learning_rates[i] = 0.9 * learning_rates[i]
        self.log_file.write('Iteration ' + str(iteration + 1) + '\n')
        self.log_file.write('   Error train: ' + str(iteration_error) + '\n')
        self.log_file.write('   Error val: ' + str(val_error) + '\n')
        self.log_file.write('   Time for iteration: '
                            + str(time.clock() - t0) + '\n')
        self.log_file.flush()
        print 'Errors: ' + str(iteration_error / numbatches) + ',' + \
            str(val_error)
def main():
    clf = joblib.load('svc_wordnet.pkl')
    feature_index = {}
    with open('data.p', 'rb') as fp:
        feature_index = pickle.load(fp)
    y_true = []
    y_pred = []
    X = []
    fpath = open(os.getcwd() + '/testing_set_path.txt')
    f = open("20NG_ig_test.txt", "w")
    class_no = 0
    for line in fpath.read().split('\n'):
        print line
        path = line
        if path == '':
            break
        for file in glob.glob(os.path.join(path, '*')):
            #print file
            mapping = [0] * 2260
            for word in open(file).read().split():
                index = feature_index.get(word)
                #print index
                if index is not None:
                    mapping[index] = 1
                else:
                    for ss in wn.synsets(word):
                        for l in ss.lemmas():
                            index = feature_index.get(l.name)
                            if index is not None:
                                mapping[index] = 1
                                break
                    list = []
                    for syn_set in wn.synsets(word):
                        for syn in syn_set.lemmas():
                            list.append(syn.name)
                    for w in list:
                        index = feature_index.get(w)
                        if index is not None:
                            mapping[index] = 1
                            break
            X.append(mapping)
            y_true.append(class_no)
            y_pred.append(int(clf.predict(mapping)))
            f.write(str(class_no))
            f.write(" ")
            for m in mapping:
                f.write(str(m))
                f.write(" ")
            f.write('\n')
        class_no = class_no + 1
    f.close()
    with open('testing_X.p', 'wb') as fp:
        pickle.dump(X, fp)
    with open('testing_y_true.p', 'wb') as fp:
        pickle.dump(y_true, fp)
    with open('testing_y_pred.p', 'wb') as fp:
        pickle.dump(y_pred, fp)
    #print y_true
    #print y_pred
    target_names = ['Alt', 'Computers', 'Miscellaneous', 'Rec', 'Science',
                    'Social', 'Talk']
    print(classification_report(y_true, y_pred, target_names=target_names))
    accuracy = zero_one_score(y_true, y_pred)
    print 'accuracy', accuracy
    print metrics.precision_score(y_true, y_pred, average='macro')
    print metrics.recall_score(y_true, y_pred, average='micro')
    print metrics.f1_score(y_true, y_pred, average='weighted')
trainlabels = labels[:n_train]
#valset = records[n_train:,:]
#vallabels = labels[n_train:,:]
valset = records[n_train:n_train + n_val, :]
vallabels = labels[n_train:n_train + n_val]

n, dim = trainset.shape

# mean centering, stdev normalization and whitening
scaler = Scaler()
scaler.fit(trainset)
trainset = scaler.transform(trainset)
valset = scaler.transform(valset)
pca = PCA(n_components=dim, whiten=True)
pca.fit(trainset)
trainset = pca.transform(trainset)
valset = pca.transform(valset)

config = Train_config()
config.iterations = 10
config.nonlinearity = 'tanh'
config.batchsize = 50
config.learning_rate = 0.2
config.momentum = 0.7

log = open('log.txt', 'w')
nn = Net([dim, 300, 10], log_file=log)
nn.fit(trainset, trainlabels, config, val_set=valset, val_labels=vallabels)

nn_file = open('nn.obj', 'w')
pickle.dump(nn.weights, nn_file)

results = nn.predict(valset, config)
print zero_one_score(vallabels, results)
def calculate_testing_accuracy(self, Y, predict):
    redundancy = sum(self.Y) * 1.0 / len(self.Y)
    accuracy = zero_one_score(Y, predict)
    precision = precision_score(Y, predict)
    recall = recall_score(Y, predict)
    f1 = f1_score(Y, predict)
    # flag = False
    # return flag
    if decadecount[1] < cap or decadecount[2] < cap or decadecount[3] < cap:
        flag = False
    return flag

cap = 40
#sys.stdout = open("output.txt", "w")
# raw string so the backslashes in the Windows path are literal
traindir = r"C:\Users\gouthamdl\Desktop\data"
segments, csegments = getclusters(traindir)
#n_samples, n_features = segments.shape
#print segments
#print 'csegments'
#print csegments

print 'Performing Clustering'
estimator = KMeans(init='k-means++', n_clusters=50, n_init=1)
cestimator = KMeans(init='k-means++', n_clusters=50, n_init=1)
kmeans = estimator.fit(segments)
# fit csegments with their own estimator; cestimator was created for this
ckmeans = cestimator.fit(csegments)

features, labels = buildfeatures(traindir, kmeans, ckmeans, ext='.h5')
features = array(features)
labels = array(labels)
X_train, X_test, y_train, y_test = cross_validation.train_test_split(
    features, labels, test_size=0.1, random_state=0)

print 'Performing Classification'
clf2 = LogisticRegression().fit(X_train, y_train)
dec_pred2 = clf2.predict(X_test)
accuracy = zero_one_score(y_test, dec_pred2)
print 'Accuracy with Logistic Regression : ' + str(accuracy)
#for x, y in zip(y_test, dec_pred2):
#    print 'Actual Decade : ' + str(x) + ' Predicted Decade : ' + str(y)
def train_classifier(self):
    self.classifer.fit(self.train_features, self.train_labels)
    self.predicted_train_labels = self.classifer.predict(
        self.train_features)
    self.train_accuracy = sklearn_metrics.zero_one_score(
        self.train_labels, self.predicted_train_labels)
X = Scaler().fit_transform(X)
for name, Clf in classifiers:
    if Clf in dont_test or Clf in meta_estimators:
        continue
    if Clf in [MultinomialNB, BernoulliNB]:
        # TODO also test these!
        continue
    # catch deprecation warnings
    with warnings.catch_warnings(record=True):
        clf = Clf()
        # fit
        clf.fit(X, y)
        y_pred = clf.predict(X)
        assert_equal(y_pred.shape[0], n_samples)
        # training set performance
        assert_greater(zero_one_score(y, y_pred), 0.78)
        # raises error on malformed input for predict
        assert_raises(ValueError, clf.predict, X.T)
        if hasattr(clf, "decision_function"):
            try:
                # decision_function agrees with predict:
                decision = clf.decision_function(X)
                assert_equal(decision.shape, (n_samples, n_labels))
                if not isinstance(clf, BaseLibSVM):
                    # 1on1 of LibSVM works differently
                    assert_array_equal(np.argmax(decision, axis=1), y_pred)
                # raises error on malformed input for decision_function
                assert_raises(ValueError, clf.decision_function, X.T)
            except NotImplementedError:
                pass
def test_classifier(self):
    self.predicted_test_labels = self.classifer.predict(self.test_features)
    self.test_accuracy = sklearn_metrics.zero_one_score(
        self.test_labels, self.predicted_test_labels)
def all_or_nothing_score(y_true, y_pred, groups):
    hits = [np.all(y_true[np.where(groups == this_group)]
                   == y_pred[np.where(groups == this_group)])
            for this_group in np.unique(groups)]
    return np.mean(hits)


def all_or_nothing_contig(y_true, y_pred, groups):
    matches = 0
    n_groups = 0
    is_good = False
    for k, (this_y_true, this_y_pred) in enumerate(zip(y_true, y_pred)):
        if groups[k] != groups[k - 1]:
            n_groups += 1
            matches += is_good
            is_good = True
        if this_y_true != this_y_pred:
            is_good = False
    matches += is_good
    return (matches * 1.0) / n_groups

# what proportion of candidate hyphens were predicted correctly?
print zero_one_score(y_true, y_pred)  # 0.973684210526
# what proportion of words did we get completely right?
print all_or_nothing_score(y_true, y_pred, groups)  # 0.75
print all_or_nothing_contig(y_true, y_pred, groups)
# Test for 10 rounds using the results from 10 fold cross validations
for train_index, test_index in kf:
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    clf.fit(X_train, y_train)
    train_time = time() - t0

    pred = clf.predict(X_test)
    test_time = time() - t0

    # metrics
    f1_score = metrics.f1_score(y_test, pred)
    acc_score = metrics.zero_one_score(y_test, pred)
    pre_score = metrics.precision_score(y_test, pred)
    rec_score = metrics.recall_score(y_test, pred)

    f1_all += f1_score
    acc_all += acc_score
    pre_all += pre_score
    rec_all += rec_score

f1_all = f1_all / num_fold
acc_all = acc_all / num_fold
pre_all = pre_all / num_fold
rec_all = rec_all / num_fold

print
print clf
print "average f1-score:   %0.5f" % f1_all
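# A sketch of the same averaging done with the library's own helper, assuming
# the era-appropriate sklearn.cross_validation module and the clf/X/y/num_fold
# names from the fragment above:
#
#     from sklearn.cross_validation import cross_val_score
#     scores = cross_val_score(clf, X, y, cv=num_fold)
#     print "average accuracy:   %0.5f +/- %0.5f" % (scores.mean(),
#                                                    scores.std())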