def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Wrap ``clf`` in a one-vs-one scheme, fit it and print the fit time.

    Args:
        clf: base estimator handed to ``OneVsOneClassifier``.
        X_train: training feature matrix.
        y_train: training labels.
        X_test: not used by this function; kept so existing callers still work.
        y_test: not used by this function; kept so existing callers still work.

    Returns:
        The fitted ``OneVsOneClassifier``.
    """
    clf = OneVsOneClassifier(clf)
    # The clock must start before fitting: previously ``t0`` was read before
    # it was ever assigned, which raised UnboundLocalError on every call.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    return clf
def gen_svc(train_model):
    '''Given a training model, generates the SVM (and DictVectorizer) for it

    Args:
        train_model: a training model object. should have 2 attributes:
            feature_lists, a map from POS tag to a dictionary of features
            (the ones used in the ith decision), and action_lists, a map from
            POS tag to the action (Shift, Left, Right) chosen for the ith
            decision
    Returns:
        dictionary mapping POS tag to a (vectorizer, SVM) tuple
    Raises:
        None
    '''
    models = {}
    for pos_tag in train_model.feature_lists:
        features = train_model.feature_lists[pos_tag]
        actions = train_model.action_lists[pos_tag]

        vec = DictVectorizer()
        feature_mat = vec.fit_transform(features)
        trained_svc = OneVsOneClassifier(LinearSVC())
        try:
            trained_svc.fit(feature_mat, np.array(actions))
        except ValueError:
            # occasionally we get the same action for everything with a
            # particular POS, which raises an error. so in that case we just
            # use a custom class that always predicts the same action.
            # The constant has to be that action, not the first feature
            # dictionary (which is what used to be passed here).
            trained_svc = AlwaysPredict(actions[0])
        models[pos_tag] = (vec, trained_svc)
    return models
def svm_training(train_X, train_Y, kernel):
    """Fit a one-vs-one SVM on the given data and return it.

    A linear SVC (``random_state=0``) is used when ``kernel == False``;
    any other value selects an SVC with an RBF kernel.
    """
    # Equality with False (not truthiness) is the contract: None selects RBF.
    use_linear = (kernel == False)  # noqa: E712
    base = svm.LinearSVC(random_state=0) if use_linear else svm.SVC(kernel='rbf')
    model = OneVsOneClassifier(base)
    model.fit(train_X, train_Y)
    return model
def test_ovo_partial_fit_predict():
    """Incremental OvO fitting must agree with a single full fit on iris."""
    X, y = shuffle(iris.data, iris.target)

    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    pred_incremental = incremental.predict(X)

    batch = OneVsOneClassifier(MultinomialNB())
    batch.fit(X, y)
    pred_batch = batch.predict(X)

    assert_equal(len(incremental.estimators_),
                 n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred_incremental), 0.65)
    assert_almost_equal(pred_incremental, pred_batch)

    # Test when mini-batches don't have all target classes
    data, target = iris.data, iris.target
    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(data[:60], target[:60], np.unique(target))
    incremental.partial_fit(data[60:], target[60:])
    pred_incremental = incremental.predict(data)

    batch = OneVsOneClassifier(MultinomialNB())
    pred_batch = batch.fit(data, target).predict(data)

    assert_almost_equal(pred_incremental, pred_batch)
    assert_equal(len(incremental.estimators_), len(np.unique(target)))
    assert_greater(np.mean(target == pred_incremental), 0.65)
def gen_svc(train_model):
    '''Build the DictVectorizer and the one-vs-one linear SVM for a model.

    Reads ``train_model.feature_list`` and ``train_model.action_list`` and
    returns the fitted ``(vectorizer, classifier)`` pair.
    '''
    vectorizer = DictVectorizer()
    features = vectorizer.fit_transform(train_model.feature_list)
    actions = np.array(train_model.action_list)

    # for some reason just SVC() seems to always suggest "Shift"
    classifier = OneVsOneClassifier(LinearSVC())
    classifier.fit(features, actions)
    return vectorizer, classifier
def test_ovo_fit_on_list():
    """OvO fitted on a list of targets predicts the same as on an array."""
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))

    ovo.fit(iris.data, iris.target)
    prediction_from_array = ovo.predict(iris.data)

    # Same estimator object, refitted with the targets given as a plain list
    ovo.fit(iris.data, list(iris.target))
    prediction_from_list = ovo.predict(iris.data)

    assert_array_equal(prediction_from_array, prediction_from_list)
def test_ovo_string_y():
    """String class labels must come back unchanged from OvO predictions."""
    labels = np.array(['a', 'b', 'c', 'd'])
    samples = np.eye(4)

    classifier = OneVsOneClassifier(LinearSVC())
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    assert_array_equal(labels, predicted)
def test_ovo_string_y():
    "Check that OvO round-trips string labels without re-encoding them"
    labels = np.array(['a', 'b', 'c', 'd'])
    samples = np.eye(4)

    ovo = OneVsOneClassifier(LinearSVC())
    ovo.fit(samples, labels)

    assert_array_equal(labels, ovo.predict(samples))
def OneVsOne(inputs_train, inputs_valid, target_train, target_valid):
    """Fit a one-vs-one LinearSVC and score it on the validation set.

    Returns a ``(name, correctRate)`` tuple where ``correctRate`` is the
    percentage of validation samples predicted correctly.
    """
    model = OneVsOneClassifier(LinearSVC(random_state=0))
    model.fit(inputs_train, np.ravel(target_train))

    predicted = model.predict(inputs_valid)
    hits = np.count_nonzero(np.ravel(target_valid) == predicted)
    n_samples = target_valid.shape[0]

    return "Multiclass One Vs One", (float(hits) / n_samples) * 100
def test_ovo_fit_predict():
    """OvO builds one estimator per class pair for both kinds of base model."""
    expected_estimators = n_classes * (n_classes - 1) / 2

    # A classifier which implements decision_function.
    svc_ovo = OneVsOneClassifier(LinearSVC(random_state=0))
    svc_ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(svc_ovo.estimators_), expected_estimators)

    # A classifier which implements predict_proba.
    nb_ovo = OneVsOneClassifier(MultinomialNB())
    nb_ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(nb_ovo.estimators_), expected_estimators)
def svm(X, Y):
    """Fit a one-vs-one LinearSVC on part of ``(X, Y)`` and print its accuracy.

    Samples whose index is not a multiple of 7 form the training set;
    samples whose index is a multiple of 10 form the evaluation set. Both
    features and labels are converted to ``uint8`` arrays.

    Args:
        X: sequence of feature vectors.
        Y: sequence of labels aligned with ``X``.

    Returns:
        None. The accuracy on the evaluation set is printed.
    """
    X_train = np.array([x for i, x in enumerate(X) if i % 7 != 0], dtype=np.uint8)
    y_train = np.array([z for i, z in enumerate(Y) if i % 7 != 0], dtype=np.uint8)
    X_test = np.array([x for i, x in enumerate(X) if i % 10 == 0], dtype=np.uint8)
    y_test = np.array([z for i, z in enumerate(Y) if i % 10 == 0], dtype=np.uint8)

    clf = OneVsOneClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)
    # Predict with the classifier fitted above; the previous code called an
    # undefined ``rf`` here.
    y_predicted = clf.predict(X_test)

    results = [prediction == truth
               for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    # Single-argument call form prints the same on Python 2 and Python 3.
    print(accuracy)
def test_pairwise_indices():
    """Check the size of ``pairwise_indices_`` for a precomputed-kernel OvO."""
    X, y = iris.data, iris.target
    gram = np.dot(X, X.T)  # linear kernel

    ovo = OneVsOneClassifier(svm.SVC(kernel="precomputed"))
    ovo.fit(gram, y)

    n_estimators = len(ovo.estimators_)
    for indices in ovo.pairwise_indices_:
        assert_equal(indices.shape[0] * n_estimators / (n_estimators - 1),
                     gram.shape[0])
def test_ovo_partial_fit_predict():
    """Exercise ``OneVsOneClassifier.partial_fit`` against a full ``fit``."""
    dataset = datasets.load_iris()
    X, y = dataset.data, dataset.target

    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(X[:100], y[:100], np.unique(y))
    incremental.partial_fit(X[100:], y[100:])
    pred_incremental = incremental.predict(X)

    batch = OneVsOneClassifier(MultinomialNB())
    batch.fit(X, y)
    pred_batch = batch.predict(X)

    assert_equal(len(incremental.estimators_),
                 n_classes * (n_classes - 1) / 2)
    assert_greater(np.mean(y == pred_incremental), 0.65)
    assert_almost_equal(pred_incremental, pred_batch)

    # Test when mini-batches have binary target classes
    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(X[:60], y[:60], np.unique(y))
    incremental.partial_fit(X[60:], y[60:])
    pred_incremental = incremental.predict(X)

    batch = OneVsOneClassifier(MultinomialNB())
    pred_batch = batch.fit(X, y).predict(X)

    assert_almost_equal(pred_incremental, pred_batch)
    assert_equal(len(incremental.estimators_), len(np.unique(y)))
    assert_greater(np.mean(y == pred_incremental), 0.65)

    # Random features with five classes split unevenly over two mini-batches
    ovo = OneVsOneClassifier(MultinomialNB())
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    ovo.partial_fit(X[:7], y[:7], [0, 1, 2, 3, 4])
    ovo.partial_fit(X[7:], y[7:])
    pred_incremental = ovo.predict(X)

    batch = OneVsOneClassifier(MultinomialNB())
    pred_batch = batch.fit(X, y).predict(X)
    assert_almost_equal(pred_incremental, pred_batch)

    # raises error when mini-batch does not have classes from all_classes
    ovo = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    known_classes = np.unique(y)
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(np.unique(error_y),
                                                          known_classes))
    assert_raises_regexp(ValueError, message_re, ovo.partial_fit, X[:7],
                         error_y, np.unique(y))

    # test partial_fit only exists if estimator has it:
    svc_ovo = OneVsOneClassifier(SVC())
    assert_false(hasattr(svc_ovo, "partial_fit"))
def test_ovo_ties():
    """Ties are broken with the decision function, not the smallest label."""
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recalculate votes to make sure we have a tie
    binary_models = multi_clf.estimators_
    predictions = np.vstack([model.predict(X) for model in binary_models])
    scores = np.vstack([model.decision_function(X) for model in binary_models])

    # classifiers are in order 0-1, 0-2, 1-2
    # aggregate votes: map each binary output onto its class column
    rows = np.arange(4)
    votes = np.zeros((4, 3))
    voted_columns = (predictions[0], 2 * predictions[1], 1 + predictions[2])
    for columns in voted_columns:
        votes[rows, columns] += 1

    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 0)

    # in the zero-one classifier, the score for 0 is greater than the score for
    # one.
    assert_greater(scores[0][0], scores[0][1])
    # score for one is greater than score for zero
    assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0])
    # score for one is greater than score for two
    assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
def svm_classification(genres, features_type):
    """Train a linear one-vs-one SVM on genre features and report accuracy.

    Features are read for the training and testing directories, the mean
    vector of each item is used as its feature vector, and the accuracy is
    both printed and written to file through ``rt``.
    """

    def to_arrays(feature_set):
        # Each entry unpacks to (mean, covariance matrix, genre name); only
        # the mean and the genre are used.
        vectors, genre_ids = [], []
        for mean, _cov_mat, genre_name in feature_set:
            vectors.append(mean.tolist())
            genre_ids.append(tf.get_genre_ID(genre_name))
        return np.array(vectors), np.array(genre_ids)

    training_set_features = tf.read_features_from_files(
        "../../music/training", genres, features_type)
    testing_set_features = tf.read_features_from_files(
        "../../music/testing", genres, features_type)

    training_data, training_class = to_arrays(training_set_features)
    testing_data, testing_class = to_arrays(testing_set_features)

    clf = OneVsOneClassifier(SVC(kernel='linear'))
    clf.fit(training_data, training_class)
    result_class = np.array(clf.predict(testing_data))

    rt.print_accuracy(list(testing_class), list(result_class), genres,
                      features_type, "svm")
    rt.write_accuracy_to_file("../../music/", list(testing_class),
                              list(result_class), genres, features_type,
                              "svm")
def test_ovo_decision_function():
    """Check shape, vote content and tie resolution of OvO decisions."""
    n_samples = iris.data.shape[0]
    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))

    # first binary
    ovo_clf.fit(iris.data, iris.target == 0)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples,))

    # then multi-class
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)
    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes: estimator k handles the k-th (i, j) pair with i < j
    class_pairs = [(i, j)
                   for i in range(n_classes)
                   for j in range(i + 1, n_classes)]
    votes = np.zeros((n_samples, n_classes))
    for k, (i, j) in enumerate(class_pairs):
        pred = ovo_clf.estimators_[k].predict(iris.data)
        votes[pred == 0, i] += 1
        votes[pred == 1, j] += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    allowed_vote_levels = {0., 1., 2.}
    for class_idx in range(n_classes):
        # For each sample and each class, there only 3 possible vote levels
        # because they are only 3 distinct class pairs thus 3 distinct
        # binary classifiers.
        # Therefore, sorting predictions based on votes would yield
        # mostly tied predictions:
        assert_true(set(votes[:, class_idx]).issubset(allowed_vote_levels))

        # The OVO decision function on the other hand is able to resolve
        # most of the ties on this data as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert_greater(len(np.unique(decisions[:, class_idx])), 146)
def trainOneVsOne2(histograms):
    """Scale the histogram features and fit a one-vs-one sigmoid-kernel SVC.

    Returns a dict with the scaling parameters under ``'scaleParam'`` and the
    fitted classifier under ``'svm'``.
    """
    features = convertToSvmFormatFeature(histograms)
    scaleParam = computeScaleParameters(features)
    # Return value is ignored, as in the original call site.
    scaleFeatureData(features, scaleParam)
    features = np.array(features)

    labels = np.array([entry['label'] for entry in histograms])

    # Alternative configurations that were tried:
    # svm = OneVsOneClassifier(LinearSVC(random_state=0,dual=svm_conf['dual'],C=svm_conf['C']))
    # svm = OneVsOneClassifier(sklearn.svm.SVC(C=100, gamma=10*gammaBase,kernel='rbf'))
    gammaBase = 1.0 / kmeans_conf['K']
    base_svc = sklearn.svm.SVC(C=1000, gamma=gammaBase, kernel='sigmoid')
    classifier = OneVsOneClassifier(base_svc)
    classifier.fit(features, labels)

    return {'scaleParam': scaleParam, 'svm': classifier}
def test_ovo_ties2():
    """Ties can be won by any label, not only by the first two."""
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # cycle through labels so that each label wins once
    for shift in range(3):
        y_shifted = (y_ref + shift) % 3
        multi_clf = OneVsOneClassifier(Perceptron())
        multi_clf.fit(X, y_shifted)
        ovo_prediction = multi_clf.predict(X)
        assert_equal(ovo_prediction[0], shift % 3)
def test_multicluster(self):
    """A one-vs-one wrapped binary Tilo classifier recovers three classes."""
    binary = BinaryTiloClassifier(PinchRatioCutStrategy(), similarity.Gaussian())
    ##binary = BinaryTiloClassifier(similarity.KNN())
    ##mcc = OneVsRestClassifier(binary)
    mcc = OneVsOneClassifier(binary)

    points = self.three_class_pts
    labels = self.three_class_labels
    # Add a small random offset to every coordinate before fitting
    jittered = points + 0.01 * np.random.random(points.shape)

    guesses = mcc.fit(jittered, labels).predict(jittered)
    assert_array_equal(guesses, labels)
def learn(cat1, cat2, cat3):
    """Fit a one-vs-one SVC that separates documents of three categories.

    Documents of ``cat1``, ``cat2`` and ``cat3`` receive labels 0, 1 and 2.
    Returns the list ``[clf, IDF]``.
    """
    IDF = get_IDF([cat1, cat2, cat3])

    vectors = []
    labels = []
    for label, documents in enumerate((cat1, cat2, cat3)):
        for document in documents:
            vectors.append(MapToEvalVS(document, IDF))
            labels.append(label)

    X = np.array(vectors)
    Y = np.array(labels)

    # Other estimators that were tried:
    #clf = svm.SVC(verbose=True)
    #clf=svm.SVC()
    #clf=KNeighborsClassifier(weights='distance')
    clf = OneVsOneClassifier(svm.SVC())
    clf.fit(X, Y)
    return [clf, IDF]
def analysis(self, testanalysis=True):
    """Train a one-vs-one LinearSVC on phrase n-grams and run it on test data.

    Args:
        testanalysis: when true, data comes from ``getTrainTestData`` and the
            result goes to ``normalexecution``; otherwise data comes from
            ``getRealData`` and the result goes to ``writeToFile``.
    """
    if testanalysis:
        trainingdata, testdata = self.getTrainTestData()
    else:
        trainingdata, testdata = self.getRealData()

    # Map each stripped phrase to its sentiment; a phrase that occurs more
    # than once keeps the sentiment seen last.
    aDict = {}
    for value in trainingdata:
        aDict[value.Phrase.strip()] = value.Sentiment

    # Materialise the dict views: on Python 3, np.asarray() of a dict view
    # yields a 0-d object array instead of a label vector. On Python 2 these
    # are already lists, so list() changes nothing there.
    _all_values = list(aDict.keys())
    _all_sentiments = list(aDict.values())

    # self.KFOLDTEST(np.asarray(_all_values), np.asarray(_all_sentiments))
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                       tokenizer=self.tokenize_data)
    count = count_vectorizer.fit_transform(_all_values)
    # self.countWordFreq(count_vectorizer, count)

    tfidf = TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)
    data = tfidf.fit_transform(count)

    classfier = OneVsOneClassifier(LinearSVC())
    classfier.fit(data, np.asarray(_all_sentiments))

    # With testanalysis false the predictions are written out via
    # writeToFile using the data returned by getRealData.
    if testanalysis:
        self.normalexecution(testdata, count_vectorizer, tfidf, classfier)
    else:
        self.writeToFile(testdata, count_vectorizer, tfidf, classfier)
def test_ovo_ties():
    """Ties are broken using the decision function, not the smallest label."""
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])

    multi_clf = OneVsOneClassifier(Perceptron())
    multi_clf.fit(X, y)
    ovo_prediction = multi_clf.predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2
    # Use decision_function to compute the votes and the normalized
    # sum_of_confidences, which is used to disambiguate when there is a tie in
    # votes.
    votes = np.round(ovo_decision)
    normalized_confidences = ovo_decision - votes

    # For the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # For the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # For the tie, the prediction is the class with the highest score
    best_by_confidence = normalized_confidences[0].argmax()
    assert_equal(ovo_prediction[0], best_by_confidence)
from sklearn import svm, datasets
from sklearn.multiclass import OneVsOneClassifier

# Instantiate SVC() (replaced by the LinearSVC-based model further down)
clf = svm.SVC()

# Load the iris dataset
iris = datasets.load_iris()
print(iris)
X, y = iris.data, iris.target

# Build a multiclass classifier on top of the binary classifier
clf = svm.LinearSVC(random_state=0)
clf = OneVsOneClassifier(clf)

# Train the model
clf.fit(X, y)

# Predict the training samples
y_pred = clf.predict(X)

n_correct = (y == y_pred).sum()
n_wrong = (y != y_pred).sum()
print('预测正确的个数:%d,预测错误的个数:%d' % (n_correct, n_wrong))
clf = OneVsOneClassifier(LinearSVC(C=1.0, random_state=0)) pred = cross_val_predict(clf, features, target, cv=30, n_jobs=-1) print( classification_report(target, pred, target_names=train_val_data.label.unique())) from sklearn.ensemble import RandomForestClassifier clf = RandomForestClassifier(min_samples_split=5, random_state=0, n_estimators=100, n_jobs=-1, verbose=1, class_weight="balanced") clf.fit(features, target) val_features = validation_data.drop(['label'], axis=1).values val_target = validation_data.label.values predicted = clf.predict(val_features) print( classification_report(val_target, predicted, target_names=train_val_data.label.unique())) test_features = test_data.drop(['label'], axis=1).values test_target = test_data.label.values predicted = clf.predict(test_features) print( classification_report(test_target, predicted,
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary classifiers
    (one-versus-rest by default, optionally one-versus-one) are fitted. Note
    that this class thus does not implement a true multi-class Laplace
    approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : int, RandomState instance or None, optional (default: None)
        Random state forwarded to the underlying binary classifier.
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    multi_class : string, default : "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the fitted binary classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """

    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values (class labels). At least two distinct classes are
            required; more than two are handled according to ``multi_class``.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If ``y`` contains a single class, or ``multi_class`` is not one
            of the supported modes when more than two classes are present.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes; got %d class (only class %s "
                             "is present)"
                             % (self.n_classes_, self.classes_[0]))
        if self.n_classes_ > 2:
            # Wrap the binary model in the requested multi-class meta-estimator
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            # Multi-class: report the mean over all fitted binary classifiers
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.

        Raises
        ------
        ValueError
            If more than two classes were fitted in "one_vs_one" mode.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        """Kernel used for prediction.

        The binary estimator's kernel for two classes, otherwise a
        CompoundKernel built from the kernels of all fitted binary
        classifiers.
        """
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the fitted binary classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernel get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.

        Raises
        ------
        ValueError
            If ``eval_gradient`` is True while ``theta`` is None, or if the
            length of ``theta`` matches neither a single kernel nor the
            compound kernel.
        NotImplementedError
            If a gradient is requested for a multi-class model.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            # The compound kernel holds one sub-kernel per fitted binary
            # classifier. That equals the number of classes for one-vs-rest
            # but is n_classes * (n_classes - 1) / 2 for one-vs-one, so the
            # expected length is derived from the estimators themselves.
            n_estimators = len(estimators)
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for estimator in estimators])
            elif theta.shape[0] == n_dims * n_estimators:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * n_estimators,
                                    theta.shape[0]))
# Regression Module Testing
def Build_Data_Set(features = ["citation2yrs","RDI","rcount","auth_prod_avg","authprod_max","auth_div_avg","auth_div_max","auth_hindex_avg","auth_hindex_max","auth_soc_avg","auth_soc_max","team"]):
    """Load the training partition (``paper_year`` <= 1995) of the citation data.

    Args:
        features: column names used as the feature matrix.

    Returns:
        ``(X, y)``: the feature array and the list of ``paper_cat`` labels,
        with rows in a random order.
    """
    # read_csv(index_col=0, parse_dates=True) is the documented replacement
    # for the removed DataFrame.from_csv.
    citation_data = pd.read_csv("./workspace/SVR.txt", index_col=0, parse_dates=True)
    citation_data = citation_data.loc[citation_data['paper_year'] <= 1995]
    # Keep the shuffled frame; previously the permuted result was discarded,
    # so the rows were never actually shuffled.
    citation_data = citation_data.iloc[np.random.permutation(len(citation_data))]
    X = np.array(citation_data[features].values)
    #X = preprocessing.scale(X)
    y = (citation_data["paper_cat"].values.tolist())
    return X, y


X, y = Build_Data_Set()
clf = OneVsOneClassifier(SVC(random_state=0, kernel='rbf'))
print("Learning started")
clf.fit(X, y)

# Saving the classifier
import pickle
# The context manager closes the file even if pickling fails.
with open("./workspace/rbfsvr.pickle", "wb") as save_classifier:
    pickle.dump(clf, save_classifier)


# testing
def Build_Data_Set(features = ["citation2yrs","RDI","rcount","auth_prod_avg","authprod_max","auth_div_avg","auth_div_max","auth_hindex_avg","auth_hindex_max","auth_soc_avg","auth_soc_max","team"]):
    # Test partition: papers from 1996 onwards.
    # NOTE(review): in the reviewed source this redefinition stops after
    # building X and has no return statement - confirm against the full file.
    citation_data = pd.read_csv("./workspace/SVR.txt", index_col=0, parse_dates=True)
    citation_data = citation_data.loc[citation_data['paper_year'] >= 1996]
    citation_data = citation_data.iloc[np.random.permutation(len(citation_data))]
    X = np.array(citation_data[features].values)
def fit(self, X, y):
    """Store the targets, fit ``self.nbrs`` and then run the one-vs-one fit.

    Returns whatever ``OneVsOneClassifier.fit`` returns.
    """
    self._fit_y = y
    self.nbrs.fit(X, y)
    fitted = OneVsOneClassifier.fit(self, X, y)
    return fitted
class EveryWordOneFeature(object):
    """Two-stage classifier: a country model plus one city model per country.

    Both stages are one-vs-one SVCs configured with the same kernel type,
    slack (``C``) and ``gamma``. Progress and timings are printed.
    """

    def __init__(self, slack=1, gamma=1, kernelType='linear', gram=1):
        self.gram = gram
        self.slack = slack
        self.gamma = gamma
        self.kernelType = kernelType
        # Placeholder until fit() stores the real training data
        self.data = np.ones((1000, 1000))
        self.cityClassifier = {}
        # TODO: Why OneVsOne rather than OneVsRest? Wouldn't OneVsRest be faster?
        self.countryClassifier = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma,
                    probability=False, cache_size=1000))
        self.bag = None
        self.numberOfFeatures = 0
        # Features and labels
        self.fitting_data = None
        self.predict_data = None
        self.cityPrediction = {}
        self.countryPrediction = None
        self.numberOfCityFeatures = {}

    def fit_cities(self, trainingData, labels, countryCode):
        """Fit and store the city classifier for ``countryCode``."""
        print("Start fitting cities for country " + str(countryCode))
        # TODO: Why OneVsOne rather than OneVsRest? Wouldn't OneVsRest be faster?
        self.cityClassifier[countryCode] = OneVsOneClassifier(
            svm.SVC(kernel=self.kernelType, C=self.slack, gamma=self.gamma,
                    probability=False))
        start = time.time()
        n_city_features = self.get_number_of_city_features(countryCode)
        self.cityClassifier[countryCode].fit(trainingData[:, :n_city_features],
                                             labels)
        end = time.time()
        print("Finished fitting cities in " + str((end - start)) + "s")

    def fit_countries(self):
        """Fit the country classifier on ``self.fitting_data``.

        The first ``numberOfFeatures`` columns are the features; the column
        at index ``numberOfFeatures + 1`` supplies the labels.
        """
        print("Start Fitting countries")
        start = time.time()
        self.countryClassifier.fit(
            self.fitting_data[:, :self.numberOfFeatures],
            self.fitting_data[:, (self.numberOfFeatures + 1)])
        end = time.time()
        print("Finished fitting countries in " + str((end - start)) + "s")

    def preprocess_training_data(self, data):
        """Build the bag of words for ``data`` and cache features and labels."""
        startOfPreprocessing = time.time()
        print("Start Preprocessing")
        lengthOfTrainingData = self.data.shape[0]
        print("length of trainingData = " + str(lengthOfTrainingData))
        self.bag = BagOfWords(data)
        self.fitting_data = self.bag.get_features_and_labels()
        # All columns except the last two are treated as features
        self.numberOfFeatures = self.fitting_data.shape[1] - 2
        endOfPreprocessing = time.time()
        print("Finished Preprocessing in "
              + str((endOfPreprocessing - startOfPreprocessing)) + "s")

    def fit(self, data):
        """Preprocess ``data`` and fit the country classifier."""
        self.data = data
        self.preprocess_training_data(data)
        self.numberOfFeatures = self.fitting_data.shape[1] - 2
        self.fit_countries()

    def predict_cities(self, data, countryCode):
        """Predict cities for ``data`` with the model stored for ``countryCode``."""
        print("Start predict cities")
        start = time.time()
        print(data[:, :self.numberOfFeatures])
        print(self.cityPrediction)
        n_city_features = self.get_number_of_city_features(countryCode)
        self.cityPrediction[countryCode] = \
            self.cityClassifier[countryCode].predict(data[:, :n_city_features])
        end = time.time()
        print("Finished predicting cities in " + str((end - start)) + "s")

    def predict_countries(self):
        """Predict countries for ``self.predict_data`` and store the result."""
        start = time.time()
        print("start predicting countries")
        print(self.predict_data)
        self.countryPrediction = self.countryClassifier.predict(
            self.predict_data[:, :self.numberOfFeatures])
        end = time.time()
        print("finished predicting countries in " + str((end - start)) + "s")

    def preprocess_predict_data(self, predict):
        """Turn ``predict`` into features using the fitted bag of words."""
        self.predict_data = self.bag.get_get_validation_features(predict)

    def get_city_featuers(self, data, countryCode):
        """Return an all-zero ``(len(data), 3)`` feature matrix.

        ``countryCode`` is accepted but not used.
        """
        return np.zeros((data.shape[0], 3))

    def predict(self, predict):
        """Predict a ``(city, country)`` pair for every row of ``predict``.

        Countries are predicted first. A city classifier is then fitted for
        every country code in the training data, and cities are predicted
        per predicted country.

        Returns:
            Array with one row per sample: city prediction, country prediction.
        """
        self.preprocess_predict_data(predict)
        self.numberOfFeatures = self.predict_data.shape[1]
        # t1 = threading.Thread(target=self.predict_cities)
        self.predict_countries()
        joinedCityPredictions = np.zeros(predict.shape[0])

        # Fit one city model per country present in the training data
        trainingCountryCodes = self.data[:, 2].astype(int)
        for countryCode in np.unique(trainingCountryCodes):
            countryIndices = np.where(trainingCountryCodes == countryCode)[0]
            countryRows = self.data[countryIndices]
            self.fit_cities(
                self.get_city_featuers(countryRows[:, 0], countryCode),
                countryRows[:, 1], countryCode)

        # Predict cities separately for each predicted country
        for countryCode in np.unique(self.countryPrediction):
            countryIndices = np.where(self.countryPrediction == countryCode)[0]
            self.predict_cities(
                self.get_city_featuers(predict[countryIndices], countryCode),
                countryCode)
            joinedCityPredictions[countryIndices] = self.cityPrediction[countryCode]

        prediction = np.vstack((joinedCityPredictions, self.countryPrediction)).T
        return prediction

    def get_number_of_city_features(self, cityCode):
        """Return the number of city features, which is always 3."""
        return 3
def main(): ####################################### # Saima Aman emotion blog data # replacing with our data global AddFeatures AddFeatures=["TEXTLEN","TITLELEN","STARS","VERIFIEDPURCHASE","BADGE","COMMENTS","FORMAT"] ourdatatuples = getmyxls() print "Length of ourdatatuples is: ", len(ourdatatuples) #shuffle(saimaDataTuples) print "saimaDataTuples", ourdatatuples[0] trainTuples=ourdatatuples#[:1000] #testTuples=saimaDataTuples[1000:] # ####################################### myData=getThreeColumnDataDict(ourdatatuples) #print(myData) print "lol: mydata " #print(myData) totalCount=sum([len(myData[k]) for k in myData]) print totalCount # del trainLines # print"*"*50 getDataStats(myData) # dataTuples=getLabeledDataTuples(myData) # #################################### # # Add first 1000 Saima tuples # #dataTuples=dataTuples+saimaDataTuples[:1000] # print dataTuples[0] # del myData ids, labels, vectors= getLabelsAndVectors(trainTuples) #print labels space=getSpace(vectors) print "Total # of features in your space is: ", len(space) # augment space with emotion features... 
space= augmentSpace(space, AddFeatures) #reducedSpace=getReducedSpace(vectors, space) print "Total # of features in your augmented space is: ", len(space) print "Predicted error" #print "Total # of features in your reducedSpace is: ", len(reducedSpace) oneHotVectors=getOneHotVectors(ids, labels, vectors, space) print(oneHotVectors[0]) vectors, labels=getOneHotVectorsAndLabels(oneHotVectors) del oneHotVectors trainVectors = vectors trainLabels = labels #trainLabels.fit_transform([('H','NH')]) #trainLabels = preprocessing.label_binarize(trainLabels,classes=[unicode("H"),unicode("NH")]) #del vectors #del labels #C, gamma = getCAndGamma(trainVectors, trainLabels, kernel = 'rbf') # Train classifier clf = OneVsOneClassifier(SVC(kernel='linear', class_weight='auto', verbose= True, probability=True)) #clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear', gamma=1, verbose= False, probability=False)) clf.fit(trainVectors, trainLabels) print "\nDone fitting classifier on training data...\n" #testVectors = vectors[200:250] #testLabels = labels[200:250] #predicted_testLabels = clf.predict(testVectors) #print "Done predicting on DEV data...\n" #print "classification_report:\n", classification_report(testLabels, predicted_testLabels)#, target_names=target_names) #print "accuracy_score:", round(accuracy_score(testLabels, predicted_testLabels), 2) #del trainVectors #del trainLabels # saimaDataTuples=getSAIMAThreeColumnFormat() # print "Length of saimaDataTuples is: ", len(saimaDataTuples) # shuffle(saimaDataTuples) # print "saimaDataTuples", saimaDataTuples[0] # ids, labels, vectors= getLabelsAndVectors(testTuples) # oneHotVectors=getOneHotVectors(ids, labels, vectors, space) # vectors, labels=getOneHotVectorsAndLabels(oneHotVectors) # del oneHotVectors # testVectors = vectors # testLabels = labels # predicted_testLabels = clf.predict(testVectors) #------------------------------------------------------------------------------------------ print "="*50, "\n" print "Results with 
5-fold cross validation:\n" print "="*50, "\n" #------------------------------------------------------------------------------------------ predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=5) print "*"*20 print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted) print "*"*20 print "precision_score\t", metrics.precision_score(trainLabels, predicted,pos_label=unicode("H"),average='binary') print "recall_score\t", metrics.recall_score(trainLabels, predicted,pos_label=unicode("H"),average='binary') print "\nclassification_report:\n\n", metrics.classification_report(trainLabels, predicted) print "\nconfusion_matrix:\n\n", metrics.confusion_matrix(trainLabels, predicted) '''#"------------------------------------------------------------------------------------------
# Encode the raw table `x` (defined earlier in the script) column by column.
# A column is copied as-is when its first-row value is all digits; otherwise it
# is passed through its own LabelEncoder, which is kept in `label_encoder` in
# column order.
label_encoder = []
x_encoded = np.empty(x.shape)
for column, first_value in enumerate(x[0]):
    if first_value.isdigit():
        x_encoded[:, column] = x[:, column]
        continue
    label_encoder.append(preprocessing.LabelEncoder())
    encoder = label_encoder[-1]
    x_encoded[:, column] = encoder.fit_transform(x[:, column])

# The last column is the target; everything before it is a feature.
x = x_encoded[:, :-1].astype(int)
y = x_encoded[:, -1].astype(int)

# Create the SVM classifier and train it on the full data set
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(x, y)

# Hold out 20% of the rows, retrain a fresh classifier on the rest and
# predict the held-out rows
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=5)
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(x_train, y_train)
y_test_pred = classifier.predict(x_test)

# Compute the weighted F1 score with 3-fold cross validation on all rows
f1 = cross_val_score(classifier, x, y, scoring='f1_weighted', cv=3)
f1_percent = round(100 * f1.mean(), 2)
print("F1 Score: " + str(f1_percent) + '%')

# Predict output for a test datapoint
def plot_classification():
    """Plot one-vs-one and one-vs-rest linear SVM decision regions on iris.

    Only the first two iris features are used. Each classifier is fitted and
    evaluated on a mesh covering the data range (padded by 1 on every side),
    drawn in its own subplot together with the training points, and the figure
    is saved to ``images/svm_linear_1v1_1va.png`` before being closed.
    """
    # modified http://scikit-learn.org/stable/auto_examples/svm/plot_iris.html#sphx-glr-auto-examples-svm-plot-iris-py
    iris = datasets.load_iris()
    X = iris.data[:, :2]
    y = iris.target  # only want to classify what type of iris it is

    C = 1.0  # SVM regularization parameter
    h = .02  # step size in the mesh

    classifiers = (
        OneVsOneClassifier(SVC(kernel='linear', C=C)),
        OneVsRestClassifier(LinearSVC(C=C)),
    )

    x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))

    titles = ['One-vs-One', 'One-vs-All']
    plt.suptitle('Linear SVM')

    for panel, (model, title) in enumerate(zip(classifiers, titles)):
        # Plot the decision boundary: every mesh point in
        # [x_min, x_max]x[y_min, y_max] is coloured by its predicted class.
        plt.subplot(1, 2, panel + 1)
        plt.subplots_adjust(wspace=0.4, hspace=0.4)

        mesh_points = np.c_[xx.ravel(), yy.ravel()]
        regions = model.fit(X, y).predict(mesh_points)

        # Put the result into a color plot
        regions = regions.reshape(xx.shape)
        plt.contourf(xx, yy, regions, cmap=plt.cm.coolwarm, alpha=0.8)

        # Plot also the training points
        plt.scatter(X[:, 0], X[:, 1], c=y, cmap=plt.cm.coolwarm)
        plt.xlabel('Sepal length')
        plt.ylabel('Sepal width')
        plt.xlim(xx.min(), xx.max())
        plt.ylim(yy.min(), yy.max())
        plt.xticks(())
        plt.yticks(())
        plt.title(title)

    plt.savefig('images/svm_linear_1v1_1va.png')
    plt.close()
# # Prepare the cross-validation procedure
# from sklearn.model_selection import KFold
# from sklearn.model_selection import cross_val_score
# cv = KFold(n_splits=10, random_state=1, shuffle=True)
# scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# print('KFold Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# Prepare the repeated cross-validation procedure
# from sklearn.model_selection import RepeatedKFold
# from sklearn.model_selection import cross_val_score
# cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# scores = cross_val_score(model, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1)
# print('Repeated KFold Accuracy: %.3f (%.3f)' % (np.mean(scores), np.std(scores)))

# Fit the model on the training split and predict the test split
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Side-by-side table: column 0 = predicted label, column 1 = true label
predicted_column = y_pred.reshape(len(y_pred), 1)
expected_column = y_test.reshape(len(y_test), 1)
y_comp = pd.DataFrame(np.concatenate((predicted_column, expected_column), 1))

# Confusion matrix, drawn as a heatmap of fractions of all test samples
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
fig, ax = plt.subplots(figsize=(7, 4))
cm = confusion_matrix(y_test, y_pred)
xlabel = ['login', 'setting']
cm_fraction = cm / np.sum(cm)
sns.heatmap(
    cm_fraction,
    annot=True,
    fmt='.0%',
    cmap='Blues',
    ax=ax,
    vmin=0,
    linewidths=1,
    xticklabels=xlabel,
    yticklabels=xlabel,
)
plt.savefig('test.pdf')

# Overall accuracy plus per-class precision, recall and F1 (average=None)
acc = accuracy_score(y_test, y_pred)
prec, rec, f1 = [
    scorer(y_test, y_pred, average=None)
    for scorer in (precision_score, recall_score, f1_score)
]
# Scatter the first two columns of X, one series per class label
for class_label in (0, 1, 2):
    members = y == class_label
    plt.scatter(X[members, 0], X[members, 1])
plt.show()

# Use all of the data
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

# OvR: logistic regression with its default settings
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
ovr_default_score = log_reg.score(X_test, y_test)
print(ovr_default_score)

# Labelled "OvO" in the original notes; the model below is configured as
# multinomial logistic regression with the newton-cg solver
log_reg2 = LogisticRegression(multi_class="multinomial", solver="newton-cg")
log_reg2.fit(X_train, y_train)
multinomial_score = log_reg2.score(X_test, y_test)
print(multinomial_score)

# Other scikit-learn support for OvR and OvO: generic wrappers around any
# binary classifier
ovr = OneVsRestClassifier(log_reg)
ovr.fit(X_train, y_train)
print(ovr.score(X_test, y_test))

ovo = OneVsOneClassifier(log_reg)
ovo.fit(X_train, y_train)
print(ovo.score(X_test, y_test))
# Index of the highest decision score (notebook cell output)
np.argmax(some_digit_scores)


# In[25]:

# Class labels known to the fitted SGD classifier
SGD_C.classes_


# In[26]:

# one-versus-one and one-versus-all
from sklearn.multiclass import OneVsOneClassifier

# Wrap a binary SGD classifier so that one estimator is trained per class pair
pairwise_base = SGDClassifier(random_state=42)
OvO_C = OneVsOneClassifier(pairwise_base)
OvO_C.fit(X_train, Y_train)
OvO_C.predict([some_digit])


# In[27]:

# Number of pairwise estimators that were trained
len(OvO_C.estimators_)


# In[28]:

# Random forest: fit on all classes and inspect the class probabilities
forest_clf.fit(X_train, Y_train)
forest_clf.predict_proba([some_digit])
# NOTE(review): the first statement is the tail of a training-data loop whose
# header (and therefore its indentation) precedes this chunk; `fold`, `depth`,
# `train_index`, `test_index` and the initial arrays are defined outside this
# view as well. Restore the surrounding indentation when splicing this in.
train_label = np.append(
    train_label,
    np.load('macro-mapping/' + str(train_index[i]) + '.npy'),
    axis=0)

# Append the remaining test files; index 0 is skipped here, presumably because
# it seeded `test_data` / `test_label` before this chunk -- TODO confirm.
for i in range(1, len(test_index)):
    temp = np.load('alpha_carbon/' + str(test_index[i]) + '.npy')
    # Reuse the array that was just loaded instead of reading the same file
    # from disk a second time.
    test_data = np.append(test_data, temp, axis=0)
    del temp
    test_label = np.append(
        test_label,
        np.load('macro-mapping/' + str(test_index[i]) + '.npy'),
        axis=0)

# One-vs-one ensemble of random forests, limited to the current tree depth
clf = OneVsOneClassifier(
    RandomForestClassifier(n_estimators=50, max_depth=depth, random_state=0))
clf.fit(train_data, train_label)

# Accuracy = fraction of samples whose predicted label equals the true label
train_accuracy = np.sum(clf.predict(train_data) == train_label) / len(train_label)
test_accuracy = np.sum(clf.predict(test_data) == test_label) / len(test_label)
print('Fold: %d Depth %d Train Accu: %.3f Test Accu: %.3f' %
      (fold, depth, train_accuracy, test_accuracy))

# Release the large arrays before the model is serialized
del train_data, test_data, train_label, test_label

## save model
from sklearn.externals import joblib
joblib.dump(clf, 'ovo-randomforest/' + str(fold) + "_" + str(depth) + ".pkl")
def main(): ####################################### # Saima Aman emotion blog data # replacing with our data global AddFeatures AddFeatures = [ "TEXTLEN", "TITLELEN", "STARS", "VERIFIEDPURCHASE", "BADGE", "COMMENTS", "FORMAT" ] ourdatatuples = getmyxls() print "Length of ourdatatuples is: ", len(ourdatatuples) #shuffle(saimaDataTuples) print "saimaDataTuples", ourdatatuples[0] trainTuples = ourdatatuples #[:1000] #testTuples=saimaDataTuples[1000:] # ####################################### myData = getThreeColumnDataDict(ourdatatuples) #print(myData) print "lol: mydata " #print(myData) totalCount = sum([len(myData[k]) for k in myData]) print totalCount # del trainLines # print"*"*50 getDataStats(myData) # dataTuples=getLabeledDataTuples(myData) # #################################### # # Add first 1000 Saima tuples # #dataTuples=dataTuples+saimaDataTuples[:1000] # print dataTuples[0] # del myData ids, labels, vectors = getLabelsAndVectors(trainTuples) #print labels space = getSpace(vectors) print "Total # of features in your space is: ", len(space) # augment space with emotion features... 
space = augmentSpace(space, AddFeatures) #reducedSpace=getReducedSpace(vectors, space) print "Total # of features in your augmented space is: ", len(space) print "Predicted error" #print "Total # of features in your reducedSpace is: ", len(reducedSpace) oneHotVectors = getOneHotVectors(ids, labels, vectors, space) print(oneHotVectors[0]) vectors, labels = getOneHotVectorsAndLabels(oneHotVectors) del oneHotVectors trainVectors = vectors trainLabels = labels #trainLabels.fit_transform([('H','NH')]) #trainLabels = preprocessing.label_binarize(trainLabels,classes=[unicode("H"),unicode("NH")]) #del vectors #del labels #C, gamma = getCAndGamma(trainVectors, trainLabels, kernel = 'rbf') # Train classifier clf = OneVsOneClassifier( SVC(kernel='linear', class_weight='auto', verbose=True, probability=True)) #clf = OneVsRestClassifier(SVC(C=1, kernel = 'linear', gamma=1, verbose= False, probability=False)) clf.fit(trainVectors, trainLabels) print "\nDone fitting classifier on training data...\n" #testVectors = vectors[200:250] #testLabels = labels[200:250] #predicted_testLabels = clf.predict(testVectors) #print "Done predicting on DEV data...\n" #print "classification_report:\n", classification_report(testLabels, predicted_testLabels)#, target_names=target_names) #print "accuracy_score:", round(accuracy_score(testLabels, predicted_testLabels), 2) #del trainVectors #del trainLabels # saimaDataTuples=getSAIMAThreeColumnFormat() # print "Length of saimaDataTuples is: ", len(saimaDataTuples) # shuffle(saimaDataTuples) # print "saimaDataTuples", saimaDataTuples[0] # ids, labels, vectors= getLabelsAndVectors(testTuples) # oneHotVectors=getOneHotVectors(ids, labels, vectors, space) # vectors, labels=getOneHotVectorsAndLabels(oneHotVectors) # del oneHotVectors # testVectors = vectors # testLabels = labels # predicted_testLabels = clf.predict(testVectors) #------------------------------------------------------------------------------------------ print "=" * 50, "\n" print "Results 
with 5-fold cross validation:\n" print "=" * 50, "\n" #------------------------------------------------------------------------------------------ predicted = cross_validation.cross_val_predict(clf, trainVectors, trainLabels, cv=5) print "*" * 20 print "\t accuracy_score\t", metrics.accuracy_score(trainLabels, predicted) print "*" * 20 print "precision_score\t", metrics.precision_score(trainLabels, predicted, pos_label=unicode("H"), average='binary') print "recall_score\t", metrics.recall_score(trainLabels, predicted, pos_label=unicode("H"), average='binary') print "\nclassification_report:\n\n", metrics.classification_report( trainLabels, predicted) print "\nconfusion_matrix:\n\n", metrics.confusion_matrix( trainLabels, predicted) '''#"------------------------------------------------------------------------------------------
def main():
    """Walk through MNIST classification with scikit-learn, printing and plotting results.

    Steps, in order: download ``mnist_784``; train a binary "is it a 5?" SGD
    classifier and evaluate it (manual stratified folds, cross-validated
    accuracy, confusion matrix, precision / recall / F1, precision-recall and
    ROC curves, comparison with a random forest); train multiclass SGD,
    one-vs-one SGD and random-forest models; inspect the multiclass confusion
    matrix; finish with multilabel and multioutput k-NN examples.

    Metric results are stored under their own names (``precision``,
    ``conf_mx_5``, ...) so that the imported scikit-learn functions are never
    shadowed and stay callable further down.

    Returns nothing; everything is printed or shown with matplotlib.
    """
    # import the data
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784')
    # Convert to plain arrays so positional indexing such as x[36000] works
    # even when fetch_openml hands back pandas objects.
    x, y = np.asarray(mnist["data"]), np.asarray(mnist["target"])
    print(x.shape)
    print(y.shape)

    # show the image
    some_digit = x[36000]
    some_digit_image = some_digit.reshape(28, 28)
    plt.imshow(some_digit_image, cmap=matplotlib.cm.binary,
               interpolation="nearest")
    plt.axis("off")
    plt.show()

    # prepare the training/testing sets
    x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
    np.random.seed(3)
    shuffle_index = np.random.permutation(60000)
    x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

    # Binary Classifier
    # The labels are compared as strings here ('5', not 5).
    y_train_5 = (y_train == '5')  # True for all 5s
    y_test_5 = (y_test == '5')
    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(x_train, y_train_5)
    print(sgd_clf.predict([some_digit]))

    # implement Cross-Validation by hand
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    # No random_state: it has no effect without shuffle=True and newer
    # scikit-learn releases reject that combination.
    skfolds = StratifiedKFold(n_splits=3)
    for train_index, test_index in skfolds.split(x_train, y_train_5):
        clone_clf = clone(sgd_clf)
        # train clone on training folds, then predict on test fold
        x_train_folds = x_train[train_index]
        y_train_folds = y_train_5[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train_5[test_index]
        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))

    # evaluate the model with 'accuracy'
    from sklearn.model_selection import cross_val_score
    sgd_5_accuracy = cross_val_score(sgd_clf, x_train, y_train_5, cv=3,
                                     scoring="accuracy")
    print(sgd_5_accuracy)

    # see accuracy from a classifier that never predicts "5"
    from sklearn.base import BaseEstimator

    class Never5Classifier(BaseEstimator):
        def fit(self, x, y=None):
            return self

        def predict(self, x):
            return np.zeros((len(x), 1), dtype=bool)

    never_5_clf = Never5Classifier()
    never_5_clf_score = cross_val_score(never_5_clf, x_train, y_train_5, cv=3,
                                        scoring="accuracy")
    print(never_5_clf_score)

    # evaluate the model with 'confusion matrix'
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import confusion_matrix
    y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
    conf_mx_5 = confusion_matrix(y_train_5, y_train_pred)
    print(conf_mx_5)

    # precision and recall
    from sklearn.metrics import precision_score, recall_score, f1_score
    precision = precision_score(y_train_5, y_train_pred)
    recall = recall_score(y_train_5, y_train_pred)
    f1 = f1_score(y_train_5, y_train_pred)
    print(precision)
    print(recall)
    print(f1)  # f1 score is the harmonic mean of precision and recall

    # precision vs recall trade-off
    from sklearn.metrics import precision_recall_curve

    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        # precisions/recalls are sliced with [:-1] to line up with thresholds
        plt.plot(thresholds, precisions[:-1], "b--", label="precision")
        plt.plot(thresholds, recalls[:-1], "g-", label="recall")
        plt.xlabel("Threshold", fontsize=16)
        plt.legend(loc="upper left", fontsize=16)
        plt.ylim([0, 1])

    def plot_precision_vs_recall(precisions, recalls):
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("recall", fontsize=16)
        plt.ylabel("precision", fontsize=16)
        plt.axis([0, 1, 0, 1])

    # Decision scores of the binary "5" detector: the targets must be
    # y_train_5, the same labels the curves below are computed against.
    y_scores = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3,
                                 method="decision_function")
    if y_scores.ndim == 2:
        # work around the "extra first dimension" issue
        y_scores = y_scores[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plot_precision_vs_recall(precisions, recalls)
    plt.show()

    # manually set the threshold
    y_train_pred_90 = (y_scores > 70000)
    precision_90 = precision_score(y_train_5, y_train_pred_90)
    recall_90 = recall_score(y_train_5, y_train_pred_90)
    print("precision_score=", precision_90)
    print("recall_score=", recall_90)

    # ROC curve
    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

    plot_roc_curve(fpr, tpr)
    plt.show()

    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(random_state=42)
    # The forest is scored through predict_proba (it has no decision_function)
    y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_5, cv=3,
                                        method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1]  # probability of the positive class
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5,
                                                          y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "random Forest")
    plt.legend(loc="lower right")
    plt.show()
    forest_auc = roc_auc_score(y_train_5, y_scores_forest)
    print(forest_auc)

    # Multiclass classification
    sgd_clf.fit(x_train, y_train)  # train the model on all classes
    sgd_clf.predict([some_digit])
    some_digit_score = sgd_clf.decision_function([some_digit])  # one score per class
    print(some_digit_score)

    # OvO classifier
    from sklearn.multiclass import OneVsOneClassifier
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(x_train, y_train)
    print(ovo_clf.predict([some_digit]))

    forest_clf.fit(x_train, y_train)
    print(forest_clf.predict_proba([some_digit]))

    # here the score is for multiclass classification, i.e. against y_train
    sgd_clf_score = cross_val_score(sgd_clf, x_train, y_train, cv=3,
                                    scoring="accuracy")
    print(sgd_clf_score)

    # scaling can improve the accuracy of the model
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))
    sgd_clf_score = cross_val_score(sgd_clf, x_train_scaled, y_train, cv=3,
                                    scoring="accuracy")
    print(sgd_clf_score)

    # error analysis
    y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)  # row: actual, column: predicted
    print(conf_mx)
    plt.matshow(conf_mx, cmap=plt.cm.gray)  # show the matrix as an image
    plt.show()
    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums  # error counts -> error rates
    np.fill_diagonal(norm_conf_mx, 0)  # keep only the errors
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # multilabel classification
    from sklearn.neighbors import KNeighborsClassifier
    # The numeric tests below need integer labels, not the string labels.
    y_train_digits = y_train.astype(np.uint8)
    y_train_large = (y_train_digits >= 7)
    y_train_odd = (y_train_digits % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]
    knn_clf = KNeighborsClassifier()
    knn_clf.fit(x_train, y_multilabel)
    print(knn_clf.predict([some_digit]))

    # multioutput classification: add noise to the images and learn to remove it
    import numpy.random as rnd
    # `size` must be one tuple, and each noise array must match its own set.
    noise1 = rnd.randint(0, 100, (len(x_train), 784))
    noise2 = rnd.randint(0, 100, (len(x_test), 784))
    x_train_mod = x_train + noise1
    x_test_mod = x_test + noise2
    y_train_mod = x_train
    y_test_mod = x_test
    knn_clf.fit(x_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([x_test_mod[1]])
    # NOTE(review): plot_digit is not defined in this function; it is assumed
    # to exist at module level -- confirm.
    plot_digit(clean_digit)
class perceptronMOM(BaseEstimator):
    '''Perceptron MOM classifier.

    Perceptron MOM risk minimization. The Perceptron minimize the perceptron
    loss using SGD without regularization.

    Parameters
    ----------

    w0 : array-like, length = n_features + 1, default ones(n_features + 1)
        initial coefficients (including the intercept) of the classifier.

    K : int, default 10
        number of blocks for the computation of the MOM. A big value of K
        deals with more outliers but small values of K are better for the
        performance when there are no outliers.

    eta0 : float, default 1
        step size parameter, the step size is defined as the i-th iteration
        by 1/(1+eta0*i).

    epoch : int, default 100
        number of iterations before the end of the algorithm.

    mu : float between 0 and 1, default 0.95
        coefficient in the momentum.

    agg : int, default 1
        number of runs of the algorithm on which we aggregate. One might want
        to decrease this number if the complexity is a problem.

    compter : boolean, default False
        used for outlier detection, if compter=True, the number of time each
        point is used in the algorithm will be recorded in the attribute
        "counts".

    progress : boolean, default False
        display a progress bar to monitor the algorithm on each run (agg > 1
        means several progress bar).

    verbose : boolean, default True
        display a message at the end of each run if agg > 1.

    multi : {'ovr','ovo'} , default 'ovr'
        method used to go from binary classification to multiclass
        classification. 'ovr' means "one vs the rest" and 'ovo' means
        "one vs one" .

    Attributes
    ----------

    w0 : array like, length = n_features + 1
        w0 is updated in the algorithm, provides with the final coefficients
        of the decision function.

    counts : array like, length = n_sampled
        the i-th element record the number of time the i-th element of the
        training dataset X has been used. Only if compter=True.

    clf : OneVsRestClassifier or OneVsOneClassifier
        the multiclass wrapper built around the binary classifier; every
        fit/predict call is delegated to it.

    Methods
    -------

    fit(X,y) : fit the model
        X : numpy matrix size = (n_samples,n_features)
        y : array like, length = n_samples

    predict(X) : predict the class of the points in X
        X : numpy matrix size = (n_samples,n_features)
        returns array-like, length = n_samples.

    predict_proba(X) : predict the probability that each point belong to each
        class. Delegated to the multiclass wrapper, so it is only available
        when that wrapper provides predict_proba.
        X : numpy matrix size = (n_samples,n_features)
        returns matrix, size = (n_samples,n_class)

    set_params(**params) : update the given constructor parameters, keep the
        current value of all the others, and rebuild the wrapped classifier.
    '''

    # Constructor parameter names, in signature order; used by set_params to
    # carry the current values over.
    _param_names = ('w0', 'K', 'eta0', 'epoch', 'mu', 'agg', 'compter',
                    'progress', 'verbose', 'multi')

    def __init__(self, w0=None, K=10, eta0=1, epoch=100, mu=0.95, agg=1,
                 compter=False, progress=False, verbose=True, multi='ovr'):
        binary_clf = perceptronMOM_binary(w0, K, eta0, epoch, mu, agg,
                                          compter, progress, verbose)
        # Store every constructor argument explicitly. The previous version
        # copied them out of the frame's locals via inspect, which is fragile
        # (the frame-locals mapping is not guaranteed to support pop()).
        self.w0 = w0
        self.K = K
        self.eta0 = eta0
        self.epoch = epoch
        self.mu = mu
        self.agg = agg
        self.compter = compter
        self.progress = progress
        self.verbose = verbose
        self.multi = multi
        # Kept because the frame-locals copy also exposed this attribute.
        self.binary_clf = binary_clf
        if multi == "ovr":
            self.clf = OneVsRestClassifier(binary_clf)
        elif multi == "ovo":
            self.clf = OneVsOneClassifier(binary_clf)
        else:
            raise NameError('Multiclass meta-algorithm not known')

    def fit(self, X, y):
        '''Fit the wrapped multiclass classifier on (X, y) and return self.'''
        self.clf.fit(X, y)
        return self

    def predict(self, X):
        '''Return the class predicted by the wrapped classifier for each row of X.'''
        return self.clf.predict(X)

    def predict_proba(self, X):
        '''Return the wrapped classifier's predict_proba(X).'''
        return self.clf.predict_proba(X)

    def score(self, X, y):
        '''Return the fraction of rows of X whose prediction equals y.'''
        return np.mean(self.predict(X) == y)

    def set_params(self, **params):
        '''Update the given parameters and rebuild the wrapped classifier.

        Parameters that are not passed keep their current value; previously
        they were silently reset to the constructor defaults. An unknown
        parameter name still raises TypeError from the constructor.
        '''
        merged = {name: getattr(self, name) for name in self._param_names}
        merged.update(params)
        self.__init__(**merged)
        return self
# Precision-recall curve of the random forest on the "is it a 5?" task
precisions_forest, recalls_forest, threshold_forest = precision_recall_curve(
    y_train_5, y_scores_forest)
# problem with output
# NOTE(review): the disabled call originally passed `precision_forest`, which
# is not the name assigned above (`precisions_forest`) -- possibly the cause
# of the problem.
#plot_pr_curve(recalls_forest, precisions_forest)

# SGD OvA
sgd_clf.fit(X_train, y_train)  # use the OvA strategy
# `some_digits` and `sdg_clf` were misspellings of the names used everywhere
# else in this section (`some_digit`, `sgd_clf`).
sgd_clf.predict([some_digit])
some_digit_scores = sgd_clf.decision_function([some_digit])  # gives 10 scores, one for each class
np.argmax(some_digit_scores)  # the highest score is the class 5

# SGD OvO
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
ovo_clf.predict([some_digit])
len(ovo_clf.estimators_)

# Random Forest
forest_clf.fit(X_train, y_train)
forest_clf.predict([some_digit])
forest_clf.predict_proba([some_digit])

# evaluate with cross_val
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring='accuracy')
# 84% accuracy. A random one would have 10% (1/10 chances)

# Improve accuracy with scaling
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
# compare with chap. 2
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring='accuracy')
def _phase_fractions(y_predicted, y_true):
    '''Return (correct, incorrect, unclassified) fractions of the test samples.

    A prediction of -1 marks an unclassified sample; such samples count as
    neither correct nor incorrect.
    '''
    total = len(y_true)
    unclassified = y_predicted == -1
    mismatched = np.asarray(y_predicted != y_true)
    correct = float(np.sum(y_predicted == y_true)) / total
    incorrect = float(np.sum(mismatched & ~unclassified)) / total
    return correct, incorrect, float(np.sum(unclassified)) / total


def hybrid_classification_for_fold(x_train, x_test, y_train, y_test, estimator, scoring):
    '''
    Runs the hybrid classification algorithm for each fold.

    Features are min-max scaled to [0, 1] and the best two thirds (by
    `scoring`) are kept. Phase 1 fits a one-vs-rest ensemble and accepts a
    test sample only when exactly one of its estimators claims it. Phase 2
    fits a one-vs-one ensemble and uses it on the samples phase 1 left
    unclassified.

    Args:
        x_train, y_train: training features and labels.
        x_test, y_test: test features and labels.
        estimator: base binary estimator handed to both ensembles.
        scoring: score function handed to SelectKBest.

    Returns:
        numpy array [accuracy, correct after phase 1, correct after phase 2,
        incorrect after phase 1, incorrect after phase 2, unclassified after
        phase 1, unclassified after phase 2]; all entries are fractions of
        the test samples.
    '''
    scaler = MinMaxScaler(feature_range=(0, 1))
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)

    num_features = x_train.shape[1]
    # Floor division keeps k an integer on Python 3 as well as Python 2.
    fs = SelectKBest(scoring, k=2 * num_features // 3)
    x_train = fs.fit_transform(x_train, y_train)
    x_test = fs.transform(x_test)

    #############################
    # PHASE 1
    #############################
    ovr = OneVsRestClassifier(estimator, n_jobs=-1)
    ovr.fit(x_train, y_train)
    ovr_estimators = ovr.estimators_

    # dimensions: no. of estimators X no. of samples. each row is the output
    # of a particular estimator for all the samples we sent in
    y_predict_ovr = get_ovr_estimators_prediction(ovr_estimators, x_test)

    # dimensions: no. samples X no. ovr_estimators.
    # each row has the prediction of all ovr_estimators for a given sample.
    # remember that this is an OVR classification so each estimator fits one
    # class only. e.g.
    # [[0 0 0 0 0 0 0 0]  <- none of the ovr_estimators claimed this sample
    #  [0 0 0 1 0 0 0 0]  <- ovr_estimator 3 thinks this sample is its class
    #  [0 0 0 1 0 0 0 1]] <- ovr_estimator 3 and 7 both claim this sample
    sample_predictions_per_ovr_estimator = np.transpose(y_predict_ovr)

    test_indices_unclassified_in_phase1 = []
    # -1 is an invalid value. Denotes an unclassified sample.
    y_test_predict = np.ones(len(y_test)) * -1
    for index, sample_prediction in enumerate(sample_predictions_per_ovr_estimator):
        if np.sum(sample_prediction) == 1:
            # only one estimator's decision_function is +ve
            winner = np.nonzero(sample_prediction)[0][0]
            y_test_predict[index] = ovr.classes_[winner]
        else:
            test_indices_unclassified_in_phase1.append(index)

    (correct_after_phase1,
     incorrect_after_phase1,
     unclassified_after_phase1) = _phase_fractions(y_test_predict, y_test)

    #############################
    # PHASE 2
    #############################
    ovo = OneVsOneClassifier(estimator, n_jobs=-1)
    ovo.fit(x_train, y_train)
    ovo_estimators = ovo.estimators_

    for index in test_indices_unclassified_in_phase1:
        # The helper receives one sample as a (1, n_features) array.
        sample = np.reshape(x_test[index], (1, len(x_test[index])))
        y_predict_ovo = get_ovo_estimators_prediction(ovo_estimators,
                                                      ovo.classes_, sample)
        if y_predict_ovo != -1:
            y_test_predict[index] = y_predict_ovo

    (correct_after_phase2,
     incorrect_after_phase2,
     unclassified_after_phase2) = _phase_fractions(y_test_predict, y_test)

    accuracy_score = metrics.accuracy_score(y_test_predict, y_test)
    return np.array([accuracy_score,
                     correct_after_phase1, correct_after_phase2,
                     incorrect_after_phase1, incorrect_after_phase2,
                     unclassified_after_phase1, unclassified_after_phase2])
from sklearn import datasets from sklearn.multiclass import OneVsOneClassifier from sklearn.svm import LinearSVC iris = datasets.load_iris() data = [[1, 1, 1, 1], [2, 2, 2, 1], [3, 3, 3, 1], [4, 4, 4, 1], [1, 1, 1, 2], [1, 1, 1, 3], [1, 1, 1, 4]] classes = [1, 1, 1, 1, 2, 3, 4] X, y = iris.data, iris.target classifier = OneVsOneClassifier(LinearSVC(random_state=0)) print X print y classifier.fit(X, y) while True: to_predict = raw_input('Enter 4 numbers to predict: ') lst = to_predict.split() print lst new_lst = [] for num in lst: new_lst.append(float(num)) print classifier.predict([new_lst])
def _topic_proportion_matrix(texts, lda_models, dictionaries, topic_counts):
    """Return the topic weights of every document under every LDA model.

    For each (model, dictionary, topic count) triple a
    ``len(texts) x topic count`` block is filled with the weights the model
    assigns to each tokenized text; topics the model does not report stay 0.
    The blocks are concatenated column-wise in model order.
    """
    blocks = []
    for model, dictionary, n_topics in zip(lda_models, dictionaries, topic_counts):
        block = np.zeros((len(texts), n_topics))
        for row, text in enumerate(texts):
            # Indexing the model with a bag-of-words gives
            # (topic index, weight) pairs.
            for topic_idx, weight in model[dictionary.doc2bow(text)]:
                block[row, topic_idx] = weight
        blocks.append(block)
    return np.concatenate(blocks, axis=1)


def main():
    """Classify arXiv documents by subject using per-subject LDA topic features.

    One LDA model is trained per subject on the 2011 documents (one topic per
    100 documents). Every document is described by its topic weights under
    all eight models. One-vs-one Gaussian naive Bayes and linear SVM
    classifiers are trained on the 2011 features and tested on up to 100
    documents per subject from 2012; predictions are written to CSV files and
    the zero-one loss is printed.
    """
    # Subject keys in label order: label 1 is 'astro', ..., label 8 is 'stat'.
    categories = ('astro', 'cond', 'cs', 'hep', 'math', 'physics', 'quant',
                  'stat')
    test_docs_per_category = 100

    # load pickle (context managers make sure the files are closed again)
    # NOTE: pickle must only be used on trusted files.
    with open("2011_big_pop.p", "rb") as handle:
        arxiv_11 = pickle.load(handle)
    with open("2012_big_pop.p", "rb") as handle:
        arxiv_12 = pickle.load(handle)
    with open("minor_subjects.p", "rb") as handle:
        topiclists = pickle.load(handle)  # loaded as before; unused below
    print("loaded pickles")

    # build doc set, its labels and the individual training sets
    doc_set = []
    label_set = []
    topic_superset = []
    for label, name in enumerate(categories, 1):
        docs = arxiv_11[name]
        doc_set.extend(docs)
        label_set.extend([label] * len(docs))
        topic_superset.append(docs)
    doc_texts = tokenize(doc_set)

    # build individual lda
    lda_superset = []
    num_topics_list = []
    dictionary_set = []
    for topic_set in topic_superset:
        topic_texts = tokenize(topic_set)

        # turn our tokenized documents into a id - term dictionary
        dictionary = corpora.Dictionary(topic_texts)
        dictionary_set.append(dictionary)

        # convert tokenized documents into a document-term matrix
        corpus = [dictionary.doc2bow(text) for text in topic_texts]

        # generate LDA model; floor division keeps the topic count an int
        # (math.floor returned a float on Python 2)
        num_topics = len(topic_set) // 100
        num_topics_list.append(num_topics)
        ldamodel = gensim.models.ldamodel.LdaModel(
            corpus, num_topics=num_topics, id2word=dictionary, passes=20)
        lda_superset.append(ldamodel)

    print("all LDA built")

    # build training matrix
    trainingArray = _topic_proportion_matrix(doc_texts, lda_superset,
                                             dictionary_set, num_topics_list)

    print("training matrix built")
    print("------------------")
    print("testing")

    # test on new data
    # The original slices were [0:99], i.e. 99 documents per subject, while
    # the labels and the feature matrix assumed 100, so every label after the
    # first subject was shifted. Labels are now derived from the documents
    # actually taken.
    test_set = []
    test_label = []
    for label, name in enumerate(categories, 1):
        docs = arxiv_12[name][:test_docs_per_category]
        test_set.extend(docs)
        test_label.extend([label] * len(docs))
    test_texts = tokenize(test_set)

    # build test matrix with the same models and column layout
    testArray = _topic_proportion_matrix(test_texts, lda_superset,
                                         dictionary_set, num_topics_list)

    X_train, X_test, y_train, y_test = trainingArray, testArray, label_set, test_label

    print("training_array length: " + str(len(trainingArray)))
    print("test_array length: " + str(len(testArray)))
    print("training_label length: " + str(len(label_set)))
    print("test_label length: " + str(len(test_label)))
    print('--------------------------------')

    # One-vs-one wrappers around each base classifier; file names and printed
    # captions are unchanged.
    runs = (
        (GaussianNB(), 'ecocovosub_pred.csv', 'ecoc gnb'),
        (svm.SVC(kernel='linear'), 'ecocovosubsvm_pred.csv', 'ecoc svm'),
    )
    for base_estimator, csv_name, caption in runs:
        cla = OneVsOneClassifier(base_estimator)
        cla.fit(X_train, y_train)
        predictions = cla.predict(X_test)
        np.savetxt(csv_name, predictions.astype(int), fmt='%i', delimiter=",")
        print(caption)
        print(zero_one_loss(predictions, y_test))
        print('--------------------------------')
def training(train_data, dev_data, param):
    """
    Train the model on train_data and generate prediction on train_data, dev_data

    Note: when resampling is applied (``param['balanced']`` is neither
    'Bootstrap' nor 'Handsample'), ``train_data['encoded_label']`` is
    overwritten in place with the resampled labels, so that it stays aligned
    with the returned ``train_prediction``.

    :param train_data: dictionary containing data, encoded label and binary label
    :param dev_data: dictionary containing data, encoded label and binary label
    :param param: parameter for training
    :return:
        train_prediction: prediction of training data
        dev_prediction: prediction of development data
        train_vec.shape: shape of training vector (sample size, feature size)
        dev_vec.shape: shape of development vector (sample size, feature size)
        model: trained classifier
        word_vec_map: learned tfidf/count vectorizer
    """
    text_to_vec = TextToVec(**param)
    # Fit with both train and dev data
    text_to_vec.fit(train_data['data'] + dev_data['data'])

    # scikit-learn 1.2 removed ``get_feature_names``; prefer the replacement
    # when the vectorizer offers it, and keep returning a plain list.
    vectorizer = text_to_vec.vectorizer
    if hasattr(vectorizer, 'get_feature_names_out'):
        word_vec_map = list(vectorizer.get_feature_names_out())
    else:
        word_vec_map = vectorizer.get_feature_names()

    train_vec = text_to_vec.transform(train_data['data'])
    dev_vec = text_to_vec.transform(dev_data['data'])
    logger.info(
        f"train vec size:{train_vec.shape}, dev vec size:{dev_vec.shape}")

    # # apply weights on tfidf based on whether the word appear in multiple classes
    # tt_occ = Counter(train_data['encoded_label'])
    # weight_list = []
    # for i in range(train_vec.shape[1]):  # For every feature
    #     occ = Counter(train_data['encoded_label'][train_vec[:, i] > 0.0])
    #     for key, value in occ.items():
    #         occ[key] = value/tt_occ[key]
    #     weight_list.append(np.std(list(occ.values()))/0.35)
    # weight = np.array(weight_list).reshape(1, -1)
    # weight = weight/np.max(weight)
    # train_vec = np.multiply(train_vec, weight)

    # Perform oversampling on training data
    if param['balanced'] not in ['Bootstrap', 'Handsample']:
        logger.info(
            f"class info before resampling: {sorted(Counter(train_data['encoded_label']).items())}"
        )
        train_vec, train_data['encoded_label'] = resample(
            X_train=train_vec,
            y_train=train_data['encoded_label'],
            balance=param['balanced'])
        logger.info(
            f"class info after resampling:{sorted(Counter(train_data['encoded_label']).items())}"
        )

    # Fit model: any classifier name other than 'MultinomialNB' / 'LDA'
    # falls back to a linear SVM.
    if param['classifier'] == 'MultinomialNB':
        clf = MultinomialNB()
    elif param['classifier'] == 'LDA':
        clf = LinearDiscriminantAnalysis()
    else:
        clf = svm.LinearSVC()

    if param['multiclass'] == 'OnevsOne':
        model = OneVsOneClassifier(clf)
    else:
        model = OneVsRestClassifier(clf)

    logger.info(f'Fitting model: {param}')
    if param['classifier'] == 'LinearSVM' or param['multiclass'] == 'OnevsOne':
        # These configurations are trained on the integer-encoded labels and
        # predict class ids directly.
        model = model.fit(train_vec, train_data['encoded_label'])
        train_prediction = model.predict(train_vec)
        dev_prediction = model.predict(dev_vec)
    else:
        # One-vs-rest on the binary label matrix: predictions come back as an
        # indicator matrix, so the class id is recovered with argmax per row.
        model = model.fit(train_vec, train_data['binary_label'])
        train_prediction = np.argmax(model.predict(train_vec), axis=1)
        dev_prediction = np.argmax(model.predict(dev_vec), axis=1)

    return train_prediction, dev_prediction, train_vec.shape, dev_vec.shape, model, word_vec_map
y_pred_2_threshhold = (y_score > 70000)

# Multi-class classification
# By default, most binary classifiers, when given a multi-class
# problem, use the OvA strategy, except a few like SVMs which use OvO
# as it is faster. We can also explicitly tell scikit-learn which
# strategy to use.
multi_model = SGDClassifier()
multi_model.fit(X_train, y_train)  # this will use OvA
# In reality, this will use 10 different binary classifiers
multi_scores = multi_model.decision_function([some_digit])
# This stores an array of 10 scores (one per class) in multi_scores.
# The class with the highest score is the one selected by predict().

# To explicitly use a specified strategy
OvO_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
OvO_clf.fit(X_train, y_train)

# Random-Forest classifier
rf_clf = RandomForestClassifier()
rf_clf.fit(X_train, y_train)
rf_clf.predict_proba([some_digit])

y_pred_multi = cross_val_predict(multi_model, X_train, y_train, cv=3)
conf_mat = confusion_matrix(y_train, y_pred_multi)
# A confusion matrix is a 2-D array and must be drawn as an image;
# plt.plot() draws lines and rejects the `cmap` keyword.
plt.matshow(conf_mat, cmap=plt.cm.gray)
# plt.show()
# The rows represent the actual classes while the columns
# represent the predicted classes.

# Multi-Label Classification -- classifying multiple
# binary labels on one input
# Creating a new label array
result2 = np.append(c, d.astype(float), axis=1) #print(result) #np.savetxt("reslut.csv",np.array(clf.fit(X,Y).predict(test)).astype(int),delimiter=",") with open("logloss.csv", "wb") as f: f.write( b'Sample_id,Class_1,Class_2,Class_3,Class_4,Class_5,Class_6,Class_7,Class_8,Class_9,Class_10\n' ) np.savetxt(f, result2, delimiter=",") # Split the data into a training set and a test set X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state=0) #Multiclass learning using OvO and linearSVC OvOclf = OneVsOneClassifier(LinearSVC(random_state=0)) a = np.reshape(np.asarray(list(range(1, len(test) + 1))), (6544, 1)) b = np.reshape(np.asarray(OvOclf.fit(X_train, Y_train).predict(test)), (6544, 1)) OvOclf.fit(X, Y).predict(X) y_pred = OvOclf.fit(X_train, Y_train).predict(X_test) def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues): """ This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. """ if normalize:
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model use iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for example
    Huber estimator or median-of-means estimator)[1], [3]. The idea behind this
    algorithm was mentioned before in [2].
    This idea translates in an iterative algorithm where the sample_weight
    are changed at each iterations and are dependent of the sample. Informally
    the outliers should have small weight while the inliers should have big
    weight, where outliers are sample with a big loss function.

    This algorithm enjoy a non-zero breakdown-point (it can handle arbitrarily
    bad outliers). When the "mom" weighting scheme is used, k outliers can be
    tolerated. When the "Huber" weighting scheme is used, asymptotically the
    number of outliers has to be less than half the sample size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or 'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for Huber weighting procedure, used only if weightings
        is 'huber'. Measure the robustness of the weighting procedure. A small
        value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=0
        Parameter used for mom weighting procedure, used only if weightings
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported : 'squared_loss', .

    sgd_args : dict or None, default=None
        arguments of the SGDClassifier base estimator. None is treated as
        an empty dict (no extra arguments).

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or "ovr"
        for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.


    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.
        Only available if multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : ndarray of shape (n_classes,) or (1, )
        Actual number of iterations for all classes. If binary or multinomial,
        it returns only 1 element. For liblinear solver, only the maximum
        number of iteration across all classes is given.
        Only set if multi_class = "binary".

    base_estimator_ : object,
        The fitted base estimator SGDCLassifier.

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is.
        Only available if multi_class = "binary"

    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers in
            regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.

        Raises
        ------
        ValueError
            If ``multi_class`` is not one of "ovr", "ovo" or "binary".
        """
        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            # Only the un-wrapped estimator exposes these fitted attributes.
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
            # len() is ambiguous (raises) for scipy sparse matrices, which
            # are accepted input, so read the sample count from .shape
            # whenever it is available.
            n_samples = X.shape[0] if hasattr(X, "shape") else len(X)
            self.n_iter_ = self.max_iter * n_samples
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        # Probabilities are only exposed for the logistic loss.
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Implemented as a property so that accessing ``predict_proba`` raises
        AttributeError when ``loss`` is not "log"; otherwise a callable
        taking ``X`` is returned.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples and
            `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        # Fixed value rather than delegation: the wrapped estimator only
        # exists as ``base_estimator_`` after ``fit`` (there is no
        # ``base_estimator`` attribute on this class), while the estimator
        # type may be queried on an unfitted instance.
        return "classifier"

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.
        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Predict using the linear model

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array, shape (n_samples,)
            Predicted target values per element in X.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
class LSVMDetector:
    """Per-subject SVM detector separating a genuine user from imposters.

    For each subject a classifier is trained on that subject's samples
    (label 0) against samples taken from all other users (label 1), then
    scored on held-out genuine samples and on the supplied attacker samples.
    """
    # just the training() function changes, rest all remains same.

    # Feature columns fed to the classifier. Raw strings keep the literal
    # backslash in the percentile column names without relying on the
    # invalid ``\%`` escape sequence.
    _FEATURE_COLUMNS = [
        "stroke duration",
        'start $x$',
        'start $y$',
        'stop $x$',
        'stop $y$',
        'length of trajectory',
        'mid-stroke pressure',
        'mid-stroke area covered',
        r'20\%-perc. pairwise velocity',
        r'50\%-perc. pairwise velocity',
        r'20\%-perc. dev. from end-to-end line',
        r'50\%-perc. dev. from end-to-end line',
        r'80\%-perc. dev. from end-to-end line',
    ]

    def __init__(self, subjects, data, attacker_data):
        """Store the inputs and reset the score containers.

        :param subjects: a single user id, or a list of user ids to evaluate
        :param data: DataFrame with a ``user_id`` column and the feature
            columns listed in ``_FEATURE_COLUMNS``
        :param attacker_data: attacker samples; indexed by subject position
            when ``subjects`` is a list, used as-is otherwise
        """
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        """Fit the classifier on ``self.train`` (0) vs ``self.train_imposter`` (1)."""
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        """Store decision scores for the genuine and imposter test sets as lists."""
        self.u_scores = list(self.clf.decision_function(self.test_genuine))
        self.i_scores = list(self.clf.decision_function(self.test_imposter))

    def _evaluate_subject(self, subject, attacker_samples):
        """Train and test for one subject; return ``evaluateFAR`` of its scores."""
        columns = list(self._FEATURE_COLUMNS)
        genuine_user_data = self.data.loc[self.data.user_id == subject, columns]
        imposter_data = self.data.loc[self.data.user_id != subject, :]

        # First 400 genuine samples: 200 for training, the next 200 for testing.
        genuine_user_data = normalize_df(genuine_user_data[:400])
        self.train = genuine_user_data[:200]
        self.test_genuine = genuine_user_data[200:400]

        # Negative training class: the last 10 samples of every other user.
        self.train_imposter = normalize_df(
            imposter_data.groupby("user_id").tail(10).loc[:, columns])
        self.test_imposter = attacker_samples

        self.training()
        self.testing()
        return evaluateFAR(self.u_scores, self.i_scores)

    def evaluate(self):
        """Return the mean ``evaluateFAR`` value over all configured subjects."""
        if isinstance(self.subjects, list):
            fpr = [
                self._evaluate_subject(subject, self.attacker[idx])
                for idx, subject in enumerate(self.subjects)
            ]
        else:
            fpr = [self._evaluate_subject(self.subjects, self.attacker)]
        return np.mean(fpr)
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    Gaussian Processes for Machine Learning (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the  signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer : int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict : int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        Seed or generator forwarded to the underlying binary classifier.
        If an integer is given, it fixes the seed. Defaults to the global
        numpy random number generator.

    multi_class : string, default : "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the binary sub-classifiers.

    log_marginal_likelihood_value_ : float
        The log-marginal-likelihood of ``self.kernel_.theta``

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data

    .. versionadded:: 0.18
    """
    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values (class labels). At least two distinct classes are
            required; more than two are handled according to ``multi_class``.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If only one class is present in ``y`` or ``multi_class`` is
            unknown.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            # Wrap the binary classifier in the requested multi-class scheme.
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            self.log_marginal_likelihood_value_ = np.mean(
                [estimator.log_marginal_likelihood()
                 for estimator in self.base_estimator_.estimators_])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.

        Raises
        ------
        ValueError
            If more than two classes were fitted in "one_vs_one" mode.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel(
                [estimator.kernel_
                 for estimator in self.base_estimator_.estimators_])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the binary sub-classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the  hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernel get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of ``self.kernel_.theta`` is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)
        else:
            if eval_gradient:
                raise NotImplementedError(
                    "Gradient of log-marginal-likelihood not implemented for "
                    "multi-class GPC.")
            estimators = self.base_estimator_.estimators_
            n_dims = estimators[0].kernel_.n_dims
            # The compound kernel holds one sub-kernel per fitted binary
            # estimator. That count equals the number of classes for
            # one-vs-rest but is n_classes * (n_classes - 1) / 2 for
            # one-vs-one, so size the check on the estimators themselves.
            n_estimators = len(estimators)
            if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
                return np.mean(
                    [estimator.log_marginal_likelihood(theta)
                     for estimator in estimators])
            elif theta.shape[0] == n_dims * n_estimators:
                # theta for compound kernel
                return np.mean(
                    [estimator.log_marginal_likelihood(
                        theta[n_dims * i:n_dims * (i + 1)])
                     for i, estimator in enumerate(estimators)])
            else:
                raise ValueError("Shape of theta must be either %d or %d. "
                                 "Obtained theta with shape %d."
                                 % (n_dims, n_dims * n_estimators,
                                    theta.shape[0]))
# Load the space-delimited train / test tables as numpy ndarrays.
tr = np.genfromtxt(tr_path, delimiter=' ')
ts = np.genfromtxt(ts_path, delimiter=' ')

# Column 0 is the label; every remaining column is a feature.
tr_label, tr_feat = tr[:, 0], tr[:, 1:]
ts_label, ts_feat = ts[:, 0], ts[:, 1:]

# Linear support vector classification via sklearn's LinearSVC
## == one-vs-one == ##
# Multiclass support is handled explicitly with a one-vs-one wrapper

# train
ovo_clf = OneVsOneClassifier(LinearSVC())
ovo_clf.fit(tr_feat, tr_label)

# predict, and derive the overall error from the mean accuracy
ovo_pred = ovo_clf.predict(ts_feat)
ovo_err = 1 - ovo_clf.score(ts_feat, ts_label)

# confusion matrix, e.g.
# array([[159,   7],
#        [  5, 161]])
ovo_cmat = metrics.confusion_matrix(ts_label, ovo_pred)
# Row sums of the matrix; the per-class rate below is one minus the
# diagonal entry divided by its row sum.
pred_total = ovo_cmat.sum(axis=1)
ovo_mis = 1 - np.diag(ovo_cmat).astype(float) / pred_total

print("one vs. one svm - classification err: %s \n" % ovo_err)
print("confusion matrix: \n %s" % ovo_cmat)
print("class misclassification rate : \n %s" % ovo_mis)
# print() calls instead of Python 2 print statements: the statement form is
# a syntax error under Python 3, and the printed text is unchanged.
print(79 * "_")
print('OvO', cv_scores_ovo.mean())
print('OvA', cv_scores_ova.mean())

plt.figure(figsize=(4, 3))
plt.boxplot([cv_scores_ova, cv_scores_ovo])
plt.xticks([1, 2], ['One vs All', 'One vs One'])
plt.title('Prediction: accuracy score')

### Plot a confusion matrix ###################################################
# Fit on the first 10 sessions and plot a confusion matrix on the
# last 2 sessions
from sklearn.metrics import confusion_matrix

svc_ovo.fit(X[session < 10], y[session < 10])
y_pred_ovo = svc_ovo.predict(X[session >= 10])

# confusion_matrix expects (y_true, y_pred): rows are the true conditions,
# columns the predicted ones.
plt.matshow(confusion_matrix(y[session >= 10], y_pred_ovo))
plt.title('Confusion matrix: One vs One')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)

svc_ova.fit(X[session < 10], y[session < 10])
y_pred_ova = svc_ova.predict(X[session >= 10])

plt.matshow(confusion_matrix(y[session >= 10], y_pred_ova))
plt.title('Confusion matrix: One vs All')
plt.xticks(np.arange(len(unique_conditions)), unique_conditions)
plt.yticks(np.arange(len(unique_conditions)), unique_conditions)
async def do_run_async(self):
    """Train several multiclass classifiers on the digit images and compare them.

    Fits an SGD classifier, the same classifier wrapped in a one-vs-one
    scheme, and a random forest; prints their predictions for one sample of
    the digit 7, prints cross-validated confusion matrices, and finally
    round-trips a freshly trained random forest through pickle.
    """
    training_set = super().load_train_images()
    # The classifiers need one flat feature vector per image, so collapse
    # every axis after the first (e.g. 3D (60000, 28, 28) -> 2D (60000, 784)).
    training_set_tr = training_set.reshape((len(training_set), -1))
    training_labels = super().load_train_labels()

    # Scikit-learn is smart enough to detect when you try to use a binary
    # classification algorithm such as SGD on a multiclass classification
    # task (when the labels are not binary) and automatically runs the OvA
    # strategy (trains N binary classifiers, one for each class), except for
    # SVM for which it runs OvO (trains N x (N-1)/2 binary classifiers, one
    # between 0 and 1, one between 1 and 2 etc).
    sgd_classifier = SGDClassifier(random_state=77)
    sgd_classifier.fit(training_set_tr, training_labels)
    seven = super().get_random_digit(training_set_tr, training_labels, 7)
    print(f"The digit is:{sgd_classifier.predict([seven])}")

    # Get the classifier to return the decision scores for each class rather
    # than a prediction. The class with the higher score is used for
    # prediction.
    scores = sgd_classifier.decision_function([seven])
    print(f"The decision scores for the digit are:{scores}")

    # Can also force Scikit-Learn to use the SGDClassifier with OvO strategy
    ovo = OneVsOneClassifier(sgd_classifier)
    ovo.fit(training_set_tr, training_labels)
    print("OvO: The digit is:", ovo.predict([seven]))

    # Random Forest can also be used for classification (besides regression
    # - RandomForestRegressor) and is a multiclass algorithm, so there is no
    # need for OvA or OvO strategies.
    rnd_forest = RandomForestClassifier()
    rnd_forest.fit(training_set_tr, training_labels)
    print(f"Random Forest: The digit is:{rnd_forest.predict([seven])}")
    print(
        f"Random Forest: Probabilities:{rnd_forest.predict_proba([seven])}"
    )

    # Evaluate SGD Classifier vs Random Forest based on confusion matrix
    sgd_predictions = cross_val_predict(sgd_classifier,
                                        training_set_tr,
                                        training_labels,
                                        cv=3)
    rnd_forest_predictions = cross_val_predict(rnd_forest,
                                               training_set_tr,
                                               training_labels,
                                               cv=3)
    print("SGD Classifier Confusion Matrix:")
    print(confusion_matrix(training_labels, sgd_predictions))
    print("Random Forest Classifier Confusion Matrix:")
    print(confusion_matrix(training_labels, rnd_forest_predictions))
    # Random Forest generally seems better - higher values on the main
    # diagonal

    # Test persisting a trained classifier
    rnd_forest = RandomForestClassifier()
    rnd_forest.fit(training_set_tr, training_labels)
    model_path = "D:\\rnd_forest.dat"
    # Context managers guarantee the handles are closed even when
    # dump()/load() raises.
    with open(model_path, "wb") as model_file:
        pickle.dump(rnd_forest, model_file)
    # The file read back here is the one written just above; never unpickle
    # data from an untrusted source.
    with open(model_path, "rb") as model_file:
        rnd_forest_2 = pickle.load(model_file)
    print("Random Forest Persisted: The digit is:",
          rnd_forest_2.predict([seven]))
def evaluation():
    """Compare a one-vs-one and a one-vs-rest linear SVM on the iris data.

    Each classifier is trained on a fixed 60/40 split of the data. Its
    classification report is written as a LaTeX table under
    ``output_files/`` and its normalized confusion matrix is saved as a PNG
    under ``images/``.
    """
    iris = datasets.load_iris()
    class_names = iris.target_names
    C = 1.0  # SVM regularization parameter
    X_train, X_test, y_train, y_test = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=0)

    def _evaluate(estimator, report_path, image_path, **plot_options):
        # Fit, predict, then export the report and the confusion-matrix plot.
        y_pred = estimator.fit(X_train, y_train).predict(X_test)

        with open(report_path, 'w') as out:
            out.write('\\begin{table}\n')
            latex_report = report_to_latex_table(
                classification_report(y_test,
                                      y_pred,
                                      target_names=iris.target_names))
            out.write(latex_report)
            out.write('\n\\end{table}\n')

        # Compute confusion matrix
        cnf_matrix = confusion_matrix(y_test, y_pred)
        np.set_printoptions(precision=2)

        # Plot normalized confusion matrix
        plt.figure()
        plot_confusion_matrix(cnf_matrix,
                              classes=class_names,
                              normalize=True,
                              **plot_options)
        plt.tight_layout()
        plt.savefig(image_path)
        plt.close()

    # One-vs-one: SVC with a linear kernel.
    _evaluate(OneVsOneClassifier(SVC(kernel='linear', C=C)),
              'output_files/svm_1v1_classification_report.tex',
              'images/svm_linear_1v1_cm.png',
              title='SVM One-vs-One Confusion Matrix')

    # One-vs-rest: LinearSVC; this plot is drawn with showC=False.
    _evaluate(OneVsRestClassifier(LinearSVC(C=C)),
              'output_files/svm_1va_classification_report.tex',
              'images/svm_linear_1va_cm.png',
              title='SVM One-vs-All Confusion Matrix',
              showC=False)
from sklearn.multiclass import OneVsOneClassifier
from sklearn.preprocessing import label_binarize
from sklearn.svm import LinearSVC
from sklearn import metrics

# NOTE(review): the "train" arrays are read from 'multitest_out.csv' and the
# "test" arrays from 'multitrain_out.csv'. This looks swapped - confirm which
# file is meant for which split before relying on the metrics below.
data_tr_r = np.loadtxt('multitest_out.csv', delimiter=',')
data_ts_r = np.loadtxt('multitrain_out.csv', delimiter=',')

# The last column holds the label; everything before it is a feature.
data_tr = data_tr_r[:, :-1]
data_ts = data_ts_r[:, :-1]
label_tr = data_tr_r[:, -1]
label_ts = data_ts_r[:, -1]

# Train one binary linear SVM per pair of classes (one-vs-one).
clf = OneVsOneClassifier(LinearSVC(random_state=0))
OvsO = clf.fit(data_tr, label_tr)  # fit() returns the fitted classifier itself
result = clf.predict(data_ts)

# scikit-learn metrics take (y_true, y_pred) in that order.
accuracy = metrics.accuracy_score(label_ts, result)
error_vector = result - label_ts
error = 0

# Reuse the model fitted above instead of training it a second time; with
# random_state fixed, the refit produced the same decision values.
# Column 1 of the decision values is kept.
p_data = clf.decision_function(data_ts)[:, 1]

conf_mat = metrics.confusion_matrix(label_ts, result)
precision = metrics.precision_score(label_ts, result, average=None)
recall = metrics.recall_score(label_ts, result, average=None)
print("confusion_matrix:")
print(conf_mat)
# Splitting the dataset into test and train
s_train, s_test, t_train, t_test = train_test_split(s, t, test_size=0.33,
                                                    random_state=4)

# Printing shapes
print(s_train.shape)
print(s_test.shape)
print(t_train.shape)
print(t_test.shape)

# One-vs-One SVM Classifier Prediction
# fit() returns the fitted estimator, so a single call is enough; the second
# identical fit on the same data only repeated the training work.
smodel = OneVsOneClassifier(SVC()).fit(s_train, t_train)
sprediction = smodel.predict(s_test)
print(sprediction)

# One-vs-Rest SVM Classifier Prediction
clf = OneVsRestClassifier(SVC()).fit(s_train, t_train)
spredict = clf.predict(s_test)
print(spredict)

# Actual values which should have been predicted based on testing dataset
print(t_test)

"""<h1>Evaluating the classifiers</h1>"""

# Accuracy for One-vs-One Classifier
accuracy = metrics.accuracy_score(t_test, sprediction)
print(accuracy)
scores = sgd.decision_function([some_digit]) #array([[-836368.4535247 , -461981.66956632, -660256.15197058, # -148855.65250873, -137458.04986937, -154654.76568534, # -864502.26667054, -245167.9063152 , -149510.01775103, # -233700.77221455]]) #argmax gives max values of scores np.argmax(scores) sgd.classes_ #array(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'], dtype='<U1') from sklearn.multiclass import OneVsOneClassifier ovo = OneVsOneClassifier(SGDClassifier(random_state=100)) ovo.fit(data_train, target_train) ovo.predict([some_digit]) len(ovo.estimators_) ovo.decision_function([some_digit]) #array([[ 1.5 , 4.01086892, 0.50210079, 5.22484016, 8.31545536, # 5.11411311, -0.43998285, 5.13308383, 7.3219439 , 8.3175768 ]]) cross_val_score(sgd, data_train, target_train, cv=3, scoring='accuracy') #array([0.86552689, 0.86179309, 0.86117918]) import pandas as pd predict_m = cross_val_predict(sgd, data_train, target_train, cv=3) ps = precision_score(target_train, predict_m, average=None)
def my_HKNNSVM(X_train, X_test, Y_train, K_Neighbors, Kernel_SVM):
    """Classify each test sample with a local SVM trained on its k nearest neighbours.

    Both feature sets are passed through ``normalize`` (presumably
    scikit-learn's row normaliser - confirm against the file's imports). For
    every test sample the ``K_Neighbors`` closest training samples by
    Euclidean distance are selected. If they carry more than one distinct
    label, a one-vs-one ``SVC`` with kernel ``Kernel_SVM`` is fitted on just
    those neighbours and asked to predict the sample; otherwise the single
    shared label is used directly.

    Args:
        X_train: training feature rows.
        X_test: test feature rows.
        Y_train: training labels, indexable by row position.
        K_Neighbors: number of nearest neighbours to use.
        Kernel_SVM: kernel name forwarded to ``SVC``.

    Returns:
        A list with one array of shape ``(1,)`` per test sample, holding the
        predicted label.

    Raises:
        ValueError: if ``K_Neighbors`` is smaller than 1 (and there is at
            least one test sample).
        IndexError: if ``K_Neighbors`` exceeds the number of training samples
            (and there is at least one test sample).
    """
    # Normalise and convert once; neither depends on the test sample.
    train = numpy.array(normalize(X_train))
    test = numpy.array(normalize(X_test))
    kelas = Y_train
    k = K_Neighbors

    if len(test) == 0:
        return []
    # Fail early with the same exception types the per-sample code used to
    # raise, but with a message that names the actual problem.
    if k < 1:
        raise ValueError("K_Neighbors must be at least 1, got %r" % (k,))
    if k > len(train):
        raise IndexError(
            "K_Neighbors (%r) exceeds the number of training samples (%d)"
            % (k, len(train)))

    Y_pred = []
    for sample in test:
        # Euclidean distance from this sample to every training row.
        distances = numpy.sqrt(((train - sample) ** 2).sum(axis=1))
        # Stable sort keeps the original row order among equal distances.
        nearest = numpy.argsort(distances, kind='stable')[:k].tolist()
        neighbor_points = train[nearest]
        neighbor_labels = [kelas[i] for i in nearest]

        if len(set(neighbor_labels)) > 1:
            # Neighbours disagree: let a local SVM decide.
            clf = OneVsOneClassifier(SVC(kernel=Kernel_SVM))
            clf.fit(neighbor_points, neighbor_labels)
            hasilkelas = clf.predict(sample.reshape(1, -1))
        else:
            # Every neighbour has the same label; no model is needed.
            hasilkelas = neighbor_labels[0]

        Y_pred.append(numpy.array(numpy.reshape(hasilkelas, (1, ))))
    return Y_pred
class SeCoEstimator(BaseEstimator, ClassifierMixin):
    """A classifier using rules learned with the *Separate-and-Conquer* (SeCo)
    algorithm, also known as *Covering* algorithm.

    Wraps `_BaseSeCoEstimator` to handle multi-class problems, selecting a
    multi-class strategy and making sure that `_BaseSeCoEstimator` always sees
    an integer range [0..n_classes_) of class labels, where 0 is the intended
    fallback class; i.e. the biggest class in multi-class problems, or the
    negative class when learning a binary concept.

    The concrete SeCo variant to run is defined by `algorithm_config`.

    Fields
    -----
    algorithm_config : subclass of SeCoAlgorithmConfiguration
        Defines the concrete SeCo algorithm to run, see
        :class:`SeCoAlgorithmConfiguration`.

    Parameters
    -----
    multi_class : callable or str or None
        Which strategy to use for non-binary problems. Possible values:

        - None: auto-select; use 'direct' if possible
          (`algorithm_config.direct_multiclass_support()` returns True),
          'one_vs_rest' otherwise.
        - A callable: Construct
          `self.base_estimator_ = multi_class(_BaseSeCoEstimator())` and
          delegate to that estimator. Useful if you want to roll a different
          binarization strategy, e.g.

              >>> import sklearn.multiclass, functools
              >>> multi_class=functools.partial(
              ...     sklearn.multiclass.OutputCodeClassifier,
              ...     code_size=0.7, random_state=42)

          If you use this, make sure to pass to `_BaseSeCoEstimator` classes
          `y` from an integer range [0..n_classes_), e.g. using
          `LabelEncoder`. Also be aware of class order influence on
          tie-breaking.
        - 'direct': Directly learn a theory of rules with different heads
          (target classes). Uses :class:`BySizeLabelEncoder` internally.
          `fit` raises `ValueError` if
          `algorithm_config.direct_multiclass_support()` is false.
        - 'one_vs_rest': Use `sklearn.multiclass.OneVsRestClassifier` for class
          binarization and learn binary theories.
        - 'one_vs_one': Use `sklearn.multiclass.OneVsOneClassifier` for class
          binarization and learn binary theories.
        - TODO: multi_class strategy of ripper: OneVsRest, remove C_i after
          learning rules for it

    random_state : None | int | instance of np.random.RandomState
        RNG, may be used by the algorithm. Value passed through
        `sklearn.utils.check_random_state`.

    n_jobs : int, optional
        Passed to `OneVsRestClassifier` or `OneVsOneClassifier` if these are
        used.

    Attributes
    -----
    base_estimator_ : estimator instance
        The estimator object that all tasks are delegated to. One of
        `sklearn.multiclass.OneVsRestClassifier`,
        `sklearn.multiclass.OneVsOneClassifier` or
        `sklearn_seco.util.TargetTransformingMetaEstimator` if demanded by the
        `multi_class_` strategy, a `_BaseSeCoEstimator` otherwise.

    multi_class_ : callable or str
        The actual strategy used on a non-binary problem. Relevant if
        `multi_class=None` demanded auto-selection.

    classes_ : np.ndarray
        `np.unique(y)`

    See Also
    -----
    `_BaseSeCoEstimator`
    """

    algorithm_config: Type[SeCoAlgorithmConfiguration]

    # TODO: _BaseSeCoEstimator.export_text equivalent inverting binarization & target transformation for display

    def _more_tags(self):
        # tell sklearn >= 0.21 that we can handle categorical data
        return {'X_types': ['2darray', 'categorical'], 'allow_nan': True}

    def __init__(self, multi_class=None, random_state=1, n_jobs=1):
        self.multi_class = multi_class
        self.random_state = random_state
        self.n_jobs = n_jobs

    def fit(self, X, y, **kwargs):
        """Learn SeCo theory/theories on training data `X, y`.

        For possible parameters (`**kwargs`), refer to
        :class:`_BaseSeCoEstimator`.

        Raises
        ------
        ValueError
            If `y` contains only one class, if `multi_class` names an unknown
            strategy, or if `multi_class='direct'` is requested on a
            multi-class problem although `algorithm_config` reports no direct
            multi-class support.
        """
        X, y = check_X_y(X, y, force_all_finite='allow-nan')
        self.multi_class_ = self.multi_class
        self.base_estimator_ = _BaseSeCoEstimator(
            self.algorithm_config, random_state=self.random_state, **kwargs)
        # NOTE: if using multiprocessing (e.g. through OvO or OvR), all
        #   sub-estimators share the same random seed/state.
        #   I think this should not harm.

        def wrapper_ordering_classes_by_size(estimator):
            # BySizeLabelEncoder ensures: first class = default = biggest
            # and that classes form an integer range [0..n_classes_)
            return TargetTransformingMetaEstimator(BySizeLabelEncoder(),
                                                   estimator)

        self.classes_ = np.unique(y)
        n_classes_ = self.classes_.size
        if n_classes_ == 1:
            raise ValueError("SeCoEstimator requires 2 or more distinct "
                             "classes. Only 1 class (%s) present."
                             % self.classes_[0])
        elif n_classes_ == 2:
            self.base_estimator_ = wrapper_ordering_classes_by_size(
                self.base_estimator_)
        else:  # n_classes_ > 2
            if self.multi_class_ is None:
                # default / auto-selection
                if self.algorithm_config.direct_multiclass_support():
                    self.multi_class_ = "direct"
                else:
                    self.multi_class_ = "one_vs_rest"

            if callable(self.multi_class_):
                self.base_estimator_ = self.multi_class_(self.base_estimator_)
            elif self.multi_class_ == "one_vs_rest":
                self.base_estimator_ = OneVsRestClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "one_vs_one":
                self.base_estimator_ = OneVsOneClassifier(
                    self.base_estimator_, n_jobs=self.n_jobs)
            elif self.multi_class_ == "direct":
                # Reject an explicit 'direct' request for a binary-only
                # learner here, rather than relying on an assertion further
                # down (which is stripped under `python -O`) to stop it from
                # silently learning on multi-class training data.
                if not self.algorithm_config.direct_multiclass_support():
                    raise ValueError(
                        "multi_class='direct' requested, but the algorithm "
                        "configuration does not support learning multi-class "
                        "theories directly. Use 'one_vs_rest' or "
                        "'one_vs_one' instead.")
                self.base_estimator_ = wrapper_ordering_classes_by_size(
                    self.base_estimator_)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class_)

        # NOTE: param categorical_features is data dependent, but OvR/OvO don't
        #   pass extra parameters through fit(), so it has to be in
        #   `_BaseSeCoEstimator.__init__`.
        self.base_estimator_.fit(X, y)
        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from ``classes_``
        """
        check_is_fitted(self, ["classes_"])
        X = check_array(X, force_all_finite='allow-nan')
        return self.base_estimator_.predict(X)

    @if_delegate_has_method('base_estimator_')
    def predict_proba(self, X):
        """Delegate to `base_estimator_.predict_proba`, if it has one."""
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.predict_proba(X)

    @if_delegate_has_method('base_estimator_')
    def decision_function(self, X):
        """Delegate to `base_estimator_.decision_function`, if it has one."""
        # noinspection PyUnresolvedReferences
        return self.base_estimator_.decision_function(X)

    def get_seco_estimators(self) -> Sequence[_BaseSeCoEstimator]:
        """
        :return: The `_BaseSeCoEstimator` instances that were trained.
          Depending on the multi-class strategy, the class labels they use
          differ in order and value.
          Cannot be used when self.multi_class_ is a callable.
        :raises AssertionError: If the fitted strategy is none of the known
          ones (e.g. a callable on a multi-class problem).
        """
        check_is_fitted(self, 'base_estimator_')
        is_binary = len(self.classes_) == 2
        if is_binary or self.multi_class_ == "direct":
            assert isinstance(self.base_estimator_,
                              TargetTransformingMetaEstimator)
            return [self.base_estimator_.estimator]
        elif self.multi_class_ == "one_vs_rest":
            assert isinstance(self.base_estimator_, OneVsRestClassifier)
            return self.base_estimator_.estimators_
        elif self.multi_class_ == "one_vs_one":
            assert isinstance(self.base_estimator_, OneVsOneClassifier)
            return self.base_estimator_.estimators_
        else:
            # Raised explicitly (not `assert False`) so the failure survives
            # `python -O`; the exception type is unchanged.
            raise AssertionError(
                "invalid state: unknown type of base_estimator_ "
                f"({str(self.base_estimator_)})")
X = np.array(X) label_encoder = [] X_encoded = np.empty(X.shape) for i, item in enumerate(X[0]): if item.isdigit(): X_encoded[:, i] = X[:, i] else: label_encoder.append(preprocessing.LabelEncoder()) X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i]) X = X_encoded[:, :-1].astype(int) y = X_encoded[:, -1].astype(int) classifier = OneVsOneClassifier(LinearSVC(random_state=0)) classifier.fit(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5) classifier = OneVsOneClassifier(LinearSVC(random_state=0)) classifier.fit(X_train, y_train) y_test_pred = classifier.predict(X_test) f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3) print('F1 score : ' + str(round(100 * f1.mean(), 2)) + '%') input_data = [ '37', 'Private', '215646', 'HS-grad', '9', 'Never-married', 'Handlers-cleaners', 'Not-in-family', 'White', 'Male', '0', '0', '40',