def train_classifier(clf, X_train, y_train, X_test, y_test):
    """Wrap ``clf`` in a one-vs-one meta-classifier, fit it and print the fit time.

    Parameters
    ----------
    clf : estimator
        Base estimator handed to ``OneVsOneClassifier``.
    X_train, y_train
        Training samples and labels, passed unchanged to ``fit``.
    X_test, y_test
        Accepted for signature compatibility; not used by this function.

    Returns
    -------
    The fitted ``OneVsOneClassifier`` wrapping ``clf``.
    """
    clf = OneVsOneClassifier(clf)
    # Start the clock before fitting. The original assigned ``t0`` only after
    # reading it, which made ``time() - t0`` raise UnboundLocalError.
    t0 = time()
    clf.fit(X_train, y_train)
    train_time = time() - t0
    print("train time: %0.3fs" % train_time)
    return clf
def svm_classification(genres, features_type):
    """Train a linear one-vs-one SVM on genre features and report its accuracy.

    Features are read from the ``training`` and ``testing`` folders under
    ``../../music``. Only the mean vector of each feature triple is used as
    the sample; the covariance matrix is ignored. The accuracy is printed and
    also written to file through the ``rt`` helpers.
    """
    def to_arrays(feature_set):
        # Each feature is a (mean, covariance, genre name) triple.
        samples = []
        labels = []
        for mean, _cov_mat, genre_name in feature_set:
            samples.append(mean.tolist())
            labels.append(tf.get_genre_ID(genre_name))
        return np.array(samples), np.array(labels)

    training_set_features = tf.read_features_from_files(
        "../../music/training", genres, features_type)
    testing_set_features = tf.read_features_from_files(
        "../../music/testing", genres, features_type)

    training_data, training_class = to_arrays(training_set_features)
    testing_data, testing_class = to_arrays(testing_set_features)

    classifier = OneVsOneClassifier(SVC(kernel='linear'))
    fitted = classifier.fit(training_data, training_class)
    result_class = np.array(fitted.predict(testing_data))

    rt.print_accuracy(
        list(testing_class), list(result_class),
        genres, features_type, "svm")
    rt.write_accuracy_to_file(
        "../../music/", list(testing_class), list(result_class),
        genres, features_type, "svm")
def svm_training(train_X, train_Y, kernel):
    """Fit and return a one-vs-one SVM on ``train_X`` / ``train_Y``.

    When ``kernel`` compares equal to ``False`` a ``LinearSVC`` (seeded with
    ``random_state=0``) is used as the base estimator; otherwise an RBF
    ``SVC`` is used.
    """
    # Equality (not truthiness) is kept on purpose so that values such as
    # ``None`` still select the RBF branch, exactly as before.
    use_linear = (kernel == False)  # noqa: E712
    base = svm.LinearSVC(random_state=0) if use_linear else svm.SVC(kernel='rbf')
    model = OneVsOneClassifier(base)
    model.fit(train_X, train_Y)
    return model
def test_ovo_ties():
    # test that ties are broken using the decision function, not defaulting to
    # the smallest label
    #
    # The test fits a one-vs-one Perceptron on four 2-D points with three
    # classes, rebuilds the per-class vote counts by hand from the pairwise
    # estimators, and checks how the tie on the first sample is resolved.
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])
    multi_clf = OneVsOneClassifier(Perceptron())
    ovo_prediction = multi_clf.fit(X, y).predict(X)

    # recalculate votes to make sure we have a tie
    # (one row per pairwise estimator, one column per sample)
    predictions = np.vstack([clf.predict(X) for clf in multi_clf.estimators_])
    scores = np.vstack([clf.decision_function(X)
                        for clf in multi_clf.estimators_])
    # classifiers are in order 0-1, 0-2, 1-2
    # aggregate votes:
    # NOTE(review): the index arithmetic below presumes each pairwise
    # estimator predicts 0 or 1 (first or second class of its pair), so
    # ``2 * p`` maps onto classes {0, 2} and ``1 + p`` onto {1, 2} -- confirm
    # against the scikit-learn version under test.
    votes = np.zeros((4, 3))
    votes[np.arange(4), predictions[0]] += 1
    votes[np.arange(4), 2 * predictions[1]] += 1
    votes[np.arange(4), 1 + predictions[2]] += 1

    # for the first point, there is one vote per class
    assert_array_equal(votes[0, :], 1)
    # for the rest, there is no tie and the prediction is the argmax
    assert_array_equal(np.argmax(votes[1:], axis=1), ovo_prediction[1:])
    # for the tie, the prediction is the class with the highest score
    assert_equal(ovo_prediction[0], 0)
    # in the zero-one classifier, the score for 0 is greater than the score for
    # one.
    assert_greater(scores[0][0], scores[0][1])
    # score for one is greater than score for zero
    assert_greater(scores[2, 0] - scores[0, 0], scores[0, 0] + scores[1, 0])
    # score for one is greater than score for two
    assert_greater(scores[2, 0] - scores[0, 0], -scores[1, 0] - scores[2, 0])
def gen_svc(train_model):
    '''Build one (DictVectorizer, classifier) pair per POS tag.

    Args:
        train_model: a training model object with two attributes:
            feature_lists, a map from POS tag to the list of feature
            dictionaries (one per decision), and action_lists, a map from
            POS tag to the list of actions (Shift, Left, Right) chosen for
            those decisions.

    Returns:
        dictionary mapping POS tag to a (vectorizer, classifier) tuple

    Raises:
        None
    '''
    per_tag = {}
    for pos_tag in train_model.feature_lists:
        tag_features = train_model.feature_lists[pos_tag]
        vectorizer = DictVectorizer()
        matrix = vectorizer.fit_transform(tag_features)

        classifier = OneVsOneClassifier(LinearSVC())
        try:
            classifier.fit(matrix,
                           np.array(train_model.action_lists[pos_tag]))
        except ValueError:
            # Sometimes every decision for a POS tag has the same action, and
            # fitting then fails. Fall back to a constant predictor.
            # NOTE(review): the fallback is built from the first *feature*
            # dict, not from the first action -- confirm this is what
            # AlwaysPredict expects.
            classifier = AlwaysPredict(tag_features[0])

        per_tag[pos_tag] = (vectorizer, classifier)
    return per_tag
def gen_svc(train_model):
    '''Fit a DictVectorizer and a one-vs-one LinearSVC on ``train_model``.

    Reads ``train_model.feature_list`` (feature dictionaries) and
    ``train_model.action_list`` (the matching targets) and returns the
    ``(vectorizer, classifier)`` pair.
    '''
    vectorizer = DictVectorizer()
    features = vectorizer.fit_transform(train_model.feature_list)

    # LinearSVC is used because, for some reason, plain SVC() seemed to
    # always suggest "Shift".
    classifier = OneVsOneClassifier(LinearSVC())
    actions = np.array(train_model.action_list)
    classifier.fit(features, actions)

    return vectorizer, classifier
def test_ovo_fit_on_list():
    # Fitting OneVsOne with the targets given as a plain list must yield the
    # same predictions as fitting with the targets given as an array.
    ovo = OneVsOneClassifier(LinearSVC(random_state=0))

    fitted_on_array = ovo.fit(iris.data, iris.target)
    prediction_from_array = fitted_on_array.predict(iris.data)

    targets_as_list = list(iris.target)
    fitted_on_list = ovo.fit(iris.data, targets_as_list)
    prediction_from_list = fitted_on_list.predict(iris.data)

    assert_array_equal(prediction_from_array, prediction_from_list)
def test_ovo_string_y():
    # String labels must survive a fit/predict round trip through OvO
    # without their encoding being altered.
    labels = np.array(['a', 'b', 'c', 'd'])
    samples = np.eye(4)

    classifier = OneVsOneClassifier(LinearSVC())
    classifier.fit(samples, labels)
    predicted = classifier.predict(samples)

    assert_array_equal(labels, predicted)
def test_ovo_string_y():
    """Check that OvO returns string labels unchanged after fit/predict."""
    labels = np.array(['a', 'b', 'c', 'd'])
    samples = np.eye(4)

    classifier = OneVsOneClassifier(LinearSVC())
    classifier.fit(samples, labels)

    assert_array_equal(labels, classifier.predict(samples))
def OneVsOne(inputs_train, inputs_valid, target_train, target_valid):
    """Fit a one-vs-one LinearSVC and score it on the validation split.

    Returns a ``(name, correct_rate)`` tuple where ``correct_rate`` is the
    percentage (0-100) of validation samples predicted correctly.
    """
    model = OneVsOneClassifier(LinearSVC(random_state=0))
    model.fit(inputs_train, np.ravel(target_train))

    predicted = model.predict(inputs_valid)
    hits = np.count_nonzero(np.ravel(target_valid) == predicted)
    n_valid = target_valid.shape[0]

    return "Multiclass One Vs One", (float(hits) / n_valid) * 100
def test_ovo_fit_predict():
    # One binary estimator is expected per unordered pair of classes.
    expected_n_estimators = n_classes * (n_classes - 1) / 2

    # A classifier which implements decision_function.
    svc_ovo = OneVsOneClassifier(LinearSVC())
    svc_ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(svc_ovo.estimators_), expected_n_estimators)

    # A classifier which implements predict_proba.
    nb_ovo = OneVsOneClassifier(MultinomialNB())
    nb_ovo.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(nb_ovo.estimators_), expected_n_estimators)
def __init__(self, estimator, n_jobs=-1, n_neighbors=18, radius=1.0,
             algorithm='auto', leaf_size=30, metric='minkowski', p=2,
             threshold=0.2, metric_params=None):
    """Set up the one-vs-one base class and a nearest-neighbours index.

    Parameters
    ----------
    estimator : estimator
        Base estimator forwarded to ``OneVsOneClassifier``.
    n_jobs : int, default=-1
        Forwarded to both ``OneVsOneClassifier`` and ``NearestNeighbors``.
    n_neighbors, radius, algorithm, leaf_size, metric, p, metric_params
        Forwarded unchanged to ``NearestNeighbors``.
    threshold : float, default=0.2
        Stored on the instance as ``self.threshold``; its use is outside
        this method.
    """
    # ``n_jobs`` is keyword-only in ``OneVsOneClassifier.__init__`` on
    # current scikit-learn releases, so pass it by name. This also works on
    # older releases, where it was positional-or-keyword.
    OneVsOneClassifier.__init__(self, estimator, n_jobs=n_jobs)
    self.nbrs = NearestNeighbors(n_neighbors=n_neighbors, radius=radius,
                                 algorithm=algorithm, leaf_size=leaf_size,
                                 metric=metric, p=p,
                                 metric_params=metric_params, n_jobs=n_jobs)
    self.n_neighbors = n_neighbors
    self.threshold = threshold
    self._fit_y = None
def test_ovo_ties2():
    # Ties must be winnable by any label, not only by the first two.
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y_ref = np.array([2, 0, 1, 2])

    # Rotate the labels so that each of the three wins the tie once.
    for shift in range(3):
        rotated = (y_ref + shift) % 3
        multi_clf = OneVsOneClassifier(Perceptron())
        fitted = multi_clf.fit(X, rotated)
        ovo_prediction = fitted.predict(X)
        assert_equal(ovo_prediction[0], shift % 3)
def fit(self, X, y):
    """Fit Gaussian process classification model

    With exactly two classes the binary Laplace-approximation estimator is
    fitted directly. With more than two classes it is wrapped in a
    one-vs-rest or one-vs-one meta-estimator, chosen by ``self.multi_class``.

    Parameters
    ----------
    X : array-like, shape = (n_samples, n_features)
        Training data

    y : array-like, shape = (n_samples,)
        Target values

    Returns
    -------
    self : returns an instance of self.

    Raises
    ------
    ValueError
        If only one distinct class is present in ``y``, or if there are more
        than two classes and ``self.multi_class`` is not recognised.
    """
    X, y = check_X_y(X, y, multi_output=False)

    binary_estimator = _BinaryGaussianProcessClassifierLaplace(
        self.kernel, self.optimizer, self.n_restarts_optimizer,
        self.max_iter_predict, self.warm_start, self.copy_X_train,
        self.random_state)
    self.base_estimator_ = binary_estimator

    self.classes_ = np.unique(y)
    self.n_classes_ = self.classes_.size
    if self.n_classes_ == 1:
        raise ValueError("GaussianProcessClassifier requires 2 or more "
                         "distinct classes; got %d class (only class %s "
                         "is present)"
                         % (self.n_classes_, self.classes_[0]))

    is_multiclass = self.n_classes_ > 2
    if is_multiclass:
        if self.multi_class == "one_vs_rest":
            wrapper = OneVsRestClassifier
        elif self.multi_class == "one_vs_one":
            wrapper = OneVsOneClassifier
        else:
            raise ValueError("Unknown multi-class mode %s"
                             % self.multi_class)
        self.base_estimator_ = wrapper(self.base_estimator_,
                                       n_jobs=self.n_jobs)

    self.base_estimator_.fit(X, y)

    if is_multiclass:
        # Average the log marginal likelihood over the binary sub-problems.
        per_estimator = [estimator.log_marginal_likelihood()
                         for estimator in self.base_estimator_.estimators_]
        self.log_marginal_likelihood_value_ = np.mean(per_estimator)
    else:
        self.log_marginal_likelihood_value_ = (
            self.base_estimator_.log_marginal_likelihood())

    return self
def test_multicluster(self):
    """One-vs-one over the binary Tilo classifier recovers three classes."""
    # A KNN similarity and a OneVsRestClassifier wrapper were tried here
    # previously; the Gaussian similarity with one-vs-one is the active setup.
    binary = BinaryTiloClassifier(PinchRatioCutStrategy(),
                                  similarity.Gaussian())
    multiclass = OneVsOneClassifier(binary)

    points = self.three_class_pts
    labels = self.three_class_labels

    # Add a small random perturbation to every coordinate before fitting.
    jitter = 0.01 * np.random.random(points.shape)
    noisy_points = points + jitter

    model = multiclass.fit(noisy_points, labels)
    predicted = model.predict(noisy_points)
    assert_array_equal(predicted, labels)
def svm(X, Y):
    """Train a one-vs-one LinearSVC on a slice of ``X``/``Y`` and print accuracy.

    Samples whose index is not a multiple of 7 are used for training; samples
    whose index is a multiple of 10 are used for testing. All values are cast
    to ``uint8``. The accuracy on the test slice is printed; nothing is
    returned.

    NOTE(review): the two index rules overlap (for example index 10 is in
    both slices), so the reported accuracy includes samples seen during
    training. The split is left as-is; confirm whether this is intended.
    """
    X_train = np.array([x for i, x in enumerate(X) if i % 7 != 0],
                       dtype=np.uint8)
    y_train = np.array([z for i, z in enumerate(Y) if i % 7 != 0],
                       dtype=np.uint8)
    X_test = np.array([x for i, x in enumerate(X) if i % 10 == 0],
                      dtype=np.uint8)
    y_test = np.array([z for i, z in enumerate(Y) if i % 10 == 0],
                      dtype=np.uint8)

    clf = OneVsOneClassifier(LinearSVC(random_state=0))
    clf.fit(X_train, y_train)

    # Predict with the classifier fitted above. The original called
    # ``rf.predict`` here, a name this function never defines, so the SVM
    # that was just trained was never evaluated.
    y_predicted = clf.predict(X_test)

    results = [prediction == truth
               for prediction, truth in zip(y_predicted, y_test)]
    accuracy = float(results.count(True)) / float(len(results))
    # Single-argument print() behaves the same on Python 2 and Python 3.
    print(accuracy)
def test_pairwise_indices():
    """Check the size of ``pairwise_indices_`` for a precomputed-kernel OvO."""
    X, y = iris.data, iris.target
    linear_kernel = np.dot(X, X.T)

    ovo = OneVsOneClassifier(svm.SVC(kernel="precomputed"))
    ovo.fit(linear_kernel, y)

    n_estimators = len(ovo.estimators_)
    n_kernel_rows = linear_kernel.shape[0]
    for idx in ovo.pairwise_indices_:
        scaled = idx.shape[0] * n_estimators / (n_estimators - 1)
        assert_equal(scaled, n_kernel_rows)
class ClassifierOvOAsFeatures:
    """
    A transformation that essentially implements a form of dimensionality
    reduction.

    It fits an SGDClassifier (default settings) inside a OneVsOneClassifier
    and exposes the resulting decision-function values as features. It is
    meant to reduce a bag-of-words feature set to a smaller set of features
    that are richer in information.
    """

    def fit(self, X, y):
        """
        `X` is expected to be an array-like or a sparse matrix.
        `y` is expected to be an array-like containing the classes to learn.

        Stores the fitted one-vs-one model on `self.classifier` and returns
        `self`.
        """
        targets = numpy.array(y)
        ovo = OneVsOneClassifier(SGDClassifier(), n_jobs=-1)
        self.classifier = ovo.fit(X, targets)
        return self

    def transform(self, X, y=None):
        """
        `X` is expected to be an array-like or a sparse matrix.

        Returns whatever `decision_function` of the fitted one-vs-one model
        returns for `X`. `y` is ignored.

        NOTE(review): the column count depends on the scikit-learn version,
        either one per class pair, (n_classes * (n_classes - 1)) / 2, or one
        per class -- confirm for the version in use.
        """
        return self.classifier.decision_function(X)
def fit(self, X, y):
    """
    `X` is expected to be an array-like or a sparse matrix.
    `y` is expected to be an array-like containing the classes to learn.

    Fits a one-vs-one wrapper around a default SGDClassifier, stores it on
    `self.classifier` and returns `self`.
    """
    targets = numpy.array(y)
    ovo = OneVsOneClassifier(SGDClassifier(), n_jobs=-1)
    self.classifier = ovo.fit(X, targets)
    return self
def trainOneVsOne2(histograms):
    """Scale histogram features and fit a one-vs-one sigmoid-kernel SVC.

    Returns a dict with the scale parameters under ``'scaleParam'`` and the
    fitted classifier under ``'svm'``.
    """
    features = convertToSvmFormatFeature(histograms)
    scaleParam = computeScaleParameters(features)
    # The return value is ignored here, exactly as in the original call.
    scaleFeatureData(features, scaleParam)

    xAll = np.array(features)
    yAll = np.array([entry['label'] for entry in histograms])

    # Base gamma is the reciprocal of the configured K.
    gammaBase = 1.0 / kmeans_conf['K']

    # Earlier experiments used LinearSVC (driven by svm_conf) and an RBF SVC
    # with C=100, gamma=10*gammaBase; the sigmoid kernel below is the active
    # configuration.
    base = sklearn.svm.SVC(C=1000, gamma=gammaBase, kernel='sigmoid')
    classifier = OneVsOneClassifier(base)
    classifier.fit(xAll, yAll)

    return {'scaleParam': scaleParam, 'svm': classifier}
def multiclassSVC(classifier, sz=2000):
    """Fit ``classifier`` one-vs-one on MNIST features and print accuracies.

    ``sz`` is forwarded to ``hw6u.load_mnist_features``. The data is split
    into 10 folds and fold 0 selects the train/test split. Train and test
    accuracy are printed twice, as in the original.
    """
    n_folds = 10
    frame = hw6u.load_mnist_features(sz)
    data = utils.pandas_to_data(frame)
    all_folds = hw3u.partition_folds(data, n_folds)
    kf_train, kf_test = dl.get_train_and_test(all_folds, 0)

    y, X = hw4u.split_truth_from_data(kf_train, replace_zeros=False)
    y, X = np.asarray(y), np.asarray(X)
    y_test, X_test = hw4u.split_truth_from_data(kf_test, replace_zeros=False)
    y_test, X_test = np.asarray(y_test), np.asarray(X_test)

    # Single-argument print() gives identical output on Python 2 and 3.
    print('Beginning analysis: {}'.format(X.shape))

    # OneVsRestClassifier(n_jobs=4) and OutputCodeClassifier were earlier
    # alternatives; one-vs-one is the active choice.
    clf = OneVsOneClassifier(classifier).fit(X, y)
    y_pred = clf.predict(X)

    report = 'train acc: {} test acc: {}'

    train_acc = accuracy_score(fix_y(y_pred), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print(report.format(train_acc, test_acc))

    # Second report: the training predictions are recomputed, not reused.
    train_acc = accuracy_score(fix_y(clf.predict(X)), fix_y(y))
    test_acc = accuracy_score(fix_y(y_test), fix_y(clf.predict(X_test)))
    print(report.format(train_acc, test_acc))
def test_ovo_ties():
    # Ties in the vote count must be resolved with the decision function
    # rather than by falling back to the smallest label.
    X = np.array([[1, 2], [2, 1], [-2, 1], [-2, -1]])
    y = np.array([2, 0, 1, 2])

    multi_clf = OneVsOneClassifier(Perceptron())
    fitted = multi_clf.fit(X, y)
    ovo_prediction = fitted.predict(X)
    ovo_decision = multi_clf.decision_function(X)

    # Classifiers are in order 0-1, 0-2, 1-2.
    # Rounding the decision values gives the vote counts; the remainder is
    # the normalized sum of confidences that disambiguates tied votes.
    vote_counts = np.round(ovo_decision)
    normalized_confidences = ovo_decision - vote_counts

    # First sample: exactly one vote per class, i.e. a tie.
    assert_array_equal(vote_counts[0, :], 1)

    # Remaining samples: no tie, so the prediction is the argmax of votes.
    winners = np.argmax(vote_counts[1:], axis=1)
    assert_array_equal(winners, ovo_prediction[1:])

    # Tied sample: the class with the highest confidence wins.
    best_by_confidence = normalized_confidences[0].argmax()
    assert_equal(ovo_prediction[0], best_by_confidence)
def learn(cat1, cat2, cat3):
    """Fit a one-vs-one SVC that separates three document categories.

    Documents from ``cat1``, ``cat2`` and ``cat3`` are labelled 0, 1 and 2
    respectively and mapped to vectors with ``MapToEvalVS`` using the IDF
    computed over all three categories.

    Returns ``[clf, IDF]``.
    """
    IDF = get_IDF([cat1, cat2, cat3])

    vectors = []
    labels = []
    for label, documents in enumerate((cat1, cat2, cat3)):
        for d in documents:
            vectors.append(MapToEvalVS(d, IDF))
            labels.append(label)

    X = np.array(vectors)
    Y = np.array(labels)

    # A plain svm.SVC and a distance-weighted KNeighborsClassifier were tried
    # here before; the one-vs-one SVC is the active choice.
    clf = OneVsOneClassifier(svm.SVC())
    clf.fit(X, Y)
    return [clf, IDF]
def test_ovo_decision_function():
    # Check that the OvO decision function has one column per class, that
    # its argmax agrees with predict(), and that rounding it reproduces the
    # vote counts rebuilt by hand from the pairwise estimators.
    n_samples = iris.data.shape[0]

    ovo_clf = OneVsOneClassifier(LinearSVC(random_state=0))
    ovo_clf.fit(iris.data, iris.target)
    decisions = ovo_clf.decision_function(iris.data)

    assert_equal(decisions.shape, (n_samples, n_classes))
    assert_array_equal(decisions.argmax(axis=1), ovo_clf.predict(iris.data))

    # Compute the votes
    votes = np.zeros((n_samples, n_classes))

    # k walks through estimators_ in the same (i, j) pair order as the
    # nested loops below.
    k = 0
    for i in range(n_classes):
        for j in range(i + 1, n_classes):
            pred = ovo_clf.estimators_[k].predict(iris.data)
            # A prediction of 0 is a vote for class i, 1 a vote for class j.
            votes[pred == 0, i] += 1
            votes[pred == 1, j] += 1
            k += 1

    # Extract votes and verify
    assert_array_equal(votes, np.round(decisions))

    for class_idx in range(n_classes):
        # For each sample and each class, there only 3 possible vote levels
        # because they are only 3 distinct class pairs thus 3 distinct
        # binary classifiers.
        # Therefore, sorting predictions based on votes would yield
        # mostly tied predictions:
        assert_true(set(votes[:, class_idx]).issubset(set([0., 1., 2.])))

        # The OVO decision function on the other hand is able to resolve
        # most of the ties on this data as it combines both the vote counts
        # and the aggregated confidence levels of the binary classifiers
        # to compute the aggregate decision function. The iris dataset
        # has 150 samples with a couple of duplicates. The OvO decisions
        # can resolve most of the ties:
        assert_greater(len(np.unique(decisions[:, class_idx])), 146)
def multiclass_SVC(X, y):
    """Compare one-vs-rest and one-vs-one LinearSVC on a random split.

    ``X`` and ``y`` are split with 35% held out for testing. Both
    meta-classifiers wrap ``LinearSVC(random_state=0)`` and are fitted on the
    same training part.

    Returns
    -------
    (one_vs_rest, one_vs_one) : tuple
        The ``score`` of each strategy on the held-out part.
    """
    from sklearn.svm import LinearSVC
    from sklearn.multiclass import OneVsRestClassifier
    from sklearn.multiclass import OneVsOneClassifier

    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20. Prefer
    # its replacement and fall back only on very old installations.
    try:
        from sklearn.model_selection import train_test_split
    except ImportError:
        from sklearn.cross_validation import train_test_split

    # first move: split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.35)

    # one-vs-rest implementation
    ovr = OneVsRestClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    # one-vs-one implementation
    ovo = OneVsOneClassifier(LinearSVC(random_state=0)).fit(X_train, y_train)

    one_vs_rest = ovr.score(X_test, y_test)
    one_vs_one = ovo.score(X_test, y_test)
    return one_vs_rest, one_vs_one
def analysis(self, testanalysis=True):
    """Train a one-vs-one LinearSVC sentiment model and run it on test data.

    Parameters
    ----------
    testanalysis : bool, default=True
        When true, data comes from ``getTrainTestData`` and the fitted model
        is passed to ``normalexecution``. Otherwise data comes from
        ``getRealData`` and the model is passed to ``writeToFile``.
    """
    if testanalysis:
        trainingdata, testdata = self.getTrainTestData()
    else:
        trainingdata, testdata = self.getRealData()

    # Key on the stripped phrase so duplicates collapse to one entry; the
    # last sentiment seen for a phrase wins.
    aDict = {}
    for value in trainingdata:
        phrase = value.Phrase
        phrase = phrase.strip()
        aDict[phrase] = value.Sentiment

    # Materialise the dict views as lists. On Python 3,
    # ``np.asarray(dict.values())`` produces a 0-d object array instead of a
    # label vector, which breaks ``fit``. On Python 2 these are already lists.
    _all_values = list(aDict.keys())
    _all_sentiments = list(aDict.values())

    # self.KFOLDTEST(np.asarray(_all_values), np.asarray(_all_sentiments))
    count_vectorizer = CountVectorizer(ngram_range=(1, 2),
                                       tokenizer=self.tokenize_data)
    count = count_vectorizer.fit_transform(_all_values)

    # self.countWordFreq(count_vectorizer, count)
    tfidf = TfidfTransformer(norm="l2", smooth_idf=False, use_idf=False)
    data = tfidf.fit_transform(count)

    classfier = OneVsOneClassifier(LinearSVC())
    classfier.fit(data, np.asarray(_all_sentiments))

    # With testanalysis=False the whole training set is treated as the real
    # data (see getRealData) and the predictions are written out to the CSV.
    if testanalysis:
        self.normalexecution(testdata, count_vectorizer, tfidf, classfier)
    else:
        self.writeToFile(testdata, count_vectorizer, tfidf, classfier)
def fit(self, X, y):
    """Check target values and fit model.

    With two classes the targets are recoded to 0/1 (1 marks the second
    entry of ``self.classes_``) and the parent ``fit`` is called. With more
    than two classes a ``OneVsOneClassifier`` wrapping this estimator is
    fitted and stored on ``self.multi_``.

    Raises
    ------
    ValueError
        If fewer than two distinct classes are present in ``y``.
    """
    self.classes_ = np.unique(y)
    n_classes = len(self.classes_)

    if n_classes < 2:
        raise ValueError("Need 2 or more classes.")

    if n_classes > 2:
        # Clear any previous wrapper before the new one is built around self.
        self.multi_ = None
        self.multi_ = OneVsOneClassifier(self)
        self.multi_.fit(X, y)
        return self

    # Binary case.
    self.t = np.zeros(y.shape)
    positive = (y == self.classes_[1])
    self.t[positive] = 1
    return super(RVC, self).fit(X, self.t)
def __init__(self, slack=1, gamma=1, kernelType='linear', gram=1):
    """Store the SVM hyper-parameters and create empty classifier state.

    ``slack`` is passed to the SVC as ``C``, ``kernelType`` as ``kernel`` and
    ``gamma`` as ``gamma``. ``gram`` is only stored on the instance here.
    """
    # Hyper-parameters.
    self.gram = gram
    self.slack = slack
    self.gamma = gamma
    self.kernelType = kernelType

    self.data = np.ones((1000, 1000))

    # Classifiers: one dict for cities, a single one-vs-one SVC for countries.
    self.cityClassifier = {}
    # TODO: Why use OneVsOne rather than OneVsRest? Wouldn't OneVsRest be
    # faster?
    country_svc = svm.SVC(kernel=self.kernelType, C=self.slack,
                          gamma=self.gamma, probability=False,
                          cache_size=1000)
    self.countryClassifier = OneVsOneClassifier(country_svc)

    self.bag = None
    self.numberOfFeatures = 0

    # Features and labels
    self.fitting_data = None
    self.predict_data = None

    self.cityPrediction = {}
    self.countryPrediction = None
    self.numberOfCityFeatures = {}
class LSVMDetector:
    """Per-subject detector built on a one-vs-one RBF SVC.

    For each subject the classifier is trained on that subject's strokes
    (label 0) against strokes taken from every other user (label 1). It is
    then scored on held-out genuine strokes and on the supplied attacker
    data.
    """
    # just the training() function changes, rest all remains same.

    # Feature columns selected from ``data`` for both genuine and imposter
    # samples. Raw strings keep the literal backslash in the ``\%`` column
    # names without relying on an invalid escape sequence.
    _FEATURE_COLUMNS = [
        "stroke duration",
        'start $x$',
        'start $y$',
        'stop $x$',
        'stop $y$',
        'length of trajectory',
        'mid-stroke pressure',
        'mid-stroke area covered',
        r'20\%-perc. pairwise velocity',
        r'50\%-perc. pairwise velocity',
        r'20\%-perc. dev. from end-to-end line',
        r'50\%-perc. dev. from end-to-end line',
        r'80\%-perc. dev. from end-to-end line',
    ]

    def __init__(self, subjects, data, attacker_data):
        """Store the inputs and initialise empty score containers.

        ``subjects`` is either a list of user ids or a single user id.
        ``attacker_data`` is indexed per subject position when ``subjects``
        is a list, and used as a whole otherwise.
        """
        self.data = data
        self.attacker = attacker_data
        self.u_scores = []
        self.i_scores = []
        self.mean_vector = []
        self.subjects = subjects
        self.fp = []

    def training(self):
        """Fit the classifier on genuine (0) versus imposter (1) samples."""
        self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))
        labels = [0] * len(self.train) + [1] * len(self.train_imposter)
        self.clf.fit(pandas.concat([self.train, self.train_imposter]), labels)

    def testing(self):
        """Store decision-function scores for genuine and imposter tests."""
        self.u_scores = self.clf.decision_function(self.test_genuine)
        self.i_scores = self.clf.decision_function(self.test_imposter)
        self.u_scores = list(self.u_scores)
        self.i_scores = list(self.i_scores)

    def _evaluate_subject(self, subject, attacker_index=None):
        """Train and test for one subject and return its ``evaluateFAR`` value.

        ``attacker_index`` selects ``self.attacker[attacker_index]`` as the
        imposter test set; ``None`` uses ``self.attacker`` as a whole.
        """
        columns = list(self._FEATURE_COLUMNS)

        genuine_user_data = self.data.loc[self.data.user_id == subject,
                                          columns]
        imposter_data = self.data.loc[self.data.user_id != subject, :]

        # Only the first 400 genuine rows are used: 200 to train, 200 to test.
        genuine_user_data = normalize_df(genuine_user_data[:400])
        self.train = genuine_user_data[:200]
        self.test_genuine = genuine_user_data[200:400]

        # Imposter training set: the last 10 rows of every other user.
        self.train_imposter = normalize_df(
            imposter_data.groupby("user_id").tail(10).loc[:, columns])

        if attacker_index is None:
            self.test_imposter = self.attacker
        else:
            self.test_imposter = self.attacker[attacker_index]

        self.training()
        self.testing()
        return evaluateFAR(self.u_scores, self.i_scores)

    def evaluate(self):
        """Return the mean ``evaluateFAR`` value over all configured subjects."""
        if isinstance(self.subjects, list):
            fpr = [self._evaluate_subject(subject, idx)
                   for idx, subject in enumerate(self.subjects)]
        else:
            fpr = [self._evaluate_subject(self.subjects)]
        return np.mean(fpr)
def training(self):
    """Fit a one-vs-one RBF SVC on genuine (0) versus imposter (1) samples."""
    self.clf = OneVsOneClassifier(SVC(kernel='rbf', gamma='auto'))

    genuine_labels = [0] * len(self.train)
    imposter_labels = [1] * len(self.train_imposter)
    labels = genuine_labels + imposter_labels

    samples = pandas.concat([self.train, self.train_imposter])
    self.clf.fit(samples, labels)
# NOTE: this fragment uses Python 2 syntax (``xrange`` and the ``print``
# statement). ``train_data`` and ``test_data`` must already be defined
# before this point.

# Labels are read as raw unsigned bytes.
train_label = np.fromfile("../../mnist/mnist_train/mnist_train_label", dtype=np.uint8)
test_label = np.fromfile("../../mnist/mnist_test/mnist_test_label", dtype=np.uint8)

# One entry per tested dimension, in the order of the loop below.
accuList = []
runtimeList = []

# trying different dimension numbers from 5 to 250
for i in xrange(5, 251, 5):
    # Dimensionality reduction to target dimension number
    # (PCA is fitted on the training data only, then applied to the test data)
    pca = PCA(n_components=i)
    train_data_reduce = pca.fit_transform(train_data)
    test_data_reduce = pca.transform(test_data)
    # Trailing comma: the Python 2 print statement omits the newline, so the
    # timing line below continues on the same output line.
    print 'dim:{}'.format(i),

    # Using LinearSVC on i-dimensional data
    clf = OneVsOneClassifier(LinearSVC(), n_jobs=20)
    train_time, test_time, accu = run_model(clf, train_data_reduce, train_label, test_data_reduce, test_label)

    # print the performance
    print 'train time:{} test time:{} accuracy:{}'.format(
        train_time, test_time, accu)
    runtimeList.append(train_time + test_time)
    accuList.append(accu)

# Plot the accuracy figure.
plt.figure()
plt.plot(np.arange(5, 251, 5), accuList)
plt.xlabel('dimension')
plt.ylabel('accuracy')
plt.savefig('pca-accu.pdf')
class RobustWeightedClassifier(BaseEstimator, ClassifierMixin):
    """Algorithm for robust classification using reweighting algorithm.

    This model uses iterative reweighting of samples to make a regression or
    classification estimator robust.

    The principle of the algorithm is to use an empirical risk minimization
    principle where the risk is estimated using a robust estimator (for
    example Huber estimator or median-of-means estimator)[1], [3]. The idea
    behind this algorithm was mentioned before in [2].
    This idea translates into an iterative algorithm where the sample_weight
    are changed at each iteration and are dependent on the sample. Informally
    the outliers should have small weight while the inliers should have big
    weight, where outliers are samples with a big loss function.

    This algorithm enjoys a non-zero breakdown-point (it can handle
    arbitrarily bad outliers). When the "mom" weighting scheme is used, k
    outliers can be tolerated. When the "Huber" weighting scheme is used,
    asymptotically the number of outliers has to be less than half the sample
    size.

    Read more in the :ref:`User Guide <robust>`.

    Parameters
    ----------

    weighting : string, default="huber"
        Weighting scheme used to make the estimator robust.
        Can be 'huber' for huber-type weights or 'mom' for median-of-means
        type weights.

    max_iter : int, default=100
        Maximum number of iterations.
        For more information, see the optimization scheme of base_estimator
        and the eta0 and burn_in parameter.

    burn_in : int, default=10
        Number of steps used without changing the learning rate.
        Can be useful to make the weight estimation better at the beginning.

    eta0 : float, default=0.01
        Constant step-size used during the burn_in period. Used only if
        burn_in>0. Can have a big effect on efficiency.

    c : float>0 or None, default=None
        Parameter used for Huber weighting procedure, used only if weightings
        is 'huber'. Measure the robustness of the weighting procedure. A small
        value of c means a more robust estimator.
        Can have a big effect on efficiency.
        If None, c is estimated at each step using half the Inter-quartile
        range, this tends to be conservative (robust).

    k : int < sample_size/2, default=0
        Parameter used for mom weighting procedure, used only if weightings
        is 'mom'. 2k+1 is the number of blocks used for median-of-means
        estimation, higher value of k means a more robust estimator.
        Can have a big effect on efficiency.
        If None, k is estimated using the number of points distant from the
        median of means of more than 2 times a robust estimate of the scale
        (using the inter-quartile range), this tends to be conservative
        (robust).

    loss : string, None or callable, default="log"
        Name of the loss used, must be the same loss as the one optimized in
        base_estimator.
        Classification losses supported : 'log', 'hinge'.
        If 'log', then the base_estimator must support predict_proba.
        Regression losses supported : 'squared_loss', .

    sgd_args : dict or None, default=None
        Arguments of the SGDClassifier base estimator. None is treated as an
        empty dict.

    multi_class : string, default="ovr"
        multi-class scheme. Can be either "ovo" for OneVsOneClassifier or
        "ovr" for OneVsRestClassifier or "binary" for binary classification.

    n_jobs : int, default=1
        number of jobs used in the multi-class meta-algorithm computation.

    tol : float or None, (default = 1e-3)
        The stopping criterion. If it is not None, training will stop when
        (loss > best_loss - tol) for n_iter_no_change consecutive epochs.

    n_iter_no_change : int, default=10
        Number of iterations with no improvement to wait before early
        stopping.

    random_state : int, RandomState instance or None, optional (default=None)
        The seed of the pseudo random number generator to use when shuffling
        the data. If int, random_state is the seed used by the random number
        generator; If RandomState instance, random_state is the random number
        generator; If None, the random number generator is the RandomState
        instance used by np.random.


    Attributes
    ----------

    classes_ : ndarray of shape (n_classes, )
        A list of class labels known to the classifier.

    coef_ : ndarray of shape (1, n_features) or (n_classes, n_features)
        Coefficient of the features in the decision function.
        Only available if multi_class = "binary"

    intercept_ : ndarray of shape (1,) or (n_classes,)
        Intercept (a.k.a. bias) added to the decision function.
        Only available if multi_class = "binary"

    n_iter_ : int
        Set to ``max_iter * len(X)`` at the end of ``fit``.

    base_estimator_ : object,
        The fitted estimator: the robust weighted estimator itself when
        multi_class = "binary", otherwise the one-vs-rest or one-vs-one
        meta-estimator wrapping it.

    weights_ : array like, length = n_sample.
        Weight of each sample at the end of the algorithm. Can be used as a
        measure of how much of an outlier a sample is.
        Only available if multi_class = "binary"

    Notes
    -----

    Often, there is a need to use RobustScaler as preprocessing.

    Examples
    --------

    >>> from sklearn_extra.robust import RobustWeightedClassifier
    >>> from sklearn.datasets import make_blobs
    >>> import numpy as np
    >>> rng = np.random.RandomState(42)
    >>> X,y = make_blobs(n_samples=100, centers=np.array([[-1, -1], [1, 1]]),
    ...                  random_state=rng)
    >>> clf=RobustWeightedClassifier()
    >>> _ = clf.fit(X, y)
    >>> score = np.mean(clf.predict(X)==y)

    References
    ----------

    [1] Guillaume Lecué, Matthieu Lerasle and Timothée Mathieu.
        "Robust classification via MOM minimization", Mach Learn 109, (2020).
        https://doi.org/10.1007/s10994-019-05863-6 (2018).
        arXiv:1808.03106

    [2] Christian Brownlees, Emilien Joly and Gábor Lugosi.
        "Empirical risk minimization for heavy-tailed losses", Ann. Statist.
        Volume 43, Number 6 (2015), 2507-2536.

    [3] Stanislav Minsker and Timothée Mathieu.
        "Excess risk bounds in robust empirical risk minimization"
        arXiv preprint (2019). arXiv:1910.07485.

    """

    def __init__(
        self,
        weighting="huber",
        max_iter=100,
        burn_in=10,
        eta0=0.01,
        c=None,
        k=0,
        loss="log",
        sgd_args=None,
        multi_class="ovr",
        n_jobs=1,
        tol=1e-3,
        n_iter_no_change=10,
        random_state=None,
    ):
        self.weighting = weighting
        self.max_iter = max_iter
        self.burn_in = burn_in
        self.eta0 = eta0
        self.c = c
        self.k = k
        self.loss = loss
        self.sgd_args = sgd_args
        self.multi_class = multi_class
        self.n_jobs = n_jobs
        self.tol = tol
        self.n_iter_no_change = n_iter_no_change
        self.random_state = random_state

    def fit(self, X, y):
        """Fit the model to data matrix X and target(s) y.

        Parameters
        ----------
        X : array-like or sparse matrix, shape (n_samples, n_features)
            The input data.

        y : array-like, shape (n_samples,) or (n_samples, n_outputs)
            The target values (class labels in classification, real numbers
            in regression).

        Returns
        -------
        self : returns an estimator trained with RobustWeightedClassifier.

        Raises
        ------
        ValueError
            If ``multi_class`` is not one of "ovr", "ovo" or "binary".
        """
        if self.sgd_args is None:
            sgd_args = {}
        else:
            sgd_args = self.sgd_args

        # Define the base estimator
        base_robust_estimator_ = _RobustWeightedEstimator(
            SGDClassifier(**sgd_args, loss=self.loss),
            weighting=self.weighting,
            loss=self.loss,
            burn_in=self.burn_in,
            c=self.c,
            k=self.k,
            eta0=self.eta0,
            max_iter=self.max_iter,
            tol=self.tol,
            n_iter_no_change=self.n_iter_no_change,
            random_state=self.random_state,
        )

        if self.multi_class == "ovr":
            self.base_estimator_ = OneVsRestClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        elif self.multi_class == "binary":
            self.base_estimator_ = base_robust_estimator_
        elif self.multi_class == "ovo":
            self.base_estimator_ = OneVsOneClassifier(
                base_robust_estimator_, n_jobs=self.n_jobs
            )
        else:
            raise ValueError("No such multiclass method implemented.")

        self.base_estimator_.fit(X, y)
        if self.multi_class == "binary":
            # Only the bare robust estimator exposes these; the multi-class
            # wrappers hold one sub-estimator per binary problem instead.
            self.weights_ = self.base_estimator_.weights_
            self.coef_ = self.base_estimator_.coef_
            self.intercept_ = self.base_estimator_.intercept_
        self.n_iter_ = self.max_iter * len(X)
        self.classes_ = self.base_estimator_.classes_
        return self

    def predict(self, X):
        """Predict using the estimator trained with RobustWeightedClassifier.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)
            The input data.

        Returns
        -------
        y : array-like, shape (n_samples, n_outputs)
            The predicted values.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.predict(X)

    def _check_proba(self):
        """Raise AttributeError unless the configured loss is "log"."""
        if self.loss != "log":
            raise AttributeError(
                "Probability estimates are not available for"
                " loss=%r" % self.loss
            )

    @property
    def predict_proba(self):
        """
        Probability estimates when binary classification.

        Exposed as a property so that accessing it raises AttributeError
        (making ``hasattr(est, "predict_proba")`` false) when the loss does
        not support probabilities.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Vector to be scored, where `n_samples` is the number of samples
            and `n_features` is the number of features.

        Returns
        -------
        T : array-like of shape (n_samples, n_classes)
            Returns the probability of the sample for each class in the
            model,
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        self._check_proba()
        return self._predict_proba

    def _predict_proba(self, X):
        """Delegate to ``base_estimator_.predict_proba``."""
        return self.base_estimator_.predict_proba(X)

    @property
    def _estimator_type(self):
        # The original returned ``self.base_estimator._estimator_type``, but
        # this class never defines ``base_estimator``, so the lookup always
        # raised AttributeError. That hid the "classifier" marker inherited
        # from ClassifierMixin. Report the type directly instead.
        return "classifier"

    def score(self, X, y=None):
        """Returns the score on the given data, using
        ``base_estimator_.score``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Input data, where n_samples is the number of samples and
            n_features is the number of features.

        y : array-like of shape (n_samples, n_output) or (n_samples,), optional
            Target relative to X for classification or regression;
            None for unsupervised learning.

        Returns
        -------
        score : float
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.score(X, y)

    def decision_function(self, X):
        """Return ``base_estimator_.decision_function(X)``.

        Parameters
        ----------
        X : {array-like, sparse matrix}, shape (n_samples, n_features)

        Returns
        -------
        array
            Decision values per element in X, as produced by the fitted
            base estimator.
        """
        check_is_fitted(self, attributes=["base_estimator_"])
        return self.base_estimator_.decision_function(X)
def test_ovo_partial_fit_predict():
    """Check that incremental and one-shot OvO fitting give the same predictions.

    Two scenarios are exercised on the iris data: shuffled data split into
    two mini-batches, and unshuffled data where the first mini-batch does
    not contain every target class.
    """
    X, y = shuffle(iris.data, iris.target)
    first_batch, second_batch = slice(None, 100), slice(100, None)

    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(X[first_batch], y[first_batch], np.unique(y))
    incremental.partial_fit(X[second_batch], y[second_batch])
    incremental_pred = incremental.predict(X)

    one_shot = OneVsOneClassifier(MultinomialNB())
    one_shot.fit(X, y)
    one_shot_pred = one_shot.predict(X)

    # One binary estimator per unordered pair of classes.
    expected_pairs = n_classes * (n_classes - 1) / 2
    assert_equal(len(incremental.estimators_), expected_pairs)
    assert_greater(np.mean(y == incremental_pred), 0.65)
    assert_almost_equal(incremental_pred, one_shot_pred)

    # Test when mini-batches don't have all target classes
    data, target = iris.data, iris.target
    all_labels = np.unique(target)

    incremental = OneVsOneClassifier(MultinomialNB())
    incremental.partial_fit(data[:60], target[:60], all_labels)
    incremental.partial_fit(data[60:], target[60:])
    incremental_pred = incremental.predict(data)

    one_shot = OneVsOneClassifier(MultinomialNB())
    one_shot_pred = one_shot.fit(data, target).predict(data)

    assert_almost_equal(incremental_pred, one_shot_pred)
    assert_equal(len(incremental.estimators_), len(np.unique(target)))
    assert_greater(np.mean(target == incremental_pred), 0.65)
def all_classifier_models():
    """Fit a catalogue of scikit-learn style models and collect their accuracies.

    Every model is fitted on the module-level ``X_train``/``y_train`` and
    evaluated on ``X_test``/``y_test``.  A model that raises at any step is
    reported on stdout and left out of all three results.

    Returns
    -------
    metrix : list of [name, train_accuracy, test_accuracy]
        Train accuracy is a percentage rounded to 2 decimals; test accuracy
        is an unrounded percentage.
    test_accuracy : list of float
        Test accuracy (percentage) of each model that ran successfully.
    names : list of str
        Names of the successful models, aligned with ``test_accuracy``.
    """
    stacking_members = [
        ('rf', RandomForestClassifier(n_estimators=10, random_state=42)),
        ('svr', make_pipeline(StandardScaler(), LinearSVC(random_state=42))),
    ]
    voting_members = [
        ('lr', LogisticRegression(multi_class='multinomial', random_state=1)),
        ('rf', RandomForestClassifier(n_estimators=50, random_state=1)),
        ('gnb', GaussianNB()),
    ]
    models = [
        ('LogisticRegression',
         LogisticRegression(solver='liblinear', multi_class='ovr')),
        ('LinearDiscriminantAnalysis', LinearDiscriminantAnalysis()),
        ('KNeighborsClassifier', KNeighborsClassifier()),
        ('DecisionTreeClassifier', DecisionTreeClassifier()),
        ('GaussianNB', GaussianNB()),
        ('RandomForestClassifier', RandomForestClassifier(n_estimators=100)),
        ('SVM', SVC(gamma='auto')),
        ('Linear_SVM', LinearSVC()),
        ('XGB', XGBClassifier()),
        ('SGD', SGDClassifier()),
        ('Perceptron', Perceptron()),
        ('ExtraTreeClassifier', ExtraTreeClassifier()),
        ('OneClassSVM', OneClassSVM(gamma='auto')),
        ('NuSVC', NuSVC()),
        ('MLPClassifier',
         MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)),
        ('RadiusNeighborsClassifier', RadiusNeighborsClassifier(radius=2.0)),
        ('OutputCodeClassifier',
         OutputCodeClassifier(estimator=RandomForestClassifier(random_state=0),
                              random_state=0)),
        ('OneVsOneClassifier',
         OneVsOneClassifier(estimator=RandomForestClassifier(random_state=1))),
        ('OneVsRestClassifier',
         OneVsRestClassifier(estimator=RandomForestClassifier(random_state=1))),
        ('LogisticRegressionCV', LogisticRegressionCV()),
        ('RidgeClassifierCV', RidgeClassifierCV()),
        ('RidgeClassifier', RidgeClassifier()),
        ('PassiveAggressiveClassifier', PassiveAggressiveClassifier()),
        ('GaussianProcessClassifier', GaussianProcessClassifier()),
        ('HistGradientBoostingClassifier', HistGradientBoostingClassifier()),
        ('StackingClassifier',
         StackingClassifier(estimators=stacking_members,
                            final_estimator=LogisticRegression())),
        ('VotingClassifier',
         VotingClassifier(estimators=voting_members, voting='hard')),
        ('AdaBoostClassifier', AdaBoostClassifier()),
        ('GradientBoostingClassifier', GradientBoostingClassifier()),
        ('BaggingClassifier', BaggingClassifier()),
        ('ExtraTreesClassifier', ExtraTreesClassifier()),
        ('CategoricalNB', CategoricalNB()),
        ('ComplementNB', ComplementNB()),
        ('BernoulliNB', BernoulliNB()),
        ('MultinomialNB', MultinomialNB()),
        ('CalibratedClassifierCV', CalibratedClassifierCV()),
        ('LabelPropagation', LabelPropagation()),
        ('LabelSpreading', LabelSpreading()),
        ('NearestCentroid', NearestCentroid()),
        ('QuadraticDiscriminantAnalysis', QuadraticDiscriminantAnalysis()),
        ('GaussianMixture', GaussianMixture()),
        ('BayesianGaussianMixture', BayesianGaussianMixture()),
    ]

    metrix = []
    c_report = []  # per-model text reports; collected but not returned
    test_accuracy = []
    names = []
    for name, model in models:
        try:
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            train_acc = round(model.score(X_train, y_train) * 100, 2)
            test_acc = metrics.accuracy_score(y_test, y_pred) * 100
            report = classification_report(y_test, y_pred)
        except Exception:
            # Best effort: several entries (outlier detectors, mixtures, ...)
            # are not classifiers and are expected to fail.  Catching
            # Exception rather than everything lets Ctrl-C and SystemExit
            # still stop the run.
            print("Exception Occurred :", name)
            continue
        # Only record a model once every step has succeeded, so the three
        # result lists always stay aligned.
        c_report.append(report)
        test_accuracy.append(test_acc)
        names.append(name)
        metrix.append([name, train_acc, test_acc])
    return metrix, test_accuracy, names
def test_ovo_exceptions():
    """Calling predict on a never-fitted OvO classifier must raise ValueError."""
    unfitted = OneVsOneClassifier(LinearSVC(random_state=0))
    empty_input = []
    assert_raises(ValueError, unfitted.predict, empty_input)
#Um versos o resto/todo (One versus Rest/All) from sklearn.multiclass import OneVsRestClassifier from sklearn.svm import LinearSVC #0 => 0 e 1 => 1,2, LinearSVC vai falar que eh do tipo 0 ou do resto (38%, resto 62%) #0 => 0,2 e 1 => 1 LinearSVC vai falar que eh do tipo 1 ou do resto (44%, resto 56%) #0 => 0,1 e 2 => 2, LinearSVC vai falar que eh do tipo 0 ou do resto (20%, restp80%) modeloOneVsRest = OneVsRestClassifier(LinearSVC(random_state=0)) resultadoOneVsRest = fit_and_predict(modeloOneVsRest, treino_dados, treino_marcacoes, teste_dados, teste_marcacoes, "One Vs Rest Classifier") resultados[resultadoOneVsRest] = modeloOneVsRest #OneVsOne from sklearn.multiclass import OneVsOneClassifier modeloOneVsOne = OneVsOneClassifier(LinearSVC(random_state=0)) resultadoOneVsOne = fit_and_predict(modeloOneVsOne, treino_dados, treino_marcacoes, teste_dados, teste_marcacoes, "One Vs One") resultados[resultadoOneVsOne] = modeloOneVsOne #importa o algoritmo de classidicacao MultinomialNB from sklearn.naive_bayes import MultinomialNB #atribui o algoritmo MultinomialNB a variavel chamada modelo modeloMultinomial = MultinomialNB() resultadoMultinomial = fit_and_predict(modeloMultinomial, treino_dados, treino_marcacoes, teste_dados, teste_marcacoes, "MultinomialNB") resultados[resultadoMultinomial] = modeloMultinomial #Algotimo que encontra a melhor possibilidade de combinacoes das caracteristicas dos dados de treino
# Split the samples: the leading 80% train the models, the remainder is
# held back for validation.
training_percentage = 0.8
len_training = int(training_percentage * len(Y))
len_validation = len(Y) - len_training
training_data, validation_data = X[:len_training], X[len_training:]
training_marker, validation_marker = Y[:len_training], Y[len_training:]

results = {}

# Candidate models
one_vs_rest_model = OneVsRestClassifier(LinearSVC(random_state=0))
one_vs_one_model = OneVsOneClassifier(LinearSVC(random_state=0))
multinomial_model = MultinomialNB()
ada_boost_model = AdaBoostClassifier()

# Run fit_and_predict on each model in turn and store its result by name
# (the AdaBoost model is created above but not evaluated here).
for _name, _model in (
        ('OneVsRestClassifier', one_vs_rest_model),
        ('OneVsOneClassifier', one_vs_one_model),
        ('MultinomialNB', multinomial_model),
):
    results[_name] = fit_and_predict(_name, _model, training_data, training_marker, 10)
def _ask(prompt):
    """Show *prompt* and return the reply as an int, or None if it is not one."""
    try:
        # input() returns text on Python 3; the menus compare against ints.
        return int(input(prompt))
    except (TypeError, ValueError):
        return None


def _bail(message):
    """Print *message* and terminate the program."""
    print(message)
    sys.exit()


def _split_features(frame):
    """Return (features, labels): drop the first two and last columns; the last is the label."""
    features = frame.drop(
        [frame.columns[0], frame.columns[1], frame.columns[-1]],
        axis=1).apply(pd.to_numeric, errors='ignore')
    return features, pd.Series(frame.iloc[:, -1])


def _choose_classifier(kind, method):
    """Run the classifier menu and return (menu number, unfitted model)."""
    classifier = _ask(
        'classifier: [1: decision tree, 2: extra tree, 3: extra trees, 4: k nearest neighbor, 5: naive bayes, 6: radius neighbors, 7: random forest, 8: support vector machine, 9: gradient boosting, 10: gaussian process, 11: stochastic gradient descent, 12: passive aggressive, 13: nearest centroid, 14: perceptron, 15: multi-layer perceptron, 16: ada boost] '
    )
    if classifier == 1:
        criteria = {1: 'gini', 2: 'entropy'}
        criterion = _ask('criterion: [1: gini, 2: entropy] ')
        if criterion not in criteria:
            _bail('no criterion chosen')
        print(kind, method, classifier, criterion)
        return classifier, DecisionTreeClassifier(criterion=criteria[criterion])
    if classifier == 4:
        neighbours = {1: 1, 2: 3, 3: 5}
        n = _ask('n: [1: 1, 2: 3: 3: 5] ')
        if n not in neighbours:
            _bail('no n chosen')
        print(kind, method, classifier, n)
        return classifier, KNeighborsClassifier(n_neighbors=neighbours[n])
    if classifier == 5:
        versions = {1: GaussianNB, 2: BernoulliNB, 3: MultinomialNB,
                    4: ComplementNB}
        version = _ask(
            'version: [1: gaussian, 2: bernoulli, 3: multinomial, 4: complement] '
        )
        if version not in versions:
            _bail('no version chosen')
        print(kind, method, classifier, version)
        return classifier, versions[version]()

    # Classifiers that need no follow-up question; built lazily on selection.
    factories = {
        2: lambda: ExtraTreeClassifier(),
        3: lambda: ExtraTreesClassifier(),
        6: lambda: RadiusNeighborsClassifier(radius=1.0),
        7: lambda: RandomForestClassifier(n_estimators=50, random_state=1),
        8: lambda: LinearSVC(multi_class='crammer_singer'),
        9: lambda: GradientBoostingClassifier(),
        10: lambda: GaussianProcessClassifier(multi_class='one_vs_one'),
        11: lambda: SGDClassifier(),
        12: lambda: PassiveAggressiveClassifier(),
        13: lambda: NearestCentroid(),
        14: lambda: Perceptron(tol=1e-3, random_state=0),
        15: lambda: MLPClassifier(),
        16: lambda: AdaBoostClassifier(n_estimators=100),
    }
    if classifier not in factories:
        _bail('no classifier chosen')
    print(kind, method, classifier)
    return classifier, factories[classifier]()


def _choose_regressor(kind, method):
    """Run the regressor menu and return the selected unfitted model."""
    regressor = _ask(
        'regressor: [1: linear discriminant analysis, 2: logistic regression, 3: ridge regression, 4: quadratic discriminant analysis, 5: linear regression, 6: decision tree regression, 7: pls regression, 8: pls canonical, 9: canonical correlation analysis, 10: lasso, 11: multi-task lasso, 12: elastic net, 13: multi-task elastic net, 14: least angle regression, 15: least angle regression lasso, 16: orthogonal matching pursuit, 17: bayesian ridge, 18: automatic relevence determination, 19: theil sen regression, 20: huber regressor, 21: random sample consensus] '
    )
    # Regressors 5 and 6 are wrapped in a multi-class strategy.
    wrapped = {5: LinearRegression, 6: DecisionTreeRegressor}
    if regressor in wrapped:
        strategies = {1: OneVsRestClassifier, 2: OneVsOneClassifier}
        strategy = _ask('strategy: [1: one vs rest, 2: one vs one] ')
        if strategy not in strategies:
            _bail('no strategy selected')
        print(kind, method, strategy, regressor)
        return strategies[strategy](wrapped[regressor]())

    factories = {
        1: lambda: LinearDiscriminantAnalysis(),
        2: lambda: LogisticRegression(solver='lbfgs',
                                      multi_class='multinomial'),
        3: lambda: RidgeClassifier(),
        4: lambda: QuadraticDiscriminantAnalysis(),
        7: lambda: PLSRegression(n_components=2),
        8: lambda: PLSCanonical(n_components=2),
        9: lambda: CCA(n_components=1),
        10: lambda: Lasso(alpha=0.1),
        11: lambda: MultiTaskLasso(alpha=0.1),
        12: lambda: ElasticNet(random_state=0),
        13: lambda: MultiTaskElasticNet(random_state=0),
        14: lambda: Lars(n_nonzero_coefs=1),
        15: lambda: LassoLars(alpha=.1),
        16: lambda: OrthogonalMatchingPursuit(),
        17: lambda: BayesianRidge(),
        18: lambda: ARDRegression(),
        19: lambda: TheilSenRegressor(random_state=0),
        20: lambda: HuberRegressor(),
        21: lambda: RANSACRegressor(random_state=0),
    }
    if regressor not in factories:
        _bail('no regressor chosen')
    print(kind, method, regressor)
    return factories[regressor]()


def _report_classification(filename, y_test, predictions):
    """Write the per-sample comparison to *filename* and print summary metrics."""
    labels = ['RightTroll', 'LeftTroll', 'Other']
    accuracy_line = 'accuracy: {:7.2f}%'.format(
        100 * accuracy_score(y_test, predictions))
    with open(filename, 'w') as output:
        # One record per line; the header names exactly the three columns
        # written below.
        output.write('{:10}\t{:10}\t{:10}\n'.format(
            'actual', 'predict', 'match?'))
        for actual, predicted in zip(y_test, predictions):
            match = bool(actual == predicted)
            output.write('{:10}\t{:10}\t{:10}\n'.format(
                actual, predicted, match))
        output.write(accuracy_line + '\n')
    print(accuracy_line)
    # Pass labels together with target_names so every row of the report is
    # named after the class it really describes.
    print(classification_report(
        y_test, predictions, labels=labels, target_names=labels))
    print(confusion_matrix(y_test, predictions, labels=labels))


def _print_matches(y_test, predictions):
    """Print one actual/predicted/match row per test sample, then the accuracy."""
    hits = 0
    for actual, predicted in zip(y_test, predictions):
        match = bool(actual == predicted)
        hits += match
        print('{:10}\t{:10}\t{:10}'.format(actual, predicted, match))
    total = len(predictions)
    accuracy = 100.0 * hits / total if total else 0.0
    print('accuracy = {:7.2f}%'.format(accuracy))


def main():
    """Interactively pick a model, fit it on the training CSV and evaluate it.

    Command line: ``./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]``.
    In both CSV files the first two columns are dropped, the last column is
    used as the label and the remaining columns are the features.  The model
    is chosen through numbered prompts on stdin; an unrecognised answer
    prints a message and exits.

    The supervised classification path writes a per-sample table to
    ``<type>,<method>,<classifier>.txt`` and prints accuracy, a
    classification report and a confusion matrix.  The other paths print a
    per-sample table and the accuracy to stdout, except nearest neighbours,
    which prints the neighbour distances and indices.
    """
    # Checks for correct number of arguments
    if len(sys.argv) != 3:
        print(
            'usage: ./troll_identifier.py [TRAIN DATASET] [TEST/DEV DATASET]')
        sys.exit()

    # set up dataset
    data_train = pd.read_csv(sys.argv[1])
    data_test = pd.read_csv(sys.argv[2])
    print('train: {}'.format(sys.argv[1]))
    print('test: {}'.format(sys.argv[2]))
    x_train, y_train = _split_features(data_train)
    x_test, y_test = _split_features(data_test)

    header = '{:10}\t{:10}\t{:10}'.format('actual', 'predict', 'match?')

    kind = _ask('type: [1: supervised, 2: semi-supervised, 3: unsupervised] ')
    if kind == 1:
        method = _ask('method: [1: classification, 2: regression] ')
        if method == 1:
            classifier, model = _choose_classifier(kind, method)
            # train the model using the training sets and check score
            model.fit(x_train, y_train)
            model.score(x_train, y_train)
            predictions = pd.Series(model.predict(x_test))
            filename = '{},{},{}.txt'.format(kind, method, classifier)
            _report_classification(filename, y_test, predictions)
        elif method == 2:
            model = _choose_regressor(kind, method)
            model.fit(x_train, y_train)
            model.score(x_train, y_train)
            predictions = pd.Series(model.predict(x_test))
            print(header)
            _print_matches(y_test, predictions)
        else:
            _bail('no method chosen')
    elif kind == 2:
        classifier = _ask(
            'classifier: [1: label propagation, 2: label spreading] ')
        semi_supervised = {1: LabelPropagation, 2: LabelSpreading}
        if classifier not in semi_supervised:
            _bail('no classifier chosen')
        print(kind, classifier)
        model = semi_supervised[classifier]()
        model.fit(x_train, y_train)
        model.score(x_train, y_train)
        predictions = pd.Series(model.predict(x_test))
        print(header)
        _print_matches(y_test, predictions)
    elif kind == 3:
        method = _ask(
            'method: [1: clustering, 2: random trees embedding, 3: nearest neighbors] '
        )
        if method == 1:
            clusterer = _ask('clustere: [1: k means]')
            if clusterer != 1:
                _bail('no clusterer chosen')
            clusters = _ask('clusters: [1: 1, 2: 2, 3: 3] ')
            if clusters not in (1, 2, 3):
                _bail('no clusters chosen')
            print(kind, method, clusters)
            model = KMeans(n_clusters=clusters, random_state=0)
            model.fit(x_train)
            predictions = model.predict(x_test)
            print(header)
            # format() instead of '+': the centres are an array, not a str.
            print('centroids: {}'.format(model.cluster_centers_))
        elif method == 2:
            model = RandomTreesEmbedding()
            model.fit(x_train)
            predictions = model.apply(x_test)
            print(header)
        elif method == 3:
            model = NearestNeighbors(n_neighbors=2, algorithm='ball_tree')
            model.fit(x_train)
            distances, indices = model.kneighbors(x_test)
            # A neighbour search yields no label predictions, so there is no
            # accuracy to compute; show the raw result instead.
            print('distances: {}'.format(distances))
            print('indices: {}'.format(indices))
            return
        else:
            _bail('no method chosen')
        _print_matches(y_test, predictions)
    else:
        _bail('no type chosen')
from sklearn import svm, datasets
from sklearn.multiclass import OneVsOneClassifier

# Instantiate SVC() (this instance is replaced further down before any use)
clf = svm.SVC()

# Load the iris data set
iris = datasets.load_iris()
print(iris)
X = iris.data
y = iris.target

# Build the multi-class classifier on top of a binary LinearSVC
clf = OneVsOneClassifier(svm.LinearSVC(random_state=0))

# Train the model, then predict the very samples it was trained on
y_pred = clf.fit(X, y).predict(X)

# Report the number of correct and incorrect predictions
print('预测正确的个数:%d,预测错误的个数:%d' % ((y == y_pred).sum(), (y != y_pred).sum()))
# Load the data: "clss" is the target, the descriptive text columns are
# excluded from the features.
df = pd.read_csv("foods.csv")
Y = df["clss"].values
X = df.drop(columns=["brands", "countries", "product_name", "clss"])

X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.25, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# One-vs-all: per the original author's note, sklearn notices that several
# classes are to be predicted and therefore uses the one-vs-all method
# (OneVsRestClassifier) by default.  See the docs for solver/multi_class.
# NOTE(review): with solver='lbfgs' and multi_class='auto' this may actually
# be a multinomial fit - confirm against the LogisticRegression docs.
model = LogisticRegression(solver='lbfgs', multi_class='auto')
model.fit(X_train, Y_train)
print(model.score(X_test, Y_test))

# One-vs-one
model = OneVsOneClassifier(
    LogisticRegression(solver='lbfgs', multi_class='auto'))
model.fit(X_train, Y_train)
print(model.score(X_test, Y_test))
class GaussianProcessClassifier(BaseEstimator, ClassifierMixin):
    """Gaussian process classification (GPC) based on Laplace approximation.

    The implementation is based on Algorithm 3.1, 3.2, and 5.1 of
    ``Gaussian Processes for Machine Learning'' (GPML) by Rasmussen and
    Williams.

    Internally, the Laplace approximation is used for approximating the
    non-Gaussian posterior by a Gaussian.

    Currently, the implementation is restricted to using the logistic link
    function. For multi-class classification, several binary one-versus rest
    classifiers are fitted. Note that this class thus does not implement
    a true multi-class Laplace approximation.

    Parameters
    ----------
    kernel : kernel object
        The kernel specifying the covariance function of the GP. If None is
        passed, the kernel "1.0 * RBF(1.0)" is used as default. Note that
        the kernel's hyperparameters are optimized during fitting.

    optimizer : string or callable, optional (default: "fmin_l_bfgs_b")
        Can either be one of the internally supported optimizers for optimizing
        the kernel's parameters, specified by a string, or an externally
        defined optimizer passed as a callable. If a callable is passed, it
        must have the signature::

            def optimizer(obj_func, initial_theta, bounds):
                # * 'obj_func' is the objective function to be maximized, which
                #   takes the hyperparameters theta as parameter and an
                #   optional flag eval_gradient, which determines if the
                #   gradient is returned additionally to the function value
                # * 'initial_theta': the initial value for theta, which can be
                #   used by local optimizers
                # * 'bounds': the bounds on the values of theta
                ....
                # Returned are the best found hyperparameters theta and
                # the corresponding value of the target function.
                return theta_opt, func_min

        Per default, the 'fmin_l_bfgs_b' algorithm from scipy.optimize
        is used. If None is passed, the kernel's parameters are kept fixed.
        Available internal optimizers are::

            'fmin_l_bfgs_b'

    n_restarts_optimizer: int, optional (default: 0)
        The number of restarts of the optimizer for finding the kernel's
        parameters which maximize the log-marginal likelihood. The first run
        of the optimizer is performed from the kernel's initial parameters,
        the remaining ones (if any) from thetas sampled log-uniform randomly
        from the space of allowed theta-values. If greater than 0, all bounds
        must be finite. Note that n_restarts_optimizer=0 implies that one
        run is performed.

    max_iter_predict: int, optional (default: 100)
        The maximum number of iterations in Newton's method for approximating
        the posterior during predict. Smaller values will reduce computation
        time at the cost of worse results.

    warm_start : bool, optional (default: False)
        If warm-starts are enabled, the solution of the last Newton iteration
        on the Laplace approximation of the posterior mode is used as
        initialization for the next call of _posterior_mode(). This can speed
        up convergence when _posterior_mode is called several times on similar
        problems as in hyperparameter optimization.

    copy_X_train : bool, optional (default: True)
        If True, a persistent copy of the training data is stored in the
        object. Otherwise, just a reference to the training data is stored,
        which might cause predictions to change if the data is modified
        externally.

    random_state : integer or numpy.RandomState, optional
        The generator used to initialize the centers. If an integer is
        given, it fixes the seed. Defaults to the global numpy random
        number generator.

    multi_class: string, default: "one_vs_rest"
        Specifies how multi-class classification problems are handled.
        Supported are "one_vs_rest" and "one_vs_one". In "one_vs_rest",
        one binary Gaussian process classifier is fitted for each class, which
        is trained to separate this class from the rest. In "one_vs_one", one
        binary Gaussian process classifier is fitted for each pair of classes,
        which is trained to separate these two classes. The predictions of
        these binary predictors are combined into multi-class predictions.
        Note that "one_vs_one" does not support predicting probability
        estimates.

    n_jobs : int, optional, default: 1
        The number of jobs to use for the computation. If -1 all CPUs are used.
        If 1 is given, no parallel computing code is used at all, which is
        useful for debugging. For n_jobs below -1, (n_cpus + 1 + n_jobs) are
        used. Thus for n_jobs = -2, all CPUs but one are used.

    Attributes
    ----------
    kernel_ : kernel object
        The kernel used for prediction. In case of binary classification,
        the structure of the kernel is the same as the one passed as parameter
        but with optimized hyperparameters. In case of multi-class
        classification, a CompoundKernel is returned which consists of the
        different kernels used in the one-versus-rest classifiers.

    log_marginal_likelihood_value_: float
        The log-marginal-likelihood of self.kernel_.theta

    classes_ : array-like, shape = (n_classes,)
        Unique class labels.

    n_classes_ : int
        The number of classes in the training data
    """

    def __init__(self, kernel=None, optimizer="fmin_l_bfgs_b",
                 n_restarts_optimizer=0, max_iter_predict=100,
                 warm_start=False, copy_X_train=True, random_state=None,
                 multi_class="one_vs_rest", n_jobs=1):
        self.kernel = kernel
        self.optimizer = optimizer
        self.n_restarts_optimizer = n_restarts_optimizer
        self.max_iter_predict = max_iter_predict
        self.warm_start = warm_start
        self.copy_X_train = copy_X_train
        self.random_state = random_state
        self.multi_class = multi_class
        self.n_jobs = n_jobs

    def fit(self, X, y):
        """Fit Gaussian process classification model

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)
            Training data

        y : array-like, shape = (n_samples,)
            Target values. At least two distinct classes are required; more
            than two are handled according to ``multi_class``.

        Returns
        -------
        self : returns an instance of self.

        Raises
        ------
        ValueError
            If ``y`` holds a single class, or ``multi_class`` is unknown
            while more than two classes are present.
        """
        X, y = check_X_y(X, y, multi_output=False)

        self.base_estimator_ = _BinaryGaussianProcessClassifierLaplace(
            self.kernel, self.optimizer, self.n_restarts_optimizer,
            self.max_iter_predict, self.warm_start, self.copy_X_train,
            self.random_state)

        self.classes_ = np.unique(y)
        self.n_classes_ = self.classes_.size
        if self.n_classes_ == 1:
            raise ValueError("GaussianProcessClassifier requires 2 or more "
                             "distinct classes. Only class %s present."
                             % self.classes_[0])
        if self.n_classes_ > 2:
            # Wrap the binary classifier in the requested multi-class scheme.
            if self.multi_class == "one_vs_rest":
                self.base_estimator_ = \
                    OneVsRestClassifier(self.base_estimator_,
                                        n_jobs=self.n_jobs)
            elif self.multi_class == "one_vs_one":
                self.base_estimator_ = \
                    OneVsOneClassifier(self.base_estimator_,
                                       n_jobs=self.n_jobs)
            else:
                raise ValueError("Unknown multi-class mode %s"
                                 % self.multi_class)

        self.base_estimator_.fit(X, y)

        if self.n_classes_ > 2:
            # Multi-class: report the mean over all fitted binary estimators.
            self.log_marginal_likelihood_value_ = np.mean([
                estimator.log_marginal_likelihood()
                for estimator in self.base_estimator_.estimators_
            ])
        else:
            self.log_marginal_likelihood_value_ = \
                self.base_estimator_.log_marginal_likelihood()

        return self

    def predict(self, X):
        """Perform classification on an array of test vectors X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array, shape = (n_samples,)
            Predicted target values for X, values are from classes_
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        X = check_array(X)
        return self.base_estimator_.predict(X)

    def predict_proba(self, X):
        """Return probability estimates for the test vector X.

        Parameters
        ----------
        X : array-like, shape = (n_samples, n_features)

        Returns
        -------
        C : array-like, shape = (n_samples, n_classes)
            Returns the probability of the samples for each class in
            the model. The columns correspond to the classes in sorted
            order, as they appear in the attribute `classes_`.

        Raises
        ------
        ValueError
            If more than two classes were fitted in "one_vs_one" mode.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])
        if self.n_classes_ > 2 and self.multi_class == "one_vs_one":
            raise ValueError("one_vs_one multi-class mode does not support "
                             "predicting probability estimates. Use "
                             "one_vs_rest mode instead.")
        X = check_array(X)
        return self.base_estimator_.predict_proba(X)

    @property
    def kernel_(self):
        """Fitted kernel; a CompoundKernel of the binary kernels if multi-class."""
        if self.n_classes_ == 2:
            return self.base_estimator_.kernel_
        else:
            return CompoundKernel([
                estimator.kernel_
                for estimator in self.base_estimator_.estimators_
            ])

    def log_marginal_likelihood(self, theta=None, eval_gradient=False):
        """Returns log-marginal likelihood of theta for training data.

        In the case of multi-class classification, the mean log-marginal
        likelihood of the fitted binary classifiers is returned.

        Parameters
        ----------
        theta : array-like, shape = (n_kernel_params,) or none
            Kernel hyperparameters for which the log-marginal likelihood is
            evaluated. In the case of multi-class classification, theta may
            be the hyperparameters of the compound kernel or of an individual
            kernel. In the latter case, all individual kernel get assigned the
            same theta values. If None, the precomputed log_marginal_likelihood
            of self.kernel_.theta is returned.

        eval_gradient : bool, default: False
            If True, the gradient of the log-marginal likelihood with respect
            to the kernel hyperparameters at position theta is returned
            additionally. Note that gradient computation is not supported
            for non-binary classification. If True, theta must not be None.

        Returns
        -------
        log_likelihood : float
            Log-marginal likelihood of theta for training data.

        log_likelihood_gradient : array, shape = (n_kernel_params,), optional
            Gradient of the log-marginal likelihood with respect to the kernel
            hyperparameters at position theta.
            Only returned when eval_gradient is True.

        Raises
        ------
        ValueError
            If eval_gradient is True while theta is None, or theta has
            neither the size of one kernel nor of the compound kernel.
        NotImplementedError
            If eval_gradient is True for a multi-class model.
        """
        check_is_fitted(self, ["classes_", "n_classes_"])

        if theta is None:
            if eval_gradient:
                raise ValueError(
                    "Gradient can only be evaluated for theta!=None")
            return self.log_marginal_likelihood_value_

        theta = np.asarray(theta)
        if self.n_classes_ == 2:
            return self.base_estimator_.log_marginal_likelihood(
                theta, eval_gradient)

        if eval_gradient:
            raise NotImplementedError(
                "Gradient of log-marginal-likelhood not implemented for "
                "multi-class GPC.")
        estimators = self.base_estimator_.estimators_
        n_dims = estimators[0].kernel_.n_dims
        # The compound kernel concatenates one theta per fitted binary
        # estimator.  That count equals the number of classes for
        # one_vs_rest but is n_classes * (n_classes - 1) / 2 for one_vs_one,
        # so size the check by the estimators rather than by the classes.
        compound_size = n_dims * len(estimators)
        if theta.shape[0] == n_dims:  # use same theta for all sub-kernels
            return np.mean([
                estimator.log_marginal_likelihood(theta)
                for estimator in estimators
            ])
        elif theta.shape[0] == compound_size:
            # theta for compound kernel
            return np.mean([
                estimator.log_marginal_likelihood(
                    theta[n_dims * i:n_dims * (i + 1)])
                for i, estimator in enumerate(estimators)
            ])
        else:
            raise ValueError(
                "Shape of theta must be either %d or %d. "
                "Obtained theta with shape %d." %
                (n_dims, compound_size, theta.shape[0]))
def get_estimator(self):
    """Return a new OneVsOneClassifier wrapping a default LogisticRegression."""
    binary_learner = LogisticRegression()
    return OneVsOneClassifier(binary_learner)
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from matplotlib import pyplot
from collections import Counter

# Load the pre-cleaned splits and the feature-only prediction set.
train_data = pd.read_csv("train_data_clean.csv", header=0)
validation_data = pd.read_csv("validation_data_clean.csv", header=0)
test_data = pd.read_csv("test_data_clean.csv", header=0)
pred_data = pd.read_csv("ElectionsData_Pred_Features.csv", header=0)

# Train and validation rows are pooled; cross-validation below does the
# splitting instead.
train_val_list = [train_data, validation_data]
train_val_data = pd.concat(train_val_list)
features = train_val_data.drop(['label'], axis=1).values
target = train_val_data.label.values

# Out-of-fold predictions of a one-vs-one linear SVM over 30 folds.
clf = OneVsOneClassifier(LinearSVC(C=1.0, random_state=0))
pred = cross_val_predict(clf, features, target, cv=30, n_jobs=-1)
# classification_report lists its rows in sorted label order, whereas
# Series.unique() returns labels in order of first appearance; sort the
# names so each row is titled with the class it actually describes.
print(
    classification_report(
        target, pred,
        target_names=sorted(train_val_data.label.unique())))

from sklearn.ensemble import RandomForestClassifier

# Final model: a class-balanced random forest fitted on all pooled rows.
clf = RandomForestClassifier(min_samples_split=5,
                             random_state=0,
                             n_estimators=100,
                             n_jobs=-1,
                             verbose=1,
                             class_weight="balanced")
clf.fit(features, target)
def main():
    """Run the MNIST classification walkthrough as one script.

    Steps, in order: download MNIST, train a binary "is it a 5" SGD
    classifier and evaluate it (manual and built-in cross-validation,
    confusion matrix, precision/recall, ROC), then multiclass, multilabel
    and multioutput classification. Results are printed or plotted; nothing
    is returned.

    Relies on the module-level names ``np``, ``plt`` and ``matplotlib``.
    """

    def plot_digit(data):
        # Render one flattened 28x28 image without axes.
        image = data.reshape(28, 28)
        plt.imshow(image, cmap=matplotlib.cm.binary, interpolation="nearest")
        plt.axis("off")

    # import the data
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784')
    # Convert to plain arrays: fetch_openml may hand back pandas objects, and
    # everything below uses positional indexing.
    x, y = np.asarray(mnist["data"]), np.asarray(mnist["target"])
    print(x.shape)
    print(y.shape)

    # show the image
    some_digit = x[36000]
    plot_digit(some_digit)
    plt.show()

    # prepare the training/testing sets
    x_train, x_test, y_train, y_test = x[:60000], x[60000:], y[:60000], y[60000:]
    np.random.seed(3)
    shuffle_index = np.random.permutation(60000)
    x_train, y_train = x_train[shuffle_index], y_train[shuffle_index]

    # Binary classifier. Labels are compared as strings ('5'), as loaded.
    y_train_5 = (y_train == '5')  # True for all 5s
    y_test_5 = (y_test == '5')

    from sklearn.linear_model import SGDClassifier
    sgd_clf = SGDClassifier(random_state=42)
    sgd_clf.fit(x_train, y_train_5)
    print(sgd_clf.predict([some_digit]))

    # hand-rolled cross-validation
    from sklearn.model_selection import StratifiedKFold
    from sklearn.base import clone
    # random_state only has meaning together with shuffle=True; passing it
    # alone is rejected by current scikit-learn.
    skfolds = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    for train_index, test_index in skfolds.split(x_train, y_train_5):
        clone_clf = clone(sgd_clf)
        # train the clone on the training folds, then predict on the test fold
        x_train_folds = x_train[train_index]
        y_train_folds = y_train_5[train_index]
        x_test_fold = x_train[test_index]
        y_test_fold = y_train_5[test_index]
        clone_clf.fit(x_train_folds, y_train_folds)
        y_pred = clone_clf.predict(x_test_fold)
        n_correct = sum(y_pred == y_test_fold)
        print(n_correct / len(y_pred))

    # Evaluate the model with 'accuracy'. Results get their own names so the
    # imported metric functions stay callable further down.
    from sklearn.model_selection import cross_val_score
    sgd_5_scores = cross_val_score(sgd_clf, x_train, y_train_5, cv=3, scoring="accuracy")
    print(sgd_5_scores)

    # baseline: a classifier that never predicts "5"
    from sklearn.base import BaseEstimator

    class Never5Classifier(BaseEstimator):
        def fit(self, x, y=None):
            return self

        def predict(self, x):
            return np.zeros((len(x), 1), dtype=bool)

    never_5_clf = Never5Classifier()
    never_5_clf_score = cross_val_score(never_5_clf, x_train, y_train_5, cv=3, scoring="accuracy")
    print(never_5_clf_score)

    # evaluate the model with a confusion matrix
    from sklearn.model_selection import cross_val_predict
    from sklearn.metrics import confusion_matrix
    y_train_pred = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3)
    conf_mx_5 = confusion_matrix(y_train_5, y_train_pred)
    print(conf_mx_5)

    # precision and recall
    from sklearn.metrics import precision_score, recall_score, f1_score
    precision = precision_score(y_train_5, y_train_pred)
    recall = recall_score(y_train_5, y_train_pred)
    f1 = f1_score(y_train_5, y_train_pred)
    print(precision)
    print(recall)
    print(f1)  # f1 score is the harmonic mean of precision and recall

    # precision vs recall trade-off
    from sklearn.metrics import precision_recall_curve

    def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
        # precision_recall_curve returns one more precision/recall value than
        # thresholds, hence the [:-1].
        plt.plot(thresholds, precisions[:-1], "b--", label="precision")
        plt.plot(thresholds, recalls[:-1], "g-", label="recall")
        plt.xlabel("Threshold", fontsize=16)
        plt.legend(loc="upper left", fontsize=16)
        plt.ylim([0, 1])

    def plot_precision_vs_recall(precisions, recalls):
        plt.plot(recalls, precisions, "b-", linewidth=2)
        plt.xlabel("recall", fontsize=16)
        plt.ylabel("precision", fontsize=16)
        plt.axis([0, 1, 0, 1])

    # Scores must come from the binary "is 5" target: they are compared with
    # y_train_5 in every curve below.
    y_scores = cross_val_predict(sgd_clf, x_train, y_train_5, cv=3,
                                 method="decision_function")
    if y_scores.ndim == 2:
        # guard kept from the original for a 2-D result; keep the
        # positive-class column
        y_scores = y_scores[:, 1]
    precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_scores)
    plot_precision_recall_vs_threshold(precisions, recalls, thresholds)
    plot_precision_vs_recall(precisions, recalls)
    plt.show()

    # manually set the decision threshold
    y_train_pred_90 = (y_scores > 70000)
    precision_90 = precision_score(y_train_5, y_train_pred_90)
    recall_90 = recall_score(y_train_5, y_train_pred_90)
    print("precision_score=", precision_90)
    print("recall_score=", recall_90)

    # ROC curve
    from sklearn.metrics import roc_curve
    fpr, tpr, thresholds = roc_curve(y_train_5, y_scores)

    def plot_roc_curve(fpr, tpr, label=None):
        plt.plot(fpr, tpr, linewidth=2, label=label)
        plt.plot([0, 1], [0, 1], 'k--')
        plt.axis([0, 1, 0, 1])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')

    plot_roc_curve(fpr, tpr)
    plt.show()

    from sklearn.metrics import roc_auc_score
    from sklearn.ensemble import RandomForestClassifier
    forest_clf = RandomForestClassifier(random_state=42)
    # the forest has no decision_function, so use class probabilities
    y_probas_forest = cross_val_predict(forest_clf, x_train, y_train_5, cv=3,
                                        method="predict_proba")
    y_scores_forest = y_probas_forest[:, 1]  # probability of the positive class
    fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_5, y_scores_forest)
    plt.plot(fpr, tpr, "b:", label="SGD")
    plot_roc_curve(fpr_forest, tpr_forest, "random Forest")
    plt.legend(loc="lower right")
    plt.show()
    forest_auc = roc_auc_score(y_train_5, y_scores_forest)
    print(forest_auc)

    # Multiclass classification
    sgd_clf.fit(x_train, y_train)  # train on all ten classes
    print(sgd_clf.predict([some_digit]))
    some_digit_score = sgd_clf.decision_function([some_digit])  # one score per class
    print(some_digit_score)

    # OvO classifier
    from sklearn.multiclass import OneVsOneClassifier
    ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
    ovo_clf.fit(x_train, y_train)
    print(ovo_clf.predict([some_digit]))

    forest_clf.fit(x_train, y_train)
    print(forest_clf.predict_proba([some_digit]))

    sgd_clf_score = cross_val_score(sgd_clf, x_train, y_train, cv=3, scoring="accuracy")
    print(sgd_clf_score)  # multiclass accuracy, since the target is y_train

    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(x_train.astype(np.float64))
    sgd_clf_score = cross_val_score(sgd_clf, x_train_scaled, y_train, cv=3,
                                    scoring="accuracy")
    print(sgd_clf_score)  # compare with the unscaled scores printed above

    # error analysis
    y_train_pred = cross_val_predict(sgd_clf, x_train_scaled, y_train, cv=3)
    conf_mx = confusion_matrix(y_train, y_train_pred)  # rows: actual, columns: predicted
    print(conf_mx)
    plt.matshow(conf_mx, cmap=plt.cm.gray)
    plt.show()

    row_sums = conf_mx.sum(axis=1, keepdims=True)
    norm_conf_mx = conf_mx / row_sums  # error counts -> error rates
    np.fill_diagonal(norm_conf_mx, 0)  # keep only the errors
    plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
    plt.show()

    # multilabel classification
    from sklearn.neighbors import KNeighborsClassifier
    # Labels are strings (see the == '5' comparisons above); the numeric
    # tests below need integers.
    y_train_digits = y_train.astype(np.uint8)
    y_train_large = (y_train_digits >= 7)
    y_train_odd = (y_train_digits % 2 == 1)
    y_multilabel = np.c_[y_train_large, y_train_odd]

    knn_clf = KNeighborsClassifier()
    knn_clf.fit(x_train, y_multilabel)
    print(knn_clf.predict([some_digit]))

    # multioutput classification: add noise, learn to recover the clean image
    import numpy.random as rnd
    # randint takes the output shape as a single tuple; each noise array is
    # sized for the set it is added to.
    noise1 = rnd.randint(0, 100, (len(x_train), 784))
    noise2 = rnd.randint(0, 100, (len(x_test), 784))
    x_train_mod = x_train + noise1
    x_test_mod = x_test + noise2
    y_train_mod = x_train
    y_test_mod = x_test

    knn_clf.fit(x_train_mod, y_train_mod)
    clean_digit = knn_clf.predict([x_test_mod[1]])
    plot_digit(clean_digit)
    plt.show()
sessions=session, smoothing_fwhm=4, memory="nilearn_cache", memory_level=1) X = nifti_masker.fit_transform(dataset_files.func) X = X[non_rest] ### Predictor ################################################################# ### Define the prediction function to be used. # Here we use a Support Vector Classification, with a linear kernel from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest, f_classif from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier from sklearn.pipeline import Pipeline svc_ovo = OneVsOneClassifier(Pipeline([ ('anova', SelectKBest(f_classif, k=500)), ('svc', SVC(kernel='linear')) ])) svc_ova = OneVsRestClassifier(Pipeline([ ('anova', SelectKBest(f_classif, k=500)), ('svc', SVC(kernel='linear')) ])) ### Cross-validation scores ################################################### from sklearn.cross_validation import cross_val_score cv_scores_ovo = cross_val_score(svc_ovo, X, y, cv=5, verbose=True) cv_scores_ova = cross_val_score(svc_ova, X, y, cv=5, verbose=True) print 79 * "_"
# Convert string data to numerical data label_encoder = [] X_encoded = np.empty(X.shape) for i, item in enumerate(X[0]): if item.isdigit(): X_encoded[:, i] = X[:, i] else: label_encoder.append(preprocessing.LabelEncoder()) X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i]) X = X_encoded[:, :-1].astype(int) y = X_encoded[:, -1].astype(int) # Create SVM classifier classifier = OneVsOneClassifier(LinearSVC(random_state=0)) # Train the classifier classifier.fit(X, y) # Cross validation X_train, X_test, y_train, y_test = cross_validation.train_test_split( X, y, test_size=0.2, random_state=5) classifier = OneVsOneClassifier(LinearSVC(random_state=0)) classifier.fit(X_train, y_train) y_test_pred = classifier.predict(X_test) # Compute the F1 score of the SVM classifier f1 = cross_validation.cross_val_score(classifier, X, y,
# Standardise the features. The scaler is fitted on the training split only
# and then applied to both splits.
s = StandardScaler()
s.fit(X_train)
X_train = s.transform(X_train)
X_test = s.transform(X_test)

## ##################
## Part 3: One-vs-All
## ##################
# One binary logistic regression per class. 'ovr' is requested explicitly:
# with solver='lbfgs', multi_class='auto' selects the multinomial (softmax)
# formulation for multi-class targets, not one-vs-all.
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', multi_class='ovr')
model.fit(X_train, y_train)

print(model.score(X_test, y_test))

## ##################
## Part 4: One-vs-One
## ##################
# One binary logistic regression per pair of classes.
from sklearn.multiclass import OneVsOneClassifier

model = OneVsOneClassifier(LogisticRegression(solver='lbfgs'))
model.fit(X_train, y_train)

print(model.score(X_test, y_test))
def test_ovo_partial_fit_predict():
    """Check that two-batch ``partial_fit`` on OvO matches a single ``fit``.

    Also checks that a mini-batch containing a label outside the declared
    classes raises ``ValueError``, and that ``partial_fit`` is not exposed
    when the wrapped estimator is an ``SVC``.
    """

    def _incremental(data, labels, cut, classes):
        # Fit in two mini-batches split at `cut`; the class list is passed
        # on the first call only.
        model = OneVsOneClassifier(MultinomialNB())
        model.partial_fit(data[:cut], labels[:cut], classes)
        model.partial_fit(data[cut:], labels[cut:])
        return model, model.predict(data)

    def _one_shot(data, labels):
        # Reference predictions from a plain fit on everything.
        return OneVsOneClassifier(MultinomialNB()).fit(data, labels).predict(data)

    iris = datasets.load_iris()
    X, y = iris.data, iris.target

    stepwise_clf, stepwise_pred = _incremental(X, y, 100, np.unique(y))
    reference_pred = _one_shot(X, y)
    assert len(stepwise_clf.estimators_) == n_classes * (n_classes - 1) / 2
    assert np.mean(y == stepwise_pred) > 0.65
    assert_almost_equal(stepwise_pred, reference_pred)

    # Test when mini-batches have binary target classes
    stepwise_clf, stepwise_pred = _incremental(X, y, 60, np.unique(y))
    reference_pred = _one_shot(X, y)
    assert_almost_equal(stepwise_pred, reference_pred)
    assert len(stepwise_clf.estimators_) == len(np.unique(y))
    assert np.mean(y == stepwise_pred) > 0.65

    # Random features with five classes, split into two batches of seven.
    X = np.random.rand(14, 2)
    y = [1, 1, 2, 3, 3, 0, 0, 4, 4, 4, 4, 4, 2, 2]
    _, stepwise_pred = _incremental(X, y, 7, [0, 1, 2, 3, 4])
    reference_pred = _one_shot(X, y)
    assert_almost_equal(stepwise_pred, reference_pred)

    # raises error when mini-batch does not have classes from all_classes
    unfitted = OneVsOneClassifier(MultinomialNB())
    error_y = [0, 1, 2, 3, 4, 5, 2]
    known_classes = np.unique(y)
    message_re = escape("Mini-batch contains {0} while "
                        "it must be subset of {1}".format(
                            np.unique(error_y), known_classes))
    assert_raises_regexp(ValueError, message_re, unfitted.partial_fit,
                         X[:7], error_y, known_classes)

    # test partial_fit only exists if estimator has it:
    svc_wrapper = OneVsOneClassifier(SVC())
    assert not hasattr(svc_wrapper, "partial_fit")
def return_data(request):
    """Classify the posted text and render a moderation response.

    Reads ``text`` from ``request.POST``, writes it to prediction files on
    disk, and retrains classifiers from files on disk on every call:

    1. A ``LinearSVC`` on TF-IDF features of ``SVM/dataset_training``
       predicts the class of ``SVM/dataset_prediction``.
    2. If the first prediction is 0, the text is stored as a new training
       sample, and a one-vs-one ``LinearSVC`` trained on
       ``SVM_Multi/dataset_training`` is run twice. The second result,
       ``y1``, selects which message is rendered in ``output.html``.
    3. Otherwise a word-level sentiment score is computed from the AFINN
       word list. A negative score triggers the same pipeline as step 2;
       any other score appends the text to ``SVM/data_print`` and redirects
       to the form page.

    NOTE(review): the nesting below was reconstructed from a source whose
    indentation was lost. Confirm which ``if`` each ``else`` belongs to, and
    whether the ``sent_score < 0`` check sits inside the per-tweet loop.
    NOTE(review): class 0 presumably means "bullying", judging by the
    ``bully`` directory name; verify against the training data layout.
    """
    clf = sklearn.svm.LinearSVC()
    training_files = sklearn.datasets.load_files(
        "/../../../SVM/dataset_training")
    # Persist the posted text as the single document to classify.
    f = open("/../../../SVM/dataset_prediction/test/lol.txt", 'w')
    fil = open("/../../../SVM/data_print", 'a')
    fil.write("\n")
    text = request.POST.get('text')
    fil.close()
    # NOTE(review): POST.get returns None when 'text' is missing, which
    # would make this write fail.
    f.write(text)
    f.close()
    predict_files = sklearn.datasets.load_files(
        "/../../../SVM/dataset_prediction")
    vectorizer = TfidfVectorizer(encoding='utf-8')
    # NOTE(review): the generator variable `f` shadows the file handle above,
    # and the files opened inside the generator are never closed explicitly.
    X_t = vectorizer.fit_transform(
        (open(f).read() for f in training_files.filenames))
    assert sp.issparse(X_t)
    X_p = vectorizer.transform(
        (open(f).read() for f in predict_files.filenames))
    clf.fit(X_t, training_files.target)
    y_predicted = ""
    y_predicted = clf.predict(X_p)
    if y_predicted[0] == 0:
        f1 = open("/../../../SVM_Multi/dataset_prediction/test/lol.txt", 'w')
        f1.write(text)
        f1.close()
        # cn becomes 1 when the text already appears as a line of pande.txt.
        cn = 0
        with open("/../../../SVM/pande.txt") as f:
            for line in f:
                if (text == line.strip("\n")):
                    cn = 1
        if (cn == 0):
            # Unseen text: add it to the binary training set under a random
            # file name and record it in pande.txt.
            num = random.randint(0, 100000000)
            fl = open(
                "/../../../SVM/dataset_training/bully/" + str(num) + ".txt",
                'w')
            fl.write(text)
            fl.close()
            f3 = open("/../../../SVM/pande.txt", 'a')
            f3.write("\n" + text)
            f3.close()
        fl = open("/../../../SVM_Multi/dataset_prediction/lol.txt", 'w')
        fl.write(text)
        fl.close()
        # First multi-class pass: train on SVM_Multi, predict the document
        # under SVM/dataset_prediction.
        clf = sklearn.svm.LinearSVC()
        training_files = sklearn.datasets.load_files(
            "/../../../SVM_Multi/dataset_training")
        predict_files = sklearn.datasets.load_files(
            "/../../../SVM/dataset_prediction")
        vectorizer = TfidfVectorizer(encoding='utf-8')
        X_t = vectorizer.fit_transform(
            (open(f).read() for f in training_files.filenames))
        assert sp.issparse(X_t)
        X_p = vectorizer.transform(
            (open(f).read() for f in predict_files.filenames))
        y = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
            X_t, training_files.target).predict(X_p)
        # Store the text as a new training sample in the directory for the
        # predicted class (class k -> directory k+1).
        if (y[0] == 0):
            num = random.randint(0, 100000000)
            fl = open(
                "/../../../SVM_Multi/dataset_training/1/" + str(num) + ".txt",
                'w')
            fl.write(text)
            fl.close()
        elif (y[0] == 1):
            num = random.randint(0, 100000000)
            fl = open(
                "/../../../SVM_Multi/dataset_training/2/" + str(num) + ".txt",
                'w')
            fl.write(text)
            fl.close()
        elif (y[0] == 2):
            num = random.randint(0, 100000000)
            fl = open(
                "/../../../SVM_Multi/dataset_training/3/" + str(num) + ".txt",
                'w')
            fl.write(text)
            fl.close()
        # Remove an editor backup file from the prediction directory.
        os.system("rm /../../../SVM_Multi/dataset_prediction/test/lol.txt~")
        # Second multi-class pass: retrain (now including the sample just
        # stored) and predict the documents under SVM_Multi/dataset_prediction.
        clf = sklearn.svm.LinearSVC()
        training_files = sklearn.datasets.load_files(
            "/../../../SVM_Multi/dataset_training")
        predict_files = sklearn.datasets.load_files(
            "/../../../SVM_Multi/dataset_prediction")
        vectorizer = TfidfVectorizer(encoding='utf-8')
        X_t = vectorizer.fit_transform(
            (open(f).read() for f in training_files.filenames))
        assert sp.issparse(X_t)
        X_p = vectorizer.transform(
            (open(f).read() for f in predict_files.filenames))
        y1 = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
            X_t, training_files.target).predict(X_p)
        # NOTE(review): y1 is the result of predict(), not a scalar; these
        # comparisons presumably rely on it holding exactly one prediction.
        if y1 == 0:
            fil = open("/home/ubuntu/Desktop/SVM/optional_data_print", 'a')
            fil.write(text)
            fil.write("\n")
            fil.close()
            return render(
                request, 'output.html', {
                    'pred':
                    "100 friends will view this post. Our system has detected harmful content which might hurt the users sentiments.Are you sure you want to post this ?",
                    'val': True,
                    'text': text,
                    'l': False
                })
        elif y1 == 1:
            return render(
                request, 'output.html', {
                    'pred':
                    "You have been temporarily banned till the moderator checks this post.Our system has detected harmful content which might hurt the users sentiments. You cannot post another message until then. You can still continue to surf. You will be redirected to depression chat room for online help. ",
                    'val': False,
                    'l': True
                })
        elif y1 == 2:
            return render(
                request, 'output.html', {
                    'pred':
                    "Our system has detected some very harmful content in your post which might hurt the users sentiments. Keeping this in mind your posting privileges have been suspended for a week . You cannot post another message until then. You can still continue to surf. Repeated posting of such highly offensive content will lead to a report being generated and sent to the concerned authorities. You will be redirected to depression chat room for online help.",
                    'val': False,
                    'l': True
                })
    else:
        # The binary classifier did not predict class 0: fall back to a
        # lexicon-based sentiment score.
        tweets = tweet_dict("/../../../SVM/dataset_prediction/test/lol.txt")
        sentiment = sentiment_dict("/../../../SentiNet/AFINN-111.txt")
        for index in range(len(tweets)):
            tweet_word = tweets[index].split()
            sent_score = 0  # sentiment score of the sentence
            for word in tweet_word:
                # Strip trailing punctuation and newlines before the lookup.
                word = word.rstrip('?:!.,;"!@')
                word = word.replace("\n", "")
                if not (word.encode('utf-8', 'ignore') == ""):
                    if word.encode('utf-8') in sentiment.keys():
                        sent_score = sent_score + float(sentiment[word])
            if (sent_score < 0):
                # Negative sentiment: same store / retrain / respond
                # pipeline as the class-0 branch above.
                f1 = open("/../../../SVM_Multi/dataset_prediction/test/lol.txt", 'w')
                f1.write(text)
                f1.close()
                cn = 0
                with open("/../../../SVM/pande.txt") as f:
                    for line in f:
                        if (text == line.strip("\n")):
                            cn = 1
                if (cn == 0):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM/dataset_training/bully/" + str(num) + ".txt",
                        'w')
                    fl.write(text)
                    fl.close()
                    f3 = open("/../../../SVM/pande.txt", 'a')
                    f3.write("\n" + text)
                    f3.close()
                fl = open("/../../../SVM_Multi/dataset_prediction/lol.txt", 'w')
                fl.write(text)
                fl.close()
                clf = sklearn.svm.LinearSVC()
                training_files = sklearn.datasets.load_files(
                    "/../../../SVM_Multi/dataset_training")
                predict_files = sklearn.datasets.load_files(
                    "/../../../SVM/dataset_prediction")
                vectorizer = TfidfVectorizer(encoding='utf-8')
                X_t = vectorizer.fit_transform(
                    (open(f).read() for f in training_files.filenames))
                assert sp.issparse(X_t)
                X_p = vectorizer.transform(
                    (open(f).read() for f in predict_files.filenames))
                y = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
                    X_t, training_files.target).predict(X_p)
                if (y[0] == 0):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/1/" + str(num) + ".txt",
                        'w')
                    fl.write(text)
                    fl.close()
                elif (y[0] == 1):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/2/" + str(num) + ".txt",
                        'w')
                    fl.write(text)
                    fl.close()
                elif (y[0] == 2):
                    num = random.randint(0, 100000000)
                    fl = open(
                        "/../../../SVM_Multi/dataset_training/3/" + str(num) + ".txt",
                        'w')
                    fl.write(text)
                    fl.close()
                os.system(
                    "rm /../../../SVM_Multi/dataset_prediction/test/lol.txt~")
                clf = sklearn.svm.LinearSVC()
                training_files = sklearn.datasets.load_files(
                    "/../../../SVM_Multi/dataset_training")
                predict_files = sklearn.datasets.load_files(
                    "/../../../SVM_Multi/dataset_prediction")
                vectorizer = TfidfVectorizer(encoding='utf-8')
                X_t = vectorizer.fit_transform(
                    (open(f).read() for f in training_files.filenames))
                assert sp.issparse(X_t)
                X_p = vectorizer.transform(
                    (open(f).read() for f in predict_files.filenames))
                y1 = OneVsOneClassifier(LinearSVC(random_state=0)).fit(
                    X_t, training_files.target).predict(X_p)
                if y1 == 0:
                    fil = open("/../../../SVM/optional_data_print", 'a')
                    fil.write(text)
                    fil.write("\n")
                    fil.close()
                    return render(
                        request, 'output.html', {
                            'pred':
                            "100 friends will view this post. Our system has detected harmful content which might hurt the users sentiments.Are you sure you want to post this ?",
                            'val': True,
                            'text': text,
                            'l': False
                        })
                elif y1 == 1:
                    return render(
                        request, 'output.html', {
                            'pred':
                            "You have been temporarily banned till the moderator checks this post.Our system has detected harmful content which might hurt the users sentiments. You cannot post another message until then. You can still continue to surf. You will be redirected to depression chat room for online help.",
                            'val': False,
                            'l': True
                        })
                elif y1 == 2:
                    return render(
                        request, 'output.html', {
                            'pred':
                            "Our system has detected some very harmful content in your post which might hurt the users sentiments. Keeping this in mind your posting privileges have been suspended for a week . You cannot post another message until then. You can still continue to surf. Repeated posting of such highly offensive content will lead to a report being generated and sent to the concerned authorities. You will be redirected to depression chat room for online help.",
                            'val': False,
                            'l': True
                        })
            else:
                # Non-negative sentiment: log the text and send the user
                # back to the form.
                fil = open("/../../../SVM/data_print", 'a')
                fil.write(text)
                fil.close()
                return HttpResponseRedirect("http://127.0.0.1:8000/home/form/")
memory="nilearn_cache", memory_level=1) X = nifti_masker.fit_transform(dataset_files.func) X = X[non_rest] ### Predictor ################################################################# ### Define the prediction function to be used. # Here we use a Support Vector Classification, with a linear kernel from sklearn.svm import SVC from sklearn.feature_selection import SelectKBest, f_classif from sklearn.multiclass import OneVsOneClassifier, OneVsRestClassifier from sklearn.pipeline import Pipeline svc_ovo = OneVsOneClassifier( Pipeline([('anova', SelectKBest(f_classif, k=500)), ('svc', SVC(kernel='linear'))])) svc_ova = OneVsRestClassifier( Pipeline([('anova', SelectKBest(f_classif, k=500)), ('svc', SVC(kernel='linear'))])) ### Cross-validation scores ################################################### from sklearn.cross_validation import cross_val_score cv_scores_ovo = cross_val_score(svc_ovo, X, y, cv=5, verbose=True) cv_scores_ova = cross_val_score(svc_ova, X, y, cv=5, verbose=True) print 79 * "_" print 'OvO', cv_scores_ovo.mean()
avg / total 0.77 0.77 0.76 3313 [[ 296 0 6 78 2 0 14] [ 5 17 18 19 0 0 9] [ 13 1 314 34 9 0 87] [ 45 5 49 1281 23 2 86] [ 8 3 22 57 87 2 27] [ 0 1 10 15 2 7 28] [ 6 0 4 72 3 1 545]] ''' from sklearn.multiclass import OneVsOneClassifier from sklearn.svm import LinearSVC Model = build_and_evaluate(X=data['text'], y=data['class'], classifier=OneVsOneClassifier(LinearSVC()), ngram_range=(1, 3), test_size=0.4) #%% ''' Building for evaluation Classification Report: precision recall f1-score support course 0.80 0.66 0.73 384 department 0.55 0.15 0.24 73 faculty 0.71 0.63 0.67 430 other 0.80 0.86 0.82 1511 project 0.69 0.42 0.52 205 staff 0.44 0.07 0.12 55
'misc.forsale', 'soc.religion.christian' ]), print '==================================================================\n' print 'Confusion Matrix:' print '===================' print metrics.confusion_matrix(target, predicted) print '===================\n' print 'Total Accuracy: ' print np.mean(target == predicted) clf_list = [ OneVsOneClassifier(GaussianNB()), OneVsOneClassifier(svm.LinearSVC()), OneVsRestClassifier(GaussianNB()), OneVsRestClassifier(svm.LinearSVC()) ] clf_name = [ 'One vs One Classifier - Naive Bayes', 'One vs One Classifier - SVM', 'One vs Rest Classifier - Naive Bayes', 'One vs Rest Classifier - SVM' ] # perform classification for clf, clf_n in zip(clf_list, clf_name): pound_sign = '' spaces = '' for i in range(len(clf_n) + 2): pound_sign += '#'
y_pred_gnb, name="Multiclass Gaussian Naive Bayes", average='weighted') gnb_cm = confusion_matrix( y_test, y_pred_gnb) # Multiclass Gaussian Naive Bayes confusion matrix plt.figure() plot_confusion_matrix(gnb_cm, classes=class_names, title='Multiclass Gaussian Naive Bayes Confusion Matrix' ) # Multiclass Gaussian Naive Bayes roc curve params = { 'estimator__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000] } # parameters to try for best gamma in Linear SVC svm_one = OneVsOneClassifier( LinearSVC(random_state=42)) # One v One SVM classifier clf_one = GridSearchCV(svm_one, params, cv=5, scoring='accuracy') # grid search to find best gamma y_pred_one = clf_one.fit(X_train_LSI, y_train).best_estimator_.predict(X_test_LSI) print(clf_one.best_estimator_) print_classifier_metrics(y_test, y_pred_one, name="1v1 SVM", average='weighted') one_cm = confusion_matrix(y_test, y_pred_one) # One v One SVM confusion matrix plt.figure() plot_confusion_matrix(one_cm, classes=class_names, title='1v1 SVM Confusion Matrix')
# Encode the raw table column by column. The first row decides whether a
# column is copied as-is (all-digit string) or label-encoded.
label_encoder = []
X_encoded = np.empty(X.shape)
for i, item in enumerate(X[0]):
    if item.isdigit():
        X_encoded[:, i] = X[:, i]
        continue
    # One encoder per non-numeric column, kept in column order.
    label_encoder.append(preprocessing.LabelEncoder())
    X_encoded[:, i] = label_encoder[-1].fit_transform(X[:, i])

# The last column is the target; everything before it is a feature.
y = X_encoded[:, -1].astype(int)
X = X_encoded[:, :-1].astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=5)

# One-vs-one linear SVM, fitted on the training split and used to predict
# the held-out split.
classifier = OneVsOneClassifier(LinearSVC(random_state=0))
classifier.fit(X_train, y_train)
y_test_pred = classifier.predict(X_test)

# Weighted F1 from 3-fold cross-validation over the full data set.
f1 = cross_val_score(classifier, X, y, scoring='f1_weighted', cv=3)
print("F1 score: %s%%" % round(100 * f1.mean(), 2))