def experimentCombinatorialCrossValidation(self, trainset, testset):
    """Grid-search ``gamma`` and ``C`` for the RBF classifier with cross-validation.

    For every (gamma, C) pair a fresh RBF classifier is fitted once per
    fold returned by ``DataService().cross_validation_split`` and scored
    with binary F1 on that fold. The rounded mean over the folds is stored
    per pair, and the classifier belonging to the best mean is finally
    evaluated on the held-out test split (macro F1 is printed).

    Args:
        trainset: Corpus location handed to ``DataService().read_corpus``.
        testset: Unused; the test split is carved out of ``trainset``.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)

    dev_sets = DataService().cross_validation_split(x_train, y_train)

    best_accuracy = -inf
    best_classifier = None
    # Keyed by the (C, gamma) tuple; a list cannot be used as a dict key.
    cv_results = {}

    for gamma in arange(0.5, 1.4, 0.15):
        for C in arange(0.5, 2.1, 0.25):
            print("\nProcessing Gamma:", gamma, "C:", C)
            fold_scores = []

            for fold in dev_sets:
                clf = SVM().construct_rbf_classifier(kernel='rbf',
                                                     gamma=gamma,
                                                     C=C)

                # Copies are passed so the helper cannot alter the folds
                # that are still being iterated over.
                union_set = DataService().construct_union_set(
                    fold.copy(), dev_sets.copy())

                # fit on the rest of the data
                clf.fit(union_set[0], union_set[1])

                # validate on the current fold
                y_pred = clf.predict(fold[0])
                fold_scores.append(
                    f1_score(y_true=fold[1],
                             y_pred=y_pred,
                             average='binary'))

            score = round(mean(fold_scores), 3)
            cv_results[(C, gamma)] = score

            print("Average F1 score for C:", str(C) + ".", score)

            # Save the best model and use that to classify the test set.
            # NOTE: ``clf`` is the classifier fitted for the last fold of
            # this (gamma, C) pair; it is not refitted on all folds.
            if score > best_accuracy:
                best_accuracy = score
                best_classifier = clf

    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
def experimentDefaultSetting(self, trainset, testset):
    """Fit the linear classifier (C=1.0) and report dev-test metrics.

    The corpus read from ``trainset`` is vectorized and split; the
    classifier is fitted on the train split and scored (accuracy and macro
    F1) on the test half of the dev split. ``testset`` is not used.
    """
    print("Reading data")
    documents, labels = DataService().read_corpus(trainset)
    classifier = SVM().construct_classifier("linear", 1.0)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    matrix = DataService().vectorize_input(documents)
    conversion_dict, labels = DataService().labels_string_to_float(labels)

    splits = DataService().test_dev_train_split(matrix, labels)
    x_train, y_train, x_dev, y_dev, x_test, y_test = splits

    dev_splits = DataService().test_train_split(x_dev, y_dev)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = dev_splits

    fit_started = datetime.utcnow()
    print('Fitting training data on', len(x_train), 'Samples')
    classifier.fit(x_train, y_train)
    elapsed = datetime.utcnow() - fit_started
    print("Training took", elapsed.seconds, 'seconds..')

    predictions = classifier.predict(x_dev_test)

    accuracy = accuracy_score(y_true=y_dev_test, y_pred=predictions)
    print("Accuracy score:", accuracy)

    macro_f1 = f1_score(y_true=y_dev_test, y_pred=predictions,
                        average='macro')
    print("F1 score (macro):", macro_f1)
def experimentLinearKernel(self, trainset, testset):
    """Cross-validate the L2-penalised linear classifier over a range of ``C``.

    For every ``C`` a fresh linear classifier is fitted once per fold
    returned by ``DataService().cross_validation_split`` and scored with
    binary F1 on that fold. The classifier belonging to the best rounded
    mean score is finally evaluated on the held-out test split (macro F1 is
    printed).

    Args:
        trainset: Corpus location handed to ``DataService().read_corpus``.
        testset: Unused; the test split is carved out of ``trainset``.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)

    dev_sets = DataService().cross_validation_split(x_train, y_train)

    best_accuracy = -inf
    best_classifier = None
    cv_results = {}

    for C in arange(0.5, 2.25, 0.25):
        print("\nProcessing C:", C)
        fold_scores = []

        for fold in dev_sets:
            clf2 = SVM().construct_linear_classifier(penalty='l2', C=C)

            # Copies are passed so the helper cannot alter the folds that
            # are still being iterated over.
            union_set = DataService().construct_union_set(
                fold.copy(), dev_sets.copy())

            # fit on the rest of the data
            clf2.fit(union_set[0], union_set[1])

            # validate on the current fold
            y_pred = clf2.predict(fold[0])
            fold_scores.append(
                f1_score(y_true=fold[1], y_pred=y_pred, average='binary'))

        cv_results[C] = mean(fold_scores)

        # The model-selection score must come from the scores that were
        # actually collected; averaging a never-filled list left
        # ``best_classifier`` unset.
        score = round(cv_results[C], 3)
        print("Average F1 score for C:", str(C) + ".", score)

        # Save the best model and use that to classify the test set.
        # NOTE: ``clf2`` is the classifier fitted for the last fold of this
        # ``C``; it is not refitted on all folds.
        if score > best_accuracy:
            best_accuracy = score
            best_classifier = clf2

    y_pred = best_classifier.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
class CrossValidation():
    """Train each configured algorithm and record its cross-validated accuracy.

    Every entry of ``classificationAlgorithms`` must expose
    ``run(dataset, X_train, X_test, y_train, y_test)`` returning a sequence
    whose first item is the algorithm name and whose second item is the
    fitted model.
    """

    # Instantiated once, when the class body is executed, and shared by all
    # ``CrossValidation`` instances.
    classificationAlgorithms = [
        logisticRegression(),
        RandomForest(),
        SVM(),
        AdaBoost(),
        XGBoost()
    ]

    def __init__(self, dataset, X_train, X_test, y_train, y_test):
        """Store the dataset and its train/test split for later runs."""
        self.ds = dataset
        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test
        # algorithm name -> mean cross-validation accuracy in percent
        self.accuracyDict = {}
        # algorithm name -> model returned by the algorithm's ``run``
        self.models = {}

    def run(self):
        """Run every algorithm, cross-validate its model and store both results."""
        for alg in self.classificationAlgorithms:
            # results[0] is the algorithm name, results[1] the model.
            results = alg.run(self.ds, self.X_train, self.X_test,
                              self.y_train, self.y_test)
            algName, model = results[0], results[1]
            self.appendToAccuracyDict(
                algName, self.kFoldCrossValidation(algName, model))
            self.appendModel(algName, model)

    def kFoldCrossValidation(self, algName, classifier, cv=300):
        """Return the mean ``cross_val_score`` of ``classifier`` on the training data.

        Args:
            algName: Label used in the printed report.
            classifier: Estimator passed to ``cross_val_score``.
            cv: Value forwarded as ``cross_val_score``'s ``cv`` argument;
                defaults to the previously hard-coded 300.

        Returns:
            The mean accuracy as a fraction (not a percentage).
        """
        accuracies = cross_val_score(estimator=classifier,
                                     X=self.X_train,
                                     y=self.y_train,
                                     cv=cv)
        accuracy = accuracies.mean()
        # A single pre-formatted string prints identically on Python 2 and
        # Python 3, unlike the former ``print`` statement.
        print('%s accuracy: %s %%' % (algName, accuracy * 100))
        return accuracy

    def appendToAccuracyDict(self, algName, accuracy):
        """Record ``accuracy`` (a fraction) for ``algName`` as a percentage."""
        self.accuracyDict[algName] = accuracy * 100

    def appendModel(self, algName, model):
        """Record the model produced by ``algName``."""
        self.models[algName] = model

    def getAccuracyDict(self):
        """Return the mapping of algorithm name to accuracy in percent."""
        return self.accuracyDict

    def getModel(self, name):
        """Return the stored model for ``name``; raises ``KeyError`` if absent."""
        return self.models[name]
def experimentBestModel(self, trainset, testset):
    """Fit the preconfigured best classifier and print macro F1 on the test split.

    The corpus read from ``trainset`` is vectorized and split; the
    classifier from ``SVM().construct_best_classifier()`` is fitted on the
    train split and evaluated on the test split.

    Args:
        trainset: Corpus location handed to ``DataService().read_corpus``.
        testset: Unused; the test split is carved out of ``trainset``.
    """
    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_best_classifier()

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)

    # Only the train and test splits are needed here. The cross-validation
    # folds and best-score bookkeeping that used to be computed at this
    # point were never read, so they are no longer built.
    x_train, y_train, _x_dev, _y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)

    clf.fit(x_train, y_train)

    y_pred = clf.predict(x_test)
    print("F1 score (macro):",
          f1_score(y_pred=y_pred, y_true=y_test, average='macro'))
from SupportVectorMachine import SVM
from sklearn.svm import SVC


def test_classifier(cls, X_train, y_train, X_test, y_test):
    """Fit ``cls``, then print how long fitting took and the test accuracy."""
    fit_started = time()
    cls.fit(X_train, y_train)
    fit_duration = time() - fit_started

    predictions = cls.predict(X_test)

    print("Time:", fit_duration)
    print("Accuracy:", accuracy_score(y_true=y_test, y_pred=predictions))


# Only the first 50 rows of the admission data are used.
data = pd.read_csv('admission.csv', index_col="Serial No.")[:50]

# The TOEFL score column is the target; everything else is a feature.
y = data.pop('TOEFL Score').to_numpy()

scaler = StandardScaler()
X = scaler.fit_transform(data.values)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=126,
)

# Project SVM implementation first ...
model = SVM()
test_classifier(model, X_train, y_train, X_test, y_test)

# ... then scikit-learn's linear-kernel SVC on the same split.
model = SVC(C=1.0, kernel='linear', tol=0.001)
test_classifier(model, X_train, y_train, X_test, y_test)
def experimentFeatures(self, trainset, testset):
    """Compare a linear classifier on all features against a trimmed feature set.

    Stage 1 fits the linear classifier on the train split and reports
    accuracy and macro F1 on the dev-test split. Its coefficients are then
    paired with the vocabulary of a ``TfidfVectorizer`` fitted on the same
    documents, and the terms with the largest and the smallest coefficients
    are kept. Stage 2 rebuilds the data from those terms via
    ``DataService().get_features_from_data``, fits a second linear
    classifier on the dev-train split and reports the same metrics.

    Args:
        trainset: Corpus location handed to ``DataService().read_corpus``.
        testset: Unused; all splits are carved out of ``trainset``.
    """
    top_k = 200  # number of terms kept from each end of the ranking
    preview = 100  # number of ranked terms echoed to stdout per sign

    def identity(doc):
        return doc

    def fit_and_report(classifier, x_fit, y_fit, x_eval, y_eval):
        # Fit, time the fit, and print accuracy / macro F1 on the eval data.
        start_time = datetime.utcnow()
        print('Fitting training data on', len(x_fit), 'Samples')
        classifier.fit(x_fit, y_fit)
        training_time = (datetime.utcnow() - start_time).seconds
        print("Training took", training_time, 'seconds..')

        y_pred = classifier.predict(x_eval)
        print("Accuracy score:",
              accuracy_score(y_pred=y_pred, y_true=y_eval))
        print("F1 score (macro):",
              f1_score(y_pred=y_pred, y_true=y_eval, average='macro'))

    print("Reading data")
    x, y = DataService().read_corpus(trainset)
    clf = SVM().construct_classifier("linear", 1.0)

    # Vectorize the text data and return an (n_samples, n_features) matrix.
    x_vec = DataService().vectorize_input(x)
    conversion_dict, y = DataService().labels_string_to_float(y)

    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)

    # The reported sample count now matches the data that is fitted.
    # NOTE(review): this stage trains on the train split while the trimmed
    # stage below trains on the dev-train split -- confirm that is intended.
    fit_and_report(clf, x_train, y_train, x_dev_test, y_dev_test)

    coef = clf.coef_

    vec = TfidfVectorizer(preprocessor=identity, tokenizer=identity)
    vec.fit(x)
    # ``get_feature_names`` was removed in scikit-learn 1.2; prefer its
    # replacement and fall back for older releases.
    if hasattr(vec, "get_feature_names_out"):
        names = vec.get_feature_names_out()
    else:
        names = vec.get_feature_names()

    # NOTE(review): assumes this vocabulary is ordered like the columns
    # produced by ``DataService().vectorize_input`` -- confirm.
    coefs_and_features = list(zip(coef[0], names))
    list_sorted_pos = sorted(coefs_and_features,
                             key=lambda pair: pair[0],
                             reverse=True)
    list_sorted_neg = sorted(coefs_and_features, key=lambda pair: pair[0])

    # Slicing (rather than indexing 0..199) copes with vocabularies that
    # hold fewer than ``top_k`` terms.
    features = [name for _, name in list_sorted_pos[:top_k]]
    features.extend(name for _, name in list_sorted_neg[:top_k])

    print("\nneg", list_sorted_neg[:preview], "\npos",
          list_sorted_pos[:preview])

    new_data = DataService().get_features_from_data(x, features)

    clf2 = SVM().construct_classifier("linear", 1.0)

    # Vectorize the trimmed data. ``y`` was already converted to floats
    # above, so it is not converted a second time.
    x_vec = DataService().vectorize_input(new_data)

    x_train, y_train, x_dev, y_dev, x_test, y_test = DataService(
    ).test_dev_train_split(x_vec, y)
    x_dev_train, y_dev_train, x_dev_test, y_dev_test = DataService(
    ).test_train_split(x_dev, y_dev)

    print("\nTRIMMED DATA SET\n----------")
    fit_and_report(clf2, x_dev_train, y_dev_train, x_dev_test, y_dev_test)
from SupportVectorMachine import SVM
import numpy as np

# Six 2-D training points: the first three carry label -1, the last three +1.
features = np.array([
    [1, 7],
    [2, 8],
    [3, 8],
    [5, 1],
    [6, -1],
    [7, 3],
])
labels = np.array([-1, -1, -1, 1, 1, 1])

clf = SVM()
clf.fit(features, labels)

# Points to classify; each one is printed next to the classifier's output.
predict_us = [[0, 10], [1, 3], [3, 4], [3, 5]]
for p in predict_us:
    prediction = clf.predict(p)
    print(p, prediction)
import pandas as pd
from SupportVectorMachine import SVM

df = pd.read_csv("../datasets/iris.data", header=None)

# First 100 rows only; column 4 holds the species name.
# 'Iris-setosa' becomes -1, every other label becomes 1.
y = df.iloc[0:100, 4].values
y = np.where(y == 'Iris-setosa', -1, 1)

"""
0 = sepal length
1 = sepal width
2 = petal length
3 = petal width
"""
# Two feature columns are used: 0 and 3.
X = df.iloc[0:100, [0, 3]].values

svm = SVM()
svm.fit(X, y)


def hyperplane(x, w, b, offset):
    """Return ``(-w[0] * x + b + offset) / w[1]``."""
    numerator = -w[0] * x + b + offset
    return numerator / w[1]


# Rows 0-49 are drawn as red circles, rows 50-99 as blue crosses.
plt.scatter(
    X[:50, 0],
    X[:50, 1],
    color='red',
    marker='o',
    label='setosa',
)
plt.scatter(
    X[50:100, 0],
    X[50:100, 1],
    color='blue',
    marker='x',
    label='versicolor',
)

# Largest value of the first feature column.
x_max = np.amax(X[:, 0])
def fit(self,X,y):
    """Build the logistic-regression and SVM models and fit both on ``X``, ``y``.

    Each model is constructed from its own alpha attribute
    (``self.lrAlpha`` / ``self.svmAlpha``) and the shared
    ``self.iterations``, and stored on ``self.lrModel`` / ``self.svmModel``.
    """
    self.lrModel = LogisticRegression(self.lrAlpha, self.iterations)
    self.svmModel = SVM(self.svmAlpha, self.iterations)

    # Fit in the same order as construction: logistic regression, then SVM.
    for model in (self.lrModel, self.svmModel):
        model.fit(X, y)