def test_binary_logistic_regression_gd():
    """Fit on the binary data with ``minibatches=1`` and check the result.

    Verifies the learned weight matrix to two decimals and that every
    training sample is predicted correctly.
    """
    expected_weights = np.array([[0.13, -0.12],
                                 [-3.07, 3.05]])
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X_bin, y_bin)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
    predicted = model.predict(X_bin)
    assert (y_bin == predicted).all()
def test_multi_logistic_regression_gd_acc():
    """Fit on the multi-class data and require perfect training predictions."""
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    predicted = model.predict(X)
    assert (y == predicted).all()
def test_multi_logistic_probas():
    """Check ``predict_proba`` on three selected rows to two decimals."""
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    # sample labels: 0, 1, 2
    sample_rows = [0, 50, 149]
    probabilities = model.predict_proba(X[sample_rows])

    expected = np.array([
        [1.0, 0.0, 0.0],
        [0.08, 0.60, 0.32],
        [0.0, 0.00, 0.99],
    ])
    np.testing.assert_almost_equal(probabilities, expected, decimal=2)
def test_score_function():
    """``score`` on the training data must return exactly 1.0."""
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    accuracy = model.score(X, y)
    # The score itself is the failure message, so a miss shows the value.
    assert accuracy == 1.0, accuracy
def test_progress_3():
    """Smoke test: a one-epoch fit with ``print_progress=3`` must not raise."""
    model = SoftmaxRegression(
        epochs=1,
        eta=0.005,
        minibatches=1,
        print_progress=3,
        random_seed=1,
    )
    # 0, 1 class
    model.fit(X_bin, y_bin)
def test_progress_2():
    """Smoke test: a one-epoch fit with ``print_progress=2`` must not raise."""
    model = SoftmaxRegression(
        epochs=1,
        eta=0.005,
        minibatches=1,
        print_progress=2,
        random_seed=1,
    )
    # 0, 1 class
    model.fit(X_bin, y_bin)
def test_multi_logistic_regression_gd_weights():
    """Fit on the multi-class data and compare ``w_`` to reference values."""
    expected_weights = np.array([
        [-0.95, -2.45, 3.4],
        [-3.95, 2.34, 1.59],
    ])
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
def test_binary_logistic_regression_sgd():
    """Fit on the binary data with one minibatch per sample.

    Verifies the learned weight matrix to two decimals and that every
    training sample is predicted correctly.
    """
    expected_weights = np.array([[-0.68, 0.68],
                                 [-3.2, 3.2]])
    n_batches = len(y_bin)
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=n_batches,
        random_seed=1,
    )
    # 0, 1 class
    model.fit(X_bin, y_bin)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
    predicted = model.predict(X_bin)
    assert (y_bin == predicted).all()
def test_binary_logistic_regression_gd():
    """Fit on the binary data with ``minibatches=1`` and check the result.

    Verifies the learned weight matrix to two decimals and that every
    training sample is predicted correctly.
    """
    expected_weights = np.array([[-0.2, 0.2],
                                 [-3.09, 3.09]])
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X_bin, y_bin)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
    predicted = model.predict(X_bin)
    assert (y_bin == predicted).all()
def test_binary_l2_regularization_gd():
    """Fit on the binary data with ``l2=1.0`` and check weights and predictions."""
    expected_weights = np.array([[-0.17, 0.17],
                                 [-2.26, 2.26]])
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        l2=1.0,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X_bin, y_bin)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
    predicted = model.predict(X_bin)
    assert (y_bin == predicted).all()
def test_binary_logistic_regression_sgd():
    """Fit on the binary data with one minibatch per sample.

    Verifies the learned weight matrix to two decimals and that every
    training sample is predicted correctly.
    """
    expected_weights = np.array([[0.13, -0.12],
                                 [-3.06, 3.05]])
    n_batches = len(y_bin)
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=n_batches,
        random_seed=1,
    )
    # 0, 1 class
    model.fit(X_bin, y_bin)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)
    predicted = model.predict(X_bin)
    assert (y_bin == predicted).all()
def test_refit_weights():
    """Check that a second ``fit`` with ``init_params=False`` moves the weights.

    The model is fitted for 100 epochs, one weight from each row of ``w_``
    is recorded, and ``fit`` is called again with ``init_params=False``.
    Both recorded weights must have changed, and the final weight matrix
    must match the reference values to two decimals.

    NOTE(review): presumably ``init_params=False`` continues training from
    the current weights rather than re-initialising them -- confirm against
    the SoftmaxRegression documentation.
    """
    t = np.array([[0.13, -0.12], [-3.07, 3.05]])
    lr = SoftmaxRegression(epochs=100, eta=0.005, minibatches=1, random_seed=1)
    lr.fit(X_bin, y_bin)

    # Snapshot one weight per row *before* refitting. The second snapshot
    # must come from row 1 because it is compared with lr.w_[1][0] below;
    # the previous code read row 0 twice, so that comparison was between
    # two unrelated weights and could never detect a stale row-1 weight.
    w1 = lr.w_[0][0]
    w2 = lr.w_[1][0]

    lr.fit(X_bin, y_bin, init_params=False)

    assert w1 != lr.w_[0][0]
    assert w2 != lr.w_[1][0]
    np.testing.assert_almost_equal(lr.w_, t, 2)
def test_multi_logistic_probas():
    """Check ``predict_proba`` on three selected rows to two decimals."""
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    # sample labels: 0, 1, 2
    sample_rows = [0, 50, 149]
    probabilities = model.predict_proba(X[sample_rows])

    expected = np.array([
        [0.99, 0.01, 0.00],
        [0.01, 0.88, 0.11],
        [0.00, 0.02, 0.98],
    ])
    np.testing.assert_almost_equal(probabilities, expected, decimal=2)
def test_binary_l2_regularization_gd():
    """Fit on the binary data with ``l2_lambda=1.0``.

    Checks the learned weights to three decimals and that the fraction of
    correctly predicted training samples equals 1.0.
    """
    model = SoftmaxRegression(
        eta=0.005,
        epochs=200,
        minibatches=1,
        l2_lambda=1.0,
        random_seed=1,
    )
    model.fit(X_bin, y_bin)
    predictions = model.predict(X_bin)

    reference = np.array([[-0.316, 0.317],
                          [-2.265, 2.265]])
    np.testing.assert_almost_equal(model.w_, reference, decimal=3)

    n_hits = sum(predictions == y_bin)
    accuracy = n_hits / len(y_bin)
    assert accuracy == 1.0
def test_binary_l2_regularization_gd():
    """Fit on the binary data with ``l2_lambda=1.0``.

    Checks the learned weights to three decimals and that the fraction of
    correctly predicted training samples equals 1.0.
    """
    settings = dict(
        eta=0.005,
        epochs=200,
        minibatches=1,
        l2_lambda=1.0,
        random_seed=1,
    )
    clf = SoftmaxRegression(**settings)
    clf.fit(X_bin, y_bin)
    predicted = clf.predict(X_bin)

    wanted = np.array([[-0.316, 0.317],
                       [-2.265, 2.265]])
    np.testing.assert_almost_equal(clf.w_, wanted, decimal=3)

    correct = sum(predicted == y_bin)
    acc = correct / len(y_bin)
    assert acc == 1.0
from sklearn import datasets
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

from mlxtend.classifier import SoftmaxRegression

# Load the iris data set; all feature columns are used.
# (Alternative kept from the original: X = iris.data[:, [2, 3]])
iris = datasets.load_iris()
X = iris.data
y = iris.target

# Hold out 30% of the samples for testing, with a fixed random state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=0,
)

############# Softmax Regression ###############
# Fit softmax regression (default hyper-parameters) to the training set.
softmax_regressor = SoftmaxRegression()
softmax_regressor.fit(X_train, y_train)

# Predict the test set and report the results.
y_pred = softmax_regressor.predict(X_test)

print("############ softmax Regression ############")
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
# Split the loaded table: the last column holds the label, every other
# column is a feature and is converted to float.
# NOTE(review): the original comments say `data` has 150 rows and 4
# feature columns -- confirm against the loader.
y = data[:, data.shape[1] - 1]
X = data[:, 0:data.shape[1] - 1].astype(float)

# The first 105 rows are used for training and the remaining rows for
# testing (45 rows if `data` has 150). The original comments claimed a
# 120/30 split, which does not match these slice bounds.
X_train = X[0:105, :]
X_test = X[105:X.shape[0], :]
y_train = y[0:105]
y_test = y[105:y.shape[0]]
del data, X, y

# Map the string labels to the integer classes 0, 1, 2.
classes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
y_train = np.asarray([classes[item] for item in y_train])
y_test = np.asarray([classes[item] for item in y_test])

# Softmax
# The learning rate was written as `1 / (10 ^ 4)`. In Python `^` is bitwise
# XOR, so that evaluated to 1 / 14 (about 0.071) rather than the intended
# 10**-4. Spell the value out explicitly.
softmax = SoftmaxRegression(eta=1e-4,
                            epochs=500,
                            minibatches=1,
                            random_seed=0,
                            print_progress=3)
softmax.fit(X_train, y_train, init_params=True)

"""
plt.plot(range(len(softmax.cost_)), softmax.cost_)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()
"""

accuracy = softmax.score(X_test, y_test)
print(accuracy)
# Names of the six cached arrays, in the order main() unpacks them.
_SPLIT_NAMES = ("X_train", "Y_train", "X_dev", "Y_dev", "X_test", "Y_test")


def _split_path(name, tag):
    """Return the cache file path of array *name* for cache tag *tag*."""
    return FOLDER + "/" + name + "_politic" + tag + ".npy"


def _load_split(tag):
    """Unpickle the six cached arrays for *tag*, ordered as _SPLIT_NAMES."""
    arrays = []
    for name in _SPLIT_NAMES:
        # NOTE(security): pickle.load executes arbitrary code from the file.
        # Only acceptable because these files are written by _save_split
        # below; never point FOLDER at untrusted data.
        with open(_split_path(name, tag), "rb") as f:
            arrays.append(pickle.load(f))
    return arrays


def _save_split(tag, arrays):
    """Pickle *arrays* (ordered as _SPLIT_NAMES) into the cache for *tag*."""
    for name, array in zip(_SPLIT_NAMES, arrays):
        with open(_split_path(name, tag), "wb") as f:
            pickle.dump(array, f)


def main():
    """Train a softmax-regression classifier on tweets and print accuracies.

    Steps:
      1. Configure the vectorizer and the SoftmaxRegression model.
      2. Read the user list; if a marker file ``update_SM<U>.txt`` exists in
         FOLDER (U = number of users), load the previously pickled
         train/dev/test arrays named by the tag stored in that file.
      3. Otherwise build the data set from ``load_data()``: shuffle, split
         into train/dev/test, vectorize, optionally standardize, then pickle
         the six arrays and write the marker file.
      4. Fit the model, print train/dev accuracy and call
         ``compute_accuracies`` for further statistics.

    Relies on names defined elsewhere in the file (FOLDER, FILE_USERS,
    setpath, get_users, get_info, load_data, standardize_data,
    compute_accuracies). Returns None.
    """
    # SETUP
    train = 0.9  # fraction of the data used for training
    dev = 0.05  # fraction of the data used for development
    test = 0.05  # fraction of the data used for test
    n_features = 1500  # may be lowered below to the actual vocabulary size

    # CountVectorizer comes from sklearn.feature_extraction.text.
    vectorizer = CountVectorizer(
        min_df=20,  # you may want to adjust this
        max_features=n_features,
        lowercase=False)

    DO_STANDARDIZE_DATA = 1  # 1 yes, 0 no
    regularization_lambda = 0.1
    ETA = 0.00005
    EPOCHS = 50

    model_sm = SoftmaxRegression(
        eta=ETA,
        epochs=EPOCHS,
        l2=regularization_lambda,
        # n_classes=U,
        minibatches=1,
        random_seed=1,
        print_progress=3)

    print("-----------------------------")
    print("METHOD - SOFTMAX REGRESSION")
    print("-----------------------------")
    print("Hello,\nwe will use Softmax Regression to classify twitter users\n")

    setpath()

    # Get the users.
    screen_names = get_users(FILE_USERS)
    info_data = get_info()
    U = len(screen_names)  # number of users
    for i, screen_name in enumerate(screen_names):
        print("For", screen_name, " one has ", info_data[i, 1], "tweets")

    # The marker file stores the tag (date + "_SM" + U) of the last saved
    # data set for this number of users.
    marker_path = FOLDER + "/update_SM" + str(U) + ".txt"

    if os.path.isfile(marker_path):
        with open(marker_path, "r") as h:
            # strip() guards against a trailing newline added by an editor,
            # which would otherwise end up inside the cache file names.
            update = h.read().strip()
        print("We load the dataset.")
        X_train, Y_train, X_dev, Y_dev, X_test, Y_test = _load_split(update)
    else:
        all_tweets = load_data()
        # One shuffle is enough; the original shuffled twice in a row.
        random.shuffle(all_tweets)

        # Column 2 of each record is the tweet text, column 0 its label.
        tweets = [record[2] for record in all_tweets]
        labels = [record[0] for record in all_tweets]
        # The original guarded this print with a length check that was
        # always true (one tweet is collected per record).
        print("We load the data and we create the data set!")
        Y = np.array(labels)  # this is the output label vector
        print("-----------------------------")

        # First carve off dev+test, then split that remainder into dev/test.
        X_train_1, x_appoggio, Y_train, y_appoggio = train_test_split(
            tweets, Y, test_size=(dev + test))
        X_dev_1, X_test_1, Y_dev, Y_test = train_test_split(
            x_appoggio, y_appoggio, test_size=(test / (dev + test)))

        print("We will train with the", train * 100, " % of the data;")
        print(dev * 100, "% of the data is reserve for the method development;")
        print(test * 100, "% of the data is for the test.")

        # The vocabulary is learned from the training tweets only.
        vectorizer.fit(X_train_1)
        X_train = vectorizer.transform(X_train_1)
        X_dev = vectorizer.transform(X_dev_1)
        X_test = vectorizer.transform(X_test_1)

        if DO_STANDARDIZE_DATA == 0:
            print("We don't standardize data")
        else:
            print(
                "We will provide to the model with standardize data, mean zero and variance 1"
            )
            X_train, X_dev, X_test = standardize_data(X_train, X_dev, X_test)

        # Release the raw text before training.
        del all_tweets
        del X_train_1, X_dev_1, X_test_1, x_appoggio, y_appoggio

        today_string = date.today().strftime("%y_%b_%d")
        tag = today_string + "_SM" + str(U)

        # Save the prepared data. The original wrote X_train twice to the
        # same file; each array is now written exactly once.
        _save_split(tag, (X_train, Y_train, X_dev, Y_dev, X_test, Y_test))
        with open(marker_path, "w") as h:
            h.write(tag)

    D = X_test.toarray().shape[1]  # this is the length of the input vector
    print("\n")
    if n_features > D:
        n_features = D
    print("The # of features is", n_features)
    print("The regularization parameter is", regularization_lambda)
    print("The learning step is", ETA)
    print("The # of cycle is", EPOCHS)
    print("\n")

    # Train the model. The dense training matrix is built once and reused
    # for both fitting and scoring.
    X_train_dense = X_train.toarray()
    model_sm.fit(X_train_dense, Y_train)
    acc = model_sm.score(X_train_dense, Y_train)
    acc_dev = model_sm.score(X_dev.toarray(), Y_dev)
    print("\n")
    print("Accuracy on the training set", acc)
    print("Accuracy on the development set", acc_dev)

    # Print some statistics about the model.
    df_score, df_fp, df_pre = compute_accuracies(model_sm, 1, screen_names,
                                                 X_train, X_dev, Y_train,
                                                 Y_dev)
import matplotlib.pyplot as plt

from mlxtend.classifier import SoftmaxRegression
from mlxtend.data import iris_data
from mlxtend.plotting import plot_decision_regions

# Loading Data
X, y = iris_data()
X = X[:, [0, 3]]  # sepal length and petal width

# Standardize each of the two selected columns to zero mean, unit std.
for column in (0, 1):
    values = X[:, column]
    X[:, column] = (values - values.mean()) / values.std()

lr = SoftmaxRegression(
    eta=0.01,
    epochs=500,
    minibatches=1,
    random_seed=1,
    print_progress=3,
)
lr.fit(X, y)

# Decision regions of the fitted classifier.
plot_decision_regions(X, y, clf=lr)
plt.title('Softmax Regression - Gradient Descent')
plt.show()

# Recorded cost values, one point per entry of lr.cost_.
cost_history = lr.cost_
plt.plot(range(len(cost_history)), cost_history)
plt.xlabel('Iterations')
plt.ylabel('Cost')
plt.show()
def test_multi_logistic_regression_gd_weights():
    """Fit on the multi-class data and compare ``w_`` to reference values."""
    expected_weights = np.array([
        [0.58, -3.72, 3.15],
        [-3.52, 3.21, 0.28],
    ])
    model = SoftmaxRegression(
        epochs=200,
        eta=0.005,
        minibatches=1,
        random_seed=1,
    )
    model.fit(X, y)

    np.testing.assert_almost_equal(model.w_, expected_weights, decimal=2)