# Preparing data
import numpy as np
from sklearn.model_selection import train_test_split
from mltools import rescale  # assumed source of rescale(); adjust to your helper module

data = np.genfromtxt('final.csv', delimiter=",")
#data = np.delete(data, 6, 1)  # Rain
#data = np.delete(data, 5, 1)  # Wind Speed
#data = np.delete(data, 4, 1)  # Temperature
#data = np.delete(data, 3, 1)  # Elevation
#data = np.delete(data, 2, 1)  # Longitude
#data = np.delete(data, 1, 1)  # Latitude

# Skip the first row and column and the last row; the last column (divided by 100)
# is the target
X = data[1:data.shape[0] - 1, 1:data.shape[1] - 1]
Y = data[1:data.shape[0] - 1, data.shape[1] - 1:data.shape[1]] / 100

Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.2, random_state=1)
Xtr, params = rescale(Xtr)
Xte, _ = rescale(Xte, params)

# Evaluating performance of models
'''
hidden_layers = range(2, 8, 1)
nodes_nums = range(2, 8, 1)
errors_tr = np.zeros((len(hidden_layers), len(nodes_nums)))
errors_te = np.zeros((len(hidden_layers), len(nodes_nums)))
for i in range(len(hidden_layers)):
    hidden_layer = hidden_layers[i]
    for j in range(len(nodes_nums)):
        nodes_num = []
        for a in range(i):
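# The snippet above calls rescale() without defining it; in the course's mltools
# library it standardizes each feature and returns the transform parameters so the
# test split can be scaled with the *training* statistics. A minimal sketch of that
# contract (rescale_sketch, mu, sigma are illustrative names, not the library code):
import numpy as np

def rescale_sketch(X, params=None):
    """Standardize columns of X; reuse (mu, sigma) from the training data if given."""
    if params is None:
        mu = X.mean(axis=0)
        sigma = X.std(axis=0)
        sigma[sigma == 0] = 1.0  # guard against constant columns
        params = (mu, sigma)
    mu, sigma = params
    return (X - mu) / sigma, params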
import numpy as np
from collections import Counter
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from imblearn.under_sampling import RandomUnderSampler
import mltools as ml

X = np.genfromtxt('C:\\Users\\radad\\OneDrive\\Desktop\\cs178\\CS178-Kaggle-Competition\\X_train.txt', delimiter=None)
Y = np.genfromtxt('C:\\Users\\radad\\OneDrive\\Desktop\\cs178\\CS178-Kaggle-Competition\\Y_train.txt', delimiter=None)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=10)
XtrS, params = ml.rescale(X_train)
Xvas, _ = ml.rescale(X_test, params)

# Undersample the majority class so the MLP trains on balanced classes
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(XtrS, y_train)

# Configure the network up front rather than mutating attributes after construction
mlp = MLPClassifier(solver='sgd',
                    max_iter=2000,
                    hidden_layer_sizes=(100, 100, 100),
                    activation='logistic',
                    learning_rate_init=0.1,
                    learning_rate='adaptive',
                    verbose=True)
mlp.fit(X_res, y_res)
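# Counter is imported above but never used; a natural follow-up is to confirm that
# the undersampler actually balanced the classes and then score the fitted network
# on the held-out split (a sketch; assumes the variables above are in scope):
print("Class counts before resampling:", Counter(y_train))
print("Class counts after resampling: ", Counter(y_res))
print("Validation accuracy:", mlp.score(Xvas, y_test))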
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import mltools.logistic2 as lc2

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)       # reorder randomly (important later)
X, params = ml.rescale(X)         # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]    # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]    # get class 1 vs 2

# (a) Scatter plots of the two class pairs to exhibit separability
plt.title('Linearly Separable Data')
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.show()

plt.title('Linearly Non-separable Data')
plt.scatter(XB[:, 0], XB[:, 1], c=YB)
plt.show()

# (b) Plotting a boundary with the class data points, by modifying plotBoundary()
learner = lc2.logisticClassify2()  # Initializing the logistic classifier
learner.classes = np.unique(YA)    # Picking unique values as the class labels
wts = np.zeros(shape=(1, 3))
wts[0, :] = [0.5, 1, -0.25]        # Assigning weights
learner.theta = wts
learner.plotBoundary(XA, YA)       # Plotting the decision boundary

# Performing the above actions for the XB-YB split of the data
learner = lc2.logisticClassify2()
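# For reference, the boundary drawn for a linear classifier is the line
# theta0 + theta1*x1 + theta2*x2 = 0. A minimal hand-rolled version (plotBoundary
# itself belongs to the course library; this only illustrates the underlying math):
theta = [0.5, 1, -0.25]                      # same weights as above
x1 = np.linspace(XA[:, 0].min(), XA[:, 0].max(), 100)
x2 = -(theta[0] + theta[1] * x1) / theta[2]  # solve theta0 + theta1*x1 + theta2*x2 = 0
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.plot(x1, x2, 'k-')
plt.show()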
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import pickle  # was cPickle, which is Python 2 only
from sklearn.decomposition import PCA

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
# X, Y = ml.shuffleData(X, Y)
X, _ = ml.rescale(X)

components = range(1, 15, 1)
for component in components:
    print("=" * 50)
    print("Number of Components =", component)
    pca = PCA(n_components=component)  # was hard-coded to 8, which defeated the sweep
    X_pca = pca.fit_transform(X)
    Xtr, Ytr = X_pca[:180000, :], Y[:180000]
    Xval, Yval = X_pca[180000:, :], Y[180000:]

    bags = [1, 5, 10, 25, 45, 60, 75]
    bagTrainError = []
    bagValidationError = []
    ensembles = []
    for bag in bags:
        print('Training', bag, 'decision tree(s)')
        decisionTrees = [None] * bag
        trainingError = []
        for i in range(0, bag, 1):
            # Draw a fresh bootstrap training sample for every tree
            Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=180000)
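            # --- Hypothetical continuation (the original is cut off above): train one
            # --- depth-limited tree per bootstrap sample. maxDepth=15 is illustrative,
            # --- and ml.dtree.treeClassify / .err are assumed to follow the course
            # --- library's usual API.
            decisionTrees[i] = ml.dtree.treeClassify(Xi, Yi, maxDepth=15)
            trainingError.append(decisionTrees[i].err(Xtr, Ytr))

        # Aggregate the ensemble by majority vote (binary 0/1 labels assumed)
        votes = np.mean([tree.predict(Xval).ravel() for tree in decisionTrees], axis=0)
        Yhat = (votes > 0.5).astype(int)
        bagTrainError.append(np.mean(trainingError))
        bagValidationError.append(np.mean(Yhat != Yval))
        ensembles.append(decisionTrees)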
## Problem 1 ##
## part a ##
import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)       # reorder randomly (important later)
X, _ = ml.rescale(X)              # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]    # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]    # get class 1 vs 2
X0, Y0 = X[Y == 0, :], Y[Y == 0]  # class 0
X1, Y1 = X[Y == 1, :], Y[Y == 1]  # class 1
X2, Y2 = X[Y == 2, :], Y[Y == 2]  # class 2

# Class 0 vs class 1 (separable)
plt.scatter(X0[:, 0], X0[:, 1], c='Blue')
plt.scatter(X1[:, 0], X1[:, 1], c='Red')
plt.show()  # was plt.close(), which discarded the figure before it was displayed

# Class 1 vs class 2 (not separable)
plt.scatter(X1[:, 0], X1[:, 1], c='Blue')
plt.scatter(X2[:, 0], X2[:, 1], c='Red')
plt.show()  # was plt.close()

## part b ##
from logisticClassify2 import *
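# One figure with all three classes is often easier to compare than two separate
# plots; a small sketch using the arrays defined above:
for Xc, label, color in [(X0, 'class 0', 'blue'),
                         (X1, 'class 1', 'red'),
                         (X2, 'class 2', 'green')]:
    plt.scatter(Xc[:, 0], Xc[:, 1], c=color, label=label)
plt.legend()
plt.show()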
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
X, Y = ml.shuffleData(X, Y)

percentage = 5
ndata = int((percentage / 100) * X.shape[0])  # np.int is removed in recent NumPy
print(f"Training with {ndata} ({100*ndata/X.shape[0]:.1f}%) points")

Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xt, Yt = Xtr[:ndata], Ytr[:ndata]
print("Rescale Xt")
XtS, params = ml.rescale(Xt)
print("Rescale Xva")
XvS, _ = ml.rescale(Xva, params)

reg = [1e-4, 2e-4, 3e-4, 4e-4, 1e-3, 2e-3, 3e-3, 6e-3, 1e-2, 1e-1, 2e-1, 3e-1]
print(f"Regularization coefficients {reg}")
learners = [ml.linearC.linearClassify() for r in reg]
last_lr = 1
for i, r in enumerate(reg):
    if i > 0:
        # Warm-start from the previous learner's weights
        # (the original copied learners[i].theta onto itself, a no-op)
        learners[i].theta = learners[i - 1].theta.copy()
    print(f"Training {i} - reg = {r}")
    jsur, lr, epoch = learners[i].train(XtS, Yt, reg=r, initStep=last_lr,
                                        stopTol=1e-5, minlr=1e-8, stopIter=300,
                                        rate_decay=0.7, patience=2)
    last_lr = lr[-1] / (0.7 ** 5)
    plt.subplot(2, 1, 1)
    plt.plot(range(epoch), jsur)
    plt.title(f"Jsur {i} - reg = {r}")
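# To pick a coefficient from this sweep, compare error rates across the reg values
# (a sketch; assumes the mltools classifiers expose the usual err(X, Y) method):
tr_err = [lnr.err(XtS, Yt) for lnr in learners]
va_err = [lnr.err(XvS, Yva) for lnr in learners]
plt.subplot(2, 1, 2)
plt.semilogx(reg, tr_err, 'b-', label='train')
plt.semilogx(reg, va_err, 'r-', label='validation')
plt.xlabel('regularization coefficient')
plt.ylabel('error rate')
plt.legend()
plt.show()
best = int(np.argmin(va_err))
print(f"Best reg = {reg[best]} (validation error {va_err[best]:.4f})")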
minimum = [min(X[:, feature]) for feature in range(X.shape[1])]  # restored: printed below but missing from the excerpt
maximum = [max(X[:, feature]) for feature in range(X.shape[1])]
mean = [np.mean(X[:, feature]) for feature in range(X.shape[1])]
variance = [np.var(X[:, feature]) for feature in range(X.shape[1])]

print("Minimum of the features: \n{}\n".format(minimum))
print("Maximum of the features: \n{}\n".format(maximum))
print("Mean of the features: \n{}\n".format(mean))
print("Variance of the features: \n{}\n".format(variance))

# %% [markdown]
# # 2.2 Split the dataset into training and validation, and rescale each.

# %%
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xt, Yt = Xtr[:5000], Ytr[:5000]  # subsample for efficiency (you can go higher)
XtS, params = ml.rescale(Xt)     # Normalize the features
XvS, _ = ml.rescale(Xva, params) # Normalize with the training parameters

minimum = [min(XtS[:, feature]) for feature in range(XtS.shape[1])]
maximum = [max(XtS[:, feature]) for feature in range(XtS.shape[1])]
mean = [np.mean(XtS[:, feature]) for feature in range(XtS.shape[1])]
variance = [np.var(XtS[:, feature]) for feature in range(XtS.shape[1])]

print("Minimum of the features: \n{}\n".format(minimum))
print("Maximum of the features: \n{}\n".format(maximum))
print("Mean of the features: \n{}\n".format(mean))
print("Variance of the features: \n{}\n".format(variance))
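# After standardization the training split should have (approximately) zero mean and
# unit variance per feature; a quick vectorized check (tolerances are illustrative):
assert np.allclose(np.mean(XtS, axis=0), 0.0, atol=1e-8)
assert np.allclose(np.var(XtS, axis=0), 1.0, atol=1e-6)
# XvS is scaled with the *training* parameters, so its statistics will be close to,
# but not exactly, 0 and 1.
print(np.mean(XvS, axis=0), np.var(XvS, axis=0))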
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import pickle  # was cPickle, which is Python 2 only
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

pca_flag = True
X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
X_test = np.genfromtxt("data/X_test.txt", delimiter=None)

# Rescale train and test together so both use the same transform
X_all = np.concatenate((X, X_test), axis=0)
X_all, _ = ml.rescale(X_all)

print('Performing PCA Analysis')
if pca_flag:
    pca = PCA(n_components=14)
    X_all_pca = pca.fit_transform(X_all)
    X_pca = X_all_pca[:200000, :]
    X_test = X_all_pca[200000:, :]

models = []
validationErrors = []
features = []
for i in range(1, 14, 1):
    print('Logistic Regression with', (i + 1), 'features')
    if not pca_flag:
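        # --- Hypothetical continuation (the original is cut off above): pick the
        # --- first i+1 columns, fit sklearn's LogisticRegression, and record the
        # --- validation error. The 180000/20000 split is illustrative.
        Xi = X_all[:200000, :i + 1]   # rescaled raw features, no PCA
    else:
        Xi = X_pca[:, :i + 1]         # leading principal components

    Xtr, Ytr = Xi[:180000, :], Y[:180000]
    Xva, Yva = Xi[180000:, :], Y[180000:]

    model = LogisticRegression(max_iter=1000).fit(Xtr, Ytr)
    err = 1.0 - model.score(Xva, Yva)
    models.append(model)
    validationErrors.append(err)
    features.append(i + 1)
    print('Validation error:', err)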
print(f"Shape of X is {X.shape}") print(f"Shape of Y is {Y.shape}") for feature in range(X.shape[1]): print(f"Min/Max/Mean/Var of feature {feature+1:2d} are:", end="") print(f" {np.min(X[:,feature]):+07.3e}", end="/") print(f"{np.max(X[:,feature]):+07.3e}", end="/") print(f"{np.mean(X[:,feature]):+07.3e}", end="/") print(f"{np.var(X[:,feature]):+07.3e}") percentage = 20 ndata = np.int((percentage / 100) * X.shape[0]) print(f"Training with {ndata}({100*ndata/X.shape[0]:.1f}%) points") Xtr, Xva, Ytr, Yva = ml.splitData(X, Y) Xt, Yt = Xtr[:ndata], Ytr[:ndata] XtS, params = ml.rescale(Xt) XvS, _ = ml.rescale(Xva, params) # Normalize the features print(f"Shape of XtS is {XtS.shape}") print(f"Shape of XvS is {XvS.shape}") for feature in range(XtS.shape[1]): print(f"Min/Max/Mean/Var of feature {feature+1:2d} in XtS is:", end="") print(f" {np.min(XtS[:,feature]):+07.3e}", end="/") print(f"{np.max(XtS[:,feature]):+07.3e}", end="/") print(f"{np.mean(XtS[:,feature]):+07.3e}", end="/") print(f"{np.var(XtS[:,feature]):+07.3e}") print(" " * 34 + f"XvS is:", end="") print(f" {np.min(XvS[:,feature]):+07.3e}", end="/") print(f"{np.max(XvS[:,feature]):+07.3e}", end="/") print(f"{np.mean(XvS[:,feature]):+07.3e}", end="/") print(f"{np.var(XvS[:,feature]):+07.3e}")