# Preparing data
import numpy as np
from sklearn.model_selection import train_test_split
from mltools import rescale  # rescale() is assumed to come from mltools, as in the later examples

data = np.genfromtxt('final.csv', delimiter=",")

#data = np.delete(data, 6, 1)   # Rain
#data = np.delete(data, 5, 1)   # Wind Speed
#data = np.delete(data, 4, 1)   # Temperature
#data = np.delete(data, 3, 1)   # Elevation
#data = np.delete(data, 2, 1)   # Longitude
#data = np.delete(data, 1, 1)   # Latitude

# Drop the header row and the trailing row, drop the leading index column, and
# keep the last column as the target (presumably a percentage, hence the /100).
X = data[1:-1, 1:-1]
Y = data[1:-1, -1:] / 100
Xtr, Xte, Ytr, Yte = train_test_split(X, Y, test_size=0.2, random_state=1)
Xtr, params = rescale(Xtr)
Xte, _ = rescale(Xte, params)
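
# For reference, a minimal sketch of what the rescale() pattern above is
# assumed to do: standardize with statistics computed on the training split
# only, then apply those same parameters to the held-out split.
#   mu, sig = Xtr.mean(axis=0), Xtr.std(axis=0)
#   XtrS = (Xtr - mu) / sig
#   XteS = (Xte - mu) / sig   # test data reuses the *training* statistics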

# Evaluating performance of models
# (The sweep below was commented out and cut off mid-loop in the source; it is
# completed here under the assumption that each architecture uses
# `hidden_layer` layers of `nodes_nums[j]` nodes each.)
'''
from sklearn.neural_network import MLPRegressor

hidden_layers = range(2, 8)
nodes_nums = range(2, 8)

errors_tr = np.zeros((len(hidden_layers), len(nodes_nums)))
errors_te = np.zeros((len(hidden_layers), len(nodes_nums)))

for i in range(len(hidden_layers)):
    hidden_layer = hidden_layers[i]
    for j in range(len(nodes_nums)):
        nodes_num = []
        for a in range(hidden_layer):
            nodes_num.append(nodes_nums[j])
        mlp = MLPRegressor(hidden_layer_sizes=tuple(nodes_num), max_iter=2000)
        mlp.fit(Xtr, Ytr.ravel())
        errors_tr[i, j] = 1 - mlp.score(Xtr, Ytr)
        errors_te[i, j] = 1 - mlp.score(Xte, Yte)
'''
from sklearn.utils import resample
from collections import Counter
import numpy as np
import mltools as ml
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

X = np.genfromtxt(
    'C:\\Users\\radad\\OneDrive\\Desktop\\cs178\\CS178-Kaggle-Competition\\X_train.txt',
    delimiter=None)
Y = np.genfromtxt(
    'C:\\Users\\radad\\OneDrive\\Desktop\\cs178\\CS178-Kaggle-Competition\\Y_train.txt',
    delimiter=None)

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.20,
                                                    random_state=10)

XtrS, params = ml.rescale(X_train)
Xvas, _ = ml.rescale(X_test, params)

from imblearn.under_sampling import RandomUnderSampler

rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(XtrS, y_train)
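
# Counter (imported above) gives a quick sanity check that the undersampler
# actually balanced the classes; fit_resample returns plain arrays.
print('class counts before:', Counter(y_train))
print('class counts after: ', Counter(y_res))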

mlp = MLPClassifier(solver='sgd',
                    max_iter=2000,
                    hidden_layer_sizes=(100, 100, 100),
                    activation='logistic',
                    learning_rate_init=0.1,
                    learning_rate='adaptive',
                    verbose=True)

mlp.fit(X_res, y_res)
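
# A minimal held-out check using the standard sklearn API: score() reports
# mean accuracy on the validation split rescaled with the training parameters.
print('validation accuracy:', mlp.score(Xvas, y_test))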
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import mltools.logistic2 as lc2

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, params = ml.rescale(X)  # works much better on rescaled data
XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

# (a) Scatter plot of the two classes to exhibit separability
plt.title('Linearly Separable Data')
plt.scatter(XA[:, 0], XA[:, 1], c=YA)
plt.show()

plt.title('Linearly Non-separable Data')
plt.scatter(XB[:, 0], XB[:, 1], c=YB)
plt.show()

# (b) Plotting a boundary with the class data points, by modifying plotBoundary()
learner = lc2.logisticClassify2()  # Initializing the logistic classifier
learner.classes = np.unique(YA)  # Picking unique values as the class labels
wts = np.zeros(shape=(1, 3))
wts[0, :] = [0.5, 1, -0.25]  # Assigning weights
learner.theta = wts
learner.plotBoundary(XA, YA)  # Plotting decision boundary

# Performing the same steps for the XB-YB split of the data
learner = lc2.logisticClassify2()
learner.classes = np.unique(YB)
learner.theta = wts
learner.plotBoundary(XB, YB)
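
# (Hypothetical check, assuming logisticClassify2 inherits err() from the
# mltools base classifier: fraction of points misclassified by these weights.)
print('error rate on XB/YB:', learner.err(XB, YB))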

# Example 4
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import pickle
from sklearn.decomposition import PCA

X = np.genfromtxt("data/X_train.txt", delimiter=None)
Y = np.genfromtxt("data/Y_train.txt", delimiter=None)
# X, Y = ml.shuffleData(X,Y)
X, _ = ml.rescale(X)

components = range(1, 15)
for component in components:
    print("=" * 50)
    print("Number of Components =", component)
    pca = PCA(n_components=component)  # sweep the number of retained components
    X_pca = pca.fit_transform(X)
    Xtr, Ytr = X_pca[:180000, :], Y[:180000]
    Xval, Yval = X_pca[180000:, :], Y[180000:]
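    # Standard sklearn diagnostic: cumulative fraction of variance captured by
    # the retained components (useful when choosing n_components).
    print('explained variance ratio: {:.3f}'.format(pca.explained_variance_ratio_.sum()))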

    bags = [1, 5, 10, 25, 45, 60, 75]
    bagTrainError = []
    bagValidationError = []
    ensembles = []
    for bag in bags:
        print('Training', bag, 'decision tree(s)')
        decisionTrees = [None] * bag
        trainingError = []
        for i in range(bag):
            # Drawing a random training sample every single time
            Xi, Yi = ml.bootstrapData(Xtr, Ytr, n_boot=180000)
            # (Assumed completion of the truncated loop: fit one mltools
            # decision tree per bootstrap sample and record its training
            # error; maxDepth here is an illustrative choice.)
            decisionTrees[i] = ml.dectree.treeClassify(Xi, Yi, maxDepth=15)
            trainingError.append(decisionTrees[i].err(Xtr, Ytr))
        bagTrainError.append(np.mean(trainingError))
        ensembles.append(decisionTrees)
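        # (Hedged sketch: ensemble validation error via a simple majority vote
        # over the bagged trees, assuming predict() returns 0/1 class labels.)
        votes = np.mean([t.predict(Xval) for t in decisionTrees], axis=0)
        bagValidationError.append(np.mean((votes > 0.5) != Yval))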

# Example 5
## Problem 1 ##

## part a ##

import numpy as np
import mltools as ml
import matplotlib.pyplot as plt

iris = np.genfromtxt("data/iris.txt", delimiter=None)
X, Y = iris[:, 0:2], iris[:, -1]  # get first two features & target
X, Y = ml.shuffleData(X, Y)  # reorder randomly (important later)
X, _ = ml.rescale(X)  # works much better on rescaled data

XA, YA = X[Y < 2, :], Y[Y < 2]  # get class 0 vs 1
XB, YB = X[Y > 0, :], Y[Y > 0]  # get class 1 vs 2

X0, Y0 = X[Y == 0, :], Y[Y == 0]  #class 0
X1, Y1 = X[Y == 1, :], Y[Y == 1]  #class 1
X2, Y2 = X[Y == 2, :], Y[Y == 2]  #class 2

plt.scatter(X0[:, 0], X0[:, 1], c='Blue')
plt.scatter(X1[:, 0], X1[:, 1], c='Red')
plt.show()  # class 0 vs class 1

plt.scatter(X1[:, 0], X1[:, 1], c='Blue')
plt.scatter(X2[:, 0], X2[:, 1], c='Red')
plt.show()  # class 1 vs class 2

## part b ##

from logisticClassify2 import *
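
# A minimal continuation sketch for part b, mirroring the boundary-plotting
# pattern from the earlier example (the weights are illustrative, not fitted):
learner = logisticClassify2()
learner.classes = np.unique(YA)
learner.theta = np.array([[0.5, 1.0, -0.25]])
learner.plotBoundary(XA, YA)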

# Example 6
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml

X = np.genfromtxt('data/X_train.txt', delimiter=None)
Y = np.genfromtxt('data/Y_train.txt', delimiter=None)
X, Y = ml.shuffleData(X, Y)
percentage = 5
ndata = int((percentage / 100) * X.shape[0])  # np.int is deprecated; use the builtin
print(f"Training with {ndata} ({100*ndata/X.shape[0]:.1f}%) points")
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xt, Yt = Xtr[:ndata], Ytr[:ndata]
print("Rescale Xt")
XtS, params = ml.rescale(Xt)
print("Rescale Xva")
XvS, _ = ml.rescale(Xva, params)
reg = [1e-4, 2e-4, 3e-4, 4e-4, 1e-3, 2e-3, 3e-3, 6e-3, 1e-2, 1e-1, 2e-1, 3e-1]
print(f"Regularization coefficients {reg}")
learners = [ml.linearC.linearClassify() for r in reg]
last_lr = 1
for i, r in enumerate(reg):
    if i > 0:
        # Warm start each run from the previously trained learner's weights.
        learners[i].theta = learners[i - 1].theta.copy()
    print(f"Training {i} - reg = {r}")
    jsur, lr, epoch = learners[i].train(XtS, Yt, reg=r, initStep=last_lr, stopTol=1e-5, minlr=1e-8, stopIter=300,
                                        rate_decay=0.7, patience=2)
    # Start the next run a few decay steps above this run's final step size.
    last_lr = lr[-1] / (0.7 ** 5)

    plt.subplot(2, 1, 1)
    plt.plot(range(epoch), jsur)
    plt.title(f"Jsur {i} - reg = {r}")
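    # (Assumed continuation: the 2x1 layout above suggests a second panel
    # showing the learning-rate schedule that train() also returns.)
    plt.subplot(2, 1, 2)
    plt.plot(lr)
    plt.title(f"lr {i} - reg = {r}")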

# Example 7
minimum = [np.min(X[:, feature]) for feature in range(X.shape[1])]

maximum = [np.max(X[:, feature]) for feature in range(X.shape[1])]

mean = [np.mean(X[:, feature]) for feature in range(X.shape[1])]

variance = [np.var(X[:, feature]) for feature in range(X.shape[1])]

print("Minimum of the features: \n{}\n".format(minimum))
print("Maximum of the features: \n{}\n".format(maximum))
print("Mean of the features: \n{}\n".format(mean))
print("Variance of the features: \n{}\n".format(variance))

# %% [markdown]
# # 2.2 Split the dataset into training and validation, and rescale each.

# %%
Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
Xt, Yt = Xtr[:5000], Ytr[:5000]  # subsample for efficiency (you can go higher)
XtS, params = ml.rescale(Xt)  # Normalize the features
XvS, _ = ml.rescale(Xva, params)  # Normalize the features

minimum = [min(XtS[:, feature]) for feature in range(XtS.shape[1])]

maximum = [max(XtS[:, feature]) for feature in range(XtS.shape[1])]

mean = [np.mean(XtS[:, feature]) for feature in range(XtS.shape[1])]

variance = [np.var(XtS[:, feature]) for feature in range(XtS.shape[1])]

print("Minimum of the featurs: \n{}\n".format(minimum))
print("Maximum of the featurs: \n{}\n".format(maximum))
print("Mean of the featurs: \n{}\n".format(mean))
print("Variance of the featurs: \n{}\n".format(variance))
import numpy as np
import matplotlib.pyplot as plt
import mltools as ml
import pickle
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

pca_flag = True

X = np.genfromtxt("data/X_train.txt", delimiter = None)
Y = np.genfromtxt("data/Y_train.txt", delimiter = None)
X_test = np.genfromtxt("data/X_test.txt", delimiter = None)

X_all = np.concatenate((X, X_test), axis=0)
X_all, _ = ml.rescale(X_all)

print('Performing PCA Analysis')

if pca_flag:
    pca = PCA(n_components=14)

    X_all_pca = pca.fit_transform(X_all)
    X_pca = X_all_pca[:200000, :]
    X_test = X_all_pca[200000:, :]

models = []
validationErrors = []
features = []
for i in range(1, 14):
    print('Logistic Regression with', (i + 1), 'features')
    if not pca_flag:
        # (Assumed completion of the truncated source: without PCA, fall back
        # to the first i+1 rescaled raw features.)
        Xi = X_all[:200000, :i + 1]
    else:
        Xi = X_pca[:, :i + 1]

# Example 9
    print(f"Shape of X is {X.shape}")
    print(f"Shape of Y is {Y.shape}")
    for feature in range(X.shape[1]):
        print(f"Min/Max/Mean/Var of feature {feature+1:2d} are:", end="")
        print(f" {np.min(X[:,feature]):+07.3e}", end="/")
        print(f"{np.max(X[:,feature]):+07.3e}", end="/")
        print(f"{np.mean(X[:,feature]):+07.3e}", end="/")
        print(f"{np.var(X[:,feature]):+07.3e}")

    percentage = 20
    ndata = int((percentage / 100) * X.shape[0])  # builtin int; np.int is deprecated
    print(f"Training with {ndata} ({100*ndata/X.shape[0]:.1f}%) points")

    Xtr, Xva, Ytr, Yva = ml.splitData(X, Y)
    Xt, Yt = Xtr[:ndata], Ytr[:ndata]
    XtS, params = ml.rescale(Xt)
    XvS, _ = ml.rescale(Xva, params)  # Normalize the features
    print(f"Shape of XtS is {XtS.shape}")
    print(f"Shape of XvS is {XvS.shape}")
    for feature in range(XtS.shape[1]):
        print(f"Min/Max/Mean/Var of feature {feature+1:2d} in XtS is:", end="")
        print(f" {np.min(XtS[:,feature]):+07.3e}", end="/")
        print(f"{np.max(XtS[:,feature]):+07.3e}", end="/")
        print(f"{np.mean(XtS[:,feature]):+07.3e}", end="/")
        print(f"{np.var(XtS[:,feature]):+07.3e}")

        print(" " * 34 + f"XvS is:", end="")
        print(f" {np.min(XvS[:,feature]):+07.3e}", end="/")
        print(f"{np.max(XvS[:,feature]):+07.3e}", end="/")
        print(f"{np.mean(XvS[:,feature]):+07.3e}", end="/")
        print(f"{np.var(XvS[:,feature]):+07.3e}")