Exemple #1
0
def trainQuadratic(cvals, kernel):
    """Sweep the SVM penalty C for a fixed kernel, plot the errors, pick the best C.

    For every value in ``cvals`` an ``SVM`` is fit on the module-level
    ``X_train``/``y_train`` and scored on the training and test sets with
    ``utils.classification_error``; the same (kernel, C) pair is also scored
    by ``lambdaError`` on ``folds_1a``. All three error curves are plotted
    against C on a logarithmic x-axis.

    Args:
        cvals: Sequence of candidate C values.
        kernel: Kernel handed unchanged to ``SVM`` and ``lambdaError``.

    Returns:
        The C with the lowest cross-validation error (the first one on ties),
        or ``-1`` if no candidate was selected (e.g. ``cvals`` is empty).
    """
    cv_errors = []
    test_errors = []
    train_errors = []

    # Start from +inf rather than a magic 100 so the first finite
    # cross-validation error is always accepted, whatever scale the error
    # helper reports in.
    min_error = float("inf")
    c_choice = -1
    for i, c in enumerate(cvals):
        print(i)
        svm = SVM(kernel, c)
        svm.fit(X_train, y_train)
        testpred = svm.predict(X_test)
        trainpred = svm.predict(X_train)
        test_errors.append(utils.classification_error(testpred, y_test))
        train_errors.append(utils.classification_error(trainpred, y_train))

        cv_error = lambdaError(kernel, c, folds_1a)
        # Strict '<' keeps the earliest C when several tie for the minimum.
        if cv_error < min_error:
            min_error = cv_error
            c_choice = c
        cv_errors.append(cv_error)

    plt.plot(cvals, cv_errors, label='Cross Validation Error')
    plt.plot(cvals, test_errors, label='Test Error')
    plt.plot(cvals, train_errors, label='Training Error')
    plt.xlabel("C Used")
    plt.ylabel("Percent Error")
    plt.xscale("log")
    plt.title("Error vs C Value Used")
    plt.legend()
    plt.show()
    return c_choice
Exemple #2
0
def linearSVMPlot():
    """Plot training, test and cross-validation error of the SVM over ``cval_1a``.

    Each C in the module-level ``cval_1a`` is used to fit an ``SVM`` whose
    kernel argument is ``None`` on ``X_train``/``y_train``. The model is
    scored on the training and test sets, and ``lambdaError`` supplies the
    cross-validation error on ``folds_1a``. The three curves are drawn on a
    log-scaled C axis and the raw error lists are printed.
    """
    cv_curve, test_curve, train_curve = [], [], []

    for idx, c_value in enumerate(cval_1a):
        print(idx)
        model = SVM(None, c_value)
        model.fit(X_train, y_train)
        predicted_test = model.predict(X_test)
        predicted_train = model.predict(X_train)
        test_curve.append(utils.classification_error(predicted_test, y_test))
        train_curve.append(
            utils.classification_error(predicted_train, y_train))
        cv_curve.append(lambdaError(None, c_value, folds_1a))

    labelled_curves = (
        (cv_curve, 'Cross Validation Error'),
        (test_curve, 'Test Error'),
        (train_curve, 'Training Error'),
    )
    for curve, label in labelled_curves:
        plt.plot(cval_1a, curve, label=label)

    # Dump the raw numbers: training first, then test, then cross-validation.
    for curve in (train_curve, test_curve, cv_curve):
        print(curve)

    plt.xlabel("C Used")
    plt.ylabel("Percent Error")
    plt.xscale("log")
    plt.title("Error vs C Value Used")
    plt.legend()
    plt.show()
Exemple #3
0
def plotPolyOptimal(cvals, qvals):
    """Plot SVM errors for polynomial kernels ``(x1 . x2 + 1) ** q``.

    The i-th degree in ``qvals`` is paired with the i-th penalty in
    ``cvals``. For each pair an ``SVM`` is fit on the module-level training
    data, scored on the training and test sets, and cross-validated through
    ``lambdaError`` on ``folds_1a``. The error lists are printed and plotted
    against the degree.

    Args:
        cvals: C values, indexed in step with ``qvals``.
        qvals: Polynomial degrees; its length drives the loop.
    """
    cv_curve, test_curve, train_curve = [], [], []

    for position in range(len(qvals)):
        print(position)
        degree = qvals[position]
        penalty = cvals[position]

        def poly_kernel(x1, x2):
            # Inhomogeneous polynomial kernel of the current degree.
            return (np.dot(x1, x2) + 1)**degree

        model = SVM(poly_kernel, penalty)
        model.fit(X_train, y_train)
        predicted_test = model.predict(X_test)
        predicted_train = model.predict(X_train)
        test_curve.append(utils.classification_error(predicted_test, y_test))
        train_curve.append(
            utils.classification_error(predicted_train, y_train))
        cv_curve.append(lambdaError(poly_kernel, penalty, folds_1a))

    for curve in (train_curve, test_curve, cv_curve):
        print(curve)

    for curve, label in ((cv_curve, 'Cross Validation Error'),
                         (test_curve, 'Test Error'),
                         (train_curve, 'Training Error')):
        plt.plot(qvals, curve, label=label)
    plt.xlabel("Q Used")
    plt.ylabel("Percent Error")
    plt.title("Error vs Q Value Used")
    plt.legend()
    plt.show()
Exemple #4
0
def plotRBF(cvals, yvals):
    """Plot SVM errors for RBF kernels ``exp(-y * ||x1 - x2||^2)``.

    The i-th kernel width in ``yvals`` is paired with the i-th penalty in
    ``cvals``. For each pair an ``SVM`` is fit on the module-level training
    data, scored on the training and test sets, and cross-validated through
    ``lambdaError`` on ``folds_1a``. The error lists are printed and plotted
    against the kernel width on a log-scaled x-axis.

    Args:
        cvals: C values, indexed in step with ``yvals``.
        yvals: RBF width multipliers; its length drives the loop.
    """
    def make_rbf(gamma):
        # Bind gamma explicitly so the kernel does not depend on the loop
        # variable being in the right state whenever it happens to be called.
        def rbf(x1, x2):
            diff = x1 - x2
            return math.exp(diff.dot(diff) * -gamma)
        return rbf

    cv_errors = []
    test_errors = []
    train_errors = []

    for i in range(len(yvals)):
        print(i)
        rbf_eq = make_rbf(yvals[i])
        svm = SVM(rbf_eq, cvals[i])
        svm.fit(X_train, y_train)
        testpred = svm.predict(X_test)
        trainpred = svm.predict(X_train)
        test_errors.append(utils.classification_error(testpred, y_test))
        train_errors.append(utils.classification_error(trainpred, y_train))
        cv_errors.append(lambdaError(rbf_eq, cvals[i], folds_1a))

    print(train_errors)
    print(test_errors)
    print(cv_errors)

    plt.plot(yvals, cv_errors, label='Cross Validation Error')
    plt.plot(yvals, test_errors, label='Test Error')
    plt.plot(yvals, train_errors, label='Training Error')
    plt.xlabel("y Used")
    plt.xscale("log")
    plt.ylabel("Percent Error")
    plt.title("Error vs y Value Used")
    plt.legend()
    plt.show()
Exemple #5
0
def tune(name):
    """Grid-search two BackProp settings and append the results to a CSV.

    For every ``i`` in 0..2 and every ``j`` in 0..(number of columns - 1) a
    fresh 45-row sample of the dataset is split into training and test sets,
    a network is built with ``BackProp.build_network(training_set, i, j,
    0.25)`` and scored with ``u.classification_error``. One row
    ``[i, j, error]`` per combination is appended to ``<name>_tuning.csv``;
    ``error`` is the literal ``'oops'`` when training or scoring failed.
    (``i``/``j`` are presumably the hidden-layer count and nodes per layer --
    confirm against ``BackProp.build_network``.)

    Args:
        name: Dataset name understood by ``u.get_data``; also the CSV prefix.
    """
    print(name, 'tuning')
    data = u.get_data(name)
    max_nodes = data.shape[1] - 1
    for i in range(0, 3):
        for j in range(max_nodes + 1):
            # data setup
            test_data = data.sample(45)
            sets = u.split_to_train_test_sets(test_data)
            training_set = sets['Training_Set']
            test_set = sets['Test_Set']
            error = 'oops'
            try:
                # training and testing
                model = BackProp.build_network(training_set, i, j, 0.25)
                classified = BackProp.classify(model, test_set)
                error = u.classification_error(classified)
                error = round(error, 4) * 100
            except Exception as exc:
                # Best effort: a failing configuration is still recorded (as
                # 'oops'), but the reason is reported and Ctrl-C / SystemExit
                # are no longer swallowed.
                print('tuning failed for', i, j, ':', exc)
            # record to file; reopening in append mode per row keeps the
            # results gathered so far on disk if a later combination crashes
            row = [i, j, error]
            with open(name + '_tuning.csv', 'a', newline='') as file:
                writer = csv.writer(file)
                writer.writerow(row)
Exemple #6
0
    def fit(self):
        """Run the minimizer on the stored data and report the training error.

        ``minimizers.findMin`` is called with the objective, the current
        ``w`` and ``alpha``, the evaluation budget, the verbosity flag and the
        stored ``X``/``y``. The first two of its four return values replace
        ``self.w`` and ``self.alpha``; the remaining two are discarded.
        """
        solution = minimizers.findMin(self.funObj, self.w, self.alpha,
                                      self.maxEvals, self.verbose, self.X,
                                      self.y)
        self.w, self.alpha, _objective, _ = solution

        training_error = utils.classification_error(self.predict(self.X),
                                                    self.y)
        print("Training error: %.3f" % training_error)
Exemple #7
0
def lambdaError(kernel, cval, folds):
    """Return the mean 5-fold cross-validation error of an SVM.

    A single ``SVM(kernel, cval)`` instance is refit on each of the five
    training partitions produced by ``utils.partition_cross_validation_fold``
    and scored on the matching left-out partition.

    Args:
        kernel: Kernel handed to ``SVM``.
        cval: Penalty value handed to ``SVM``.
        folds: Fold structure understood by
            ``utils.partition_cross_validation_fold``.

    Returns:
        The arithmetic mean of the five left-out classification errors.
    """
    total_error = 0
    svm = SVM(kernel, cval)
    for i in range(0, 5):
        leave_out_data, training_data = utils.partition_cross_validation_fold(folds, i)
        svm.fit(training_data[0], training_data[1])
        reg_pred = svm.predict(leave_out_data[0])
        total_error = total_error + utils.classification_error(reg_pred, leave_out_data[1])
    # Every fold always contributes, so the divisor is the fixed fold count.
    return total_error / 5
Exemple #8
0
def lambdaError(lam, folds):
    """Return the mean 5-fold cross-validation error for ``LogisticRegression(lam)``.

    One model instance is refit on each of the five training partitions from
    ``utils.partition_cross_validation_fold`` and scored on the corresponding
    left-out partition.

    Args:
        lam: Value passed to the ``LogisticRegression`` constructor.
        folds: Fold structure understood by
            ``utils.partition_cross_validation_fold``.

    Returns:
        The sum of the five fold errors divided by 5.
    """
    logreg = LogisticRegression(lam)
    fold_errors = []
    for fold_index in range(5):
        held_out, train_part = utils.partition_cross_validation_fold(
            folds, fold_index)
        logreg.fit(train_part[0], train_part[1])
        predictions = logreg.predict(held_out[0])
        fold_errors.append(
            utils.classification_error(predictions, held_out[1]))
    return sum(fold_errors) / 5
Exemple #9
0
def plotNeuralNetworks(mode, step_sz):
    """Plot neural-network errors against the hidden-layer size.

    For each size in the module-level ``dvals`` the stored initial weights
    are loaded, a ``NeuralNetworkClassification`` is trained on
    ``X_train``/``y_train`` and scored on the training and test sets, and
    ``cvError`` supplies the cross-validation error. The three curves are
    plotted against ``dvals`` and the raw error lists are printed.

    Args:
        mode: Activation passed to the network and to ``cvError``.
        step_sz: Step size passed to ``fit`` and to ``cvError``.
    """
    cv_errors = []
    test_errors = []
    train_errors = []
    for i in range(0, len(dvals)):
        print("Starting CV ", i)
        # NOTE(review): the path always points at the "sigmoid" directory,
        # whatever `mode` is -- confirm that is intended.
        path = "P3/" + folder + "/InitParams/sigmoid/" + str(dvals[i])
        initial_params = utils.load_initial_weights(path)
        nn = NeuralNetworkClassification(d,
                                         num_hidden=dvals[i],
                                         activation=mode,
                                         W1=initial_params["W1"],
                                         W2=initial_params["W2"],
                                         b1=initial_params["b1"],
                                         b2=initial_params["b2"])
        nn.fit(X_train, y_train, step_size=step_sz)
        testpred = nn.predict(X_test)
        trainpred = nn.predict(X_train)
        class_error = utils.classification_error(testpred, y_test)
        training_error = utils.classification_error(trainpred, y_train)
        print(class_error)
        print(training_error)
        test_errors.append(class_error)
        train_errors.append(training_error)
        cv_errors.append(cvError(dvals[i], folds, mode, step_sz))

    plt.plot(dvals, cv_errors, label='Cross Validation Error')
    plt.plot(dvals, test_errors, label='Test Error')
    plt.plot(dvals, train_errors, label='Training Error')
    print(train_errors)
    print(test_errors)
    print(cv_errors)
    # The x-axis is the hidden-layer size (dvals), matching the title below.
    plt.xlabel("D Used")
    plt.ylabel("Percent Error")
    plt.title("Error vs D Value Used")
    plt.legend()
    plt.show()
Exemple #10
0
def five_fold_validation(data_set, data_name, eta=None, demo = False):
    """Cross-validate Logistic Regression or Naive Bayes over the dataset's folds.

    Each entry of ``u.five_fold_split(data_set)`` serves once as the test
    set while all remaining entries are concatenated into the training set.
    The model learned on the fold whose key is ``1`` is the only one asked to
    export.

    Args:
        data_set: DataFrame holding the full dataset.
        data_name: Dataset name forwarded to the learners.
        eta: When truthy, Logistic Regression is trained with this value;
            otherwise Naive Bayes is used.
        demo: When true, print details of the first fold and stop after it.

    Returns:
        The mean of the per-fold values of ``u.classification_error``.
    """
    # split the datasets into fifths
    splits = u.five_fold_split(data_set)
    errors = []
    # for each fifth of the dataset
    for split in splits:
        # this fifth is the test set, all others are training data
        test_set = splits[split]
        training_parts = [splits[s] for s in splits if s != split]
        if training_parts:
            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0.
            training_set = pd.concat(training_parts, sort=False)
        else:
            training_set = pd.DataFrame(columns=data_set.columns.values)

        # only export and demonstrate one of the folds
        export = split == 1

        # if eta is supplied, perform Logistic Regression
        if eta:
            model = Logistic_Regression.learn_models(training_set, eta, data_name, export=export)
            Logistic_Regression.classify(test_set, model)
        # if no eta is supplied, perform Naive Bayes
        else:
            model = Naive_Bayes.learn(training_set, data_name, export=export)
            Naive_Bayes.classify(test_set, model)

        # find and append the classification error
        err = u.classification_error(test_set)
        errors.append(err)

        # print results of first split
        if demo:
            print('Sample Training Data\n', training_set.head())
            print('\nWeight Vectors')
            for m in model:
               print(m, model[m])
            print('\nClassified Test Set\n',test_set)
            break
        # remove Guess column to prevent errors in future fold tests
        # (presumably added to test_set in place by classify -- confirm)
        test_set.drop(['Guess'], axis=1, inplace=True)
    # return average error
    return sum(errors)/len(errors)
    def fit(self, X, y):
        """Greedy forward feature selection scored by training classification error.

        Column 0 is always selected. In every round each unselected column is
        tentatively added, the model is refit on that subset with
        ``findMin.findMin`` and the training error of ``self.predict`` is
        measured; the column giving the lowest error seen so far is added.
        Rounds stop once a round leaves the best error unchanged. Finally the
        weights are refit on the selected columns, with all others zero.
        """
        _, d = X.shape

        def fit_subset(columns):
            # Optimize from zero weights using only the given columns.
            return findMin.findMin(self.funObj,
                                   np.zeros(len(columns)),
                                   self.maxEvals,
                                   X[:, columns],
                                   y,
                                   verbose=0)

        selected = {0}
        best_loss = np.inf
        previous_loss = 0
        best_feature = -1

        while best_loss != previous_loss:
            previous_loss = best_loss
            print("Epoch %d " % len(selected))
            print("Selected feature: %d" % (best_feature))
            print("Min Loss: %.3f\n" % best_loss)

            candidates = [j for j in range(d) if j not in selected]
            for candidate in candidates:
                # Tentatively extend the selection with this candidate.
                trial_columns = list(selected | {candidate})
                self.w = np.zeros(d)
                trial_weights, _ = fit_subset(trial_columns)
                self.w[trial_columns] = trial_weights

                # Keep the candidate if it beats the best error so far.
                trial_loss = utils.classification_error(self.predict(X), y)
                if trial_loss < best_loss:
                    best_loss = trial_loss
                    best_feature = candidate

            selected.add(best_feature)

        final_columns = list(selected)
        self.w = np.zeros(d)
        final_weights, _ = fit_subset(final_columns)
        self.w[final_columns] = final_weights
Exemple #12
0
def cvError(dval, folds, mode, step):
    """Return the mean 5-fold cross-validation error of the neural network.

    The stored initial weights for ``dval`` are loaded once. For each of the
    five folds a new ``NeuralNetworkClassification`` is built from them,
    trained on the fold's training partition and scored on its left-out
    partition.

    Args:
        dval: Hidden-layer size; also selects the initial-weights path.
        folds: Fold structure understood by
            ``utils.partition_cross_validation_fold``.
        mode: Activation passed to the network.
        step: Step size passed to ``fit``.

    Returns:
        The sum of the five fold errors divided by 5.
    """
    weights_path = "P3/" + folder + "/InitParams/sigmoid/" + str(dval)
    initial_params = utils.load_initial_weights(weights_path)

    fold_errors = []
    for fold_index in range(5):
        network = NeuralNetworkClassification(
            d,
            num_hidden=dval,
            activation=mode,
            W1=initial_params["W1"],
            W2=initial_params["W2"],
            b1=initial_params["b1"],
            b2=initial_params["b2"],
        )
        held_out, train_part = utils.partition_cross_validation_fold(
            folds, fold_index)
        network.fit(train_part[0], train_part[1], step_size=step)
        predictions = network.predict(held_out[0])
        fold_errors.append(
            utils.classification_error(predictions, held_out[1]))

    return sum(fold_errors) / 5
Exemple #13
0
def five_fold_validation(data_set, data_name, n_layers, n_neurons, demo=False):
    """Cross-validate a BackProp network over the dataset's folds.

    Each entry of ``u.five_fold_split(data_set)`` serves once as the test
    set while all remaining entries are concatenated into the training set.

    Args:
        data_set: DataFrame holding the full dataset.
        data_name: Dataset name (not used by this function).
        n_layers: Second argument of ``BackProp.build_network``.
        n_neurons: Third argument of ``BackProp.build_network``.
        demo: When true, print details of the first fold and stop after it.

    Returns:
        The mean of the per-fold values of ``u.classification_error``.
    """
    # split the datasets into fifths
    splits = u.five_fold_split(data_set)
    errors = []
    # for each fifth of the dataset
    for split in splits:
        # this fifth is the test set, all others are training data
        test_set = splits[split]
        training_parts = [splits[s] for s in splits if s != split]
        if training_parts:
            # pd.concat replaces DataFrame.append, which was removed in
            # pandas 2.0.
            training_set = pd.concat(training_parts, sort=False)
        else:
            training_set = pd.DataFrame(columns=data_set.columns.values)

        # train network
        model = BackProp.build_network(training_set, n_layers, n_neurons, 0.25)
        # classify test set
        classified = BackProp.classify(model, test_set)

        # find and append the classification error
        err = u.classification_error(classified)
        errors.append(err)

        # print results of first split
        if demo:
            print('Sample Training Data\n', training_set.head())
            print('\nNetwork')
            print(model[0])
            print('\nClassified Test Set\n', test_set)
            break
        # remove Guess column to prevent errors in future fold tests
        # (presumably added to test_set in place by classify -- confirm)
        test_set.drop(['Guess'], axis=1, inplace=True)
    # return average error
    return sum(errors) / len(errors)
Exemple #14
0
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--question', required=True)
    io_args = parser.parse_args()
    question = io_args.question

    if question == "2":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logReg(maxEvals=400)
        model.fit(XBin, yBin)

        print("\nlogReg Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logReg Validation error %.3f" %
              utils.classification_error(model.predict(XBinValid), yBinValid))
        print("# nonZeros: %d" % (model.w != 0).sum())

    elif question == "2.1":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logRegL2(lammy=1.0, maxEvals=400)
        model.fit(XBin, yBin)

        print("\nlogRegL2 Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logRegL2 Validation error %.3f" %
Exemple #15
0
        # part 1: implement knn.predict
        # part 2: print training and test errors for k=1,3,10 (use utils.classification_error)
        # part 3: plot classification boundaries for k=1 (use utils.plot_2dclassifier)
        model = None
        predict = None
        yhat = None
        Yhat = None
        tr_err = 0
        te_err = 0

        for k in [1, 3, 10]:
            model = knn.fit(X, y, k)
            predict = model['predict']
            yhat = predict(model, X)
            Yhat = predict(model, Xtest)
            tr_err = utils.classification_error(y, yhat)
            te_err = utils.classification_error(ytest, Yhat)
            print("Training error for k =", k, "is =", tr_err)
            print("Testing error for k =", k, "is =", te_err)

        utils.plot_2dclassifier(knn.fit(X, y, 1), Xtest, ytest)
        plt.show()

    if question == '1.2':
        dataset = utils.load_dataset('citiesBig1')
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']

        # part 1: implement cnn.py
Exemple #16
0
def main():
    """Run the Black Friday experiments for gender and product-category prediction.

    Loads ``../data/BlackFriday.csv``, fills missing values with 0 and
    encodes the categorical columns with the ``change_*`` helpers. It then:

    1. predicts column 2 (dropped from the features as ``'Gender'``) with a
       multinomial logistic regression and an SGD classifier;
    2. predicts column 8 (Product_Category_1 per the column list next to
       ``read_csv``) with KNN, decision trees of depth 1..19 (saving an
       error-vs-depth figure), a random forest, a depth-13 tree and a
       logistic regression;
    3. fits a degree-4 polynomial linear regression on the first 1000
       training rows, then a Gaussian process regressor and a kernel ridge
       model.

    Errors are printed as they are computed.
    """
    X = pd.read_csv(
        '../data/BlackFriday.csv'
    )  # names =("User_ID", "Product_ID", "Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status,", "Product_Category_1","Product_Category_2","Product_Category_3", "Purchase" ))
    N, d = X.shape
    print(N, d)
    # fill missing values with 0
    # (?) need to calculate percentage of missing value?
    X = X.fillna(0)
    # change gender to 0 and 1
    X['Gender'] = X['Gender'].apply(change_gender)
    # change age to 0 to 6
    X['Age'] = X['Age'].apply(change_age)
    # change city categories to 0 to 2
    X['City_Category'] = X['City_Category'].apply(change_city)
    # change the year to integer
    X['Stay_In_Current_City_Years'] = X['Stay_In_Current_City_Years'].apply(
        change_year)

    #predict gender
    y = X.values[:, 2]
    y = y.astype('int')
    X1 = X
    ID = ['User_ID', 'Product_ID', 'Gender']
    X1 = X1.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X1,
                                                        y,
                                                        test_size=0.20,
                                                        random_state=42)
    model = LogisticRegression(C=1,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    model = linear_model.SGDClassifier(max_iter=1000, tol=1e-3)
    model.fit(X_train, y_train)

    print("logLinearClassifier Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("logLinearClassifier Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    #predict the product category1  based on other information.
    y2 = X.values[:, 8]
    y2 = y2.astype('int')
    X2 = X
    ID = [
        'User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2',
        'Product_Category_3'
    ]
    X2 = X2.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.2,
                                                        random_state=42)

    model = KNeighborsClassifier(n_neighbors=5, metric='cosine')
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    tr_error = np.mean(y_pred != y_train)

    y_pred = model.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    # NOTE(review): the messages say "age" but the target here is column 8
    # (the product category) -- confirm the wording before relying on logs.
    print("Training error of KNN to predict age: %.3f" % tr_error)
    print("Testing error of KNN to predict age: %.3f" % te_error)
    # Training error of KNN to predict age: 0.363
    #Testing error of KNN to predict age: 0.496

    # Use decision tree to predict
    e_depth = 20
    s_depth = 1

    train_errors = np.zeros(e_depth - s_depth)
    test_errors = np.zeros(e_depth - s_depth)

    for i, depth in enumerate(range(s_depth, e_depth)):
        print("\nDepth: %d" % depth)

        model = DecisionTreeClassifier(max_depth=depth,
                                       criterion='entropy',
                                       random_state=1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_train)
        tr_error = np.mean(y_pred != y_train)

        y_pred = model.predict(X_test)
        te_error = np.mean(y_pred != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)

        train_errors[i] = tr_error
        test_errors[i] = te_error

    x_vals = np.arange(s_depth, e_depth)
    plt.title("The effect of tree depth on testing/training error")
    plt.plot(x_vals, train_errors, label="training error")
    plt.plot(x_vals, test_errors, label="testing error")
    plt.xlabel("Depth")
    plt.ylabel("Error")
    plt.legend()

    fname = os.path.join("..", "figs", "trainTest_category1.pdf")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)

    model = RandomForestClassifier(criterion="entropy",
                                   n_estimators=5,
                                   max_features=5)
    model.fit(X_train, y_train)
    print("RandomForest Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("RandomForest Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    #RandomForest Training error 0.027
    #RandomForest Validation error 0.157
    tree = DecisionTreeClassifier(max_depth=13,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_train)
    tr_error = np.mean(y_pred != y_train)

    y_pred = tree.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Decision Tree Training error : %.3f" % tr_error)
    print("Decision Tree Validation error: %.3f" % te_error)
    #Depth: 11
    #Training error: 0.127
    #Testing error: 0.131

    # use softmax classifier on the same split (target is still column 8)
    model = LogisticRegression(C=10000,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    #LogisticRegression(softmax) Training error 0.651
    #LogisticRegression(softmax) Validation error 0.652

    from sklearn.preprocessing import PolynomialFeatures
    from sklearn.linear_model import LinearRegression
    from sklearn.gaussian_process.kernels import ConstantKernel, RBF
    from sklearn.kernel_ridge import KernelRidge
    from sklearn.gaussian_process import GaussianProcessRegressor
    from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    from sklearn.metrics import mean_squared_error
    poly = PolynomialFeatures(degree=4)
    X_train_sub = X_train[:1000]
    y_train_sub = y_train[:1000]
    X_train_ = poly.fit_transform(X_train_sub)
    model = LinearRegression()
    model.fit(X_train_, y_train_sub)
    model.score(X_train_, y_train_sub, sample_weight=None)
    y_pred = model.predict(X_train_)
    tr_error = mean_squared_error(y_pred, y_train_sub)

    # The model was fit on polynomial features, so the test rows need the
    # same expansion before predicting; the regression output is scored with
    # MSE like the training error, not with exact equality.
    X_test_ = poly.transform(X_test)
    y_pred = model.predict(X_test_)
    te_error = mean_squared_error(y_pred, y_test)
    print("Training error : %.3f" % tr_error)
    print("Validation error: %.3f" % te_error)

    #kernel = DotProduct() + WhiteKernel()
    y2 = X.values[:, 8]
    y2 = y2.astype('int')
    X2 = X
    ID = [
        'User_ID', 'Product_ID', 'Product_Category_1', 'Product_Category_2',
        'Product_Category_3'
    ]
    X2 = X2.drop(ID, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X2,
                                                        y2,
                                                        test_size=0.02,
                                                        random_state=42)
    gpr = GaussianProcessRegressor(kernel=None,
                                   random_state=0).fit(X_train, y_train)
    gpr.score(X_train, y_train)
    y_pred = gpr.predict(X_train)
    tr_error = mean_squared_error(y_pred, y_train)
    y_pred = gpr.predict(X_test)
    te_error = mean_squared_error(y_pred, y_test)
    clf = KernelRidge(alpha=0.5)
    clf.fit(X_train_sub, y_train_sub)
    clf.score(X_train_sub, y_train_sub, sample_weight=None)
def main():

    X = pd.read_csv(
        '../data/BlackFriday.csv'
    )  # names =("User_ID", "Product_ID", "Gender", "Age", "Occupation", "City_Category", "Stay_In_Current_City_Years", "Marital_Status,", "Product_Category_1","Product_Category_2","Product_Category_3", "Purchase" ))
    N, d = X.shape
    X.info()
    X.sort_values('User_ID').head(10)
    X['User_ID'].value_counts().count()  #5,891 customers
    # fill missing values with 0
    # (?) need to calculate percentage of missing value?
    X = X.fillna(0)
    # change gender to 0 and 1
    X['Gender'] = X['Gender'].apply(change_gender)
    # change age to 0 to 6
    X['Age'] = X['Age'].apply(change_age)
    # change city categories to 0 to 2
    X['City_Category'] = X['City_Category'].apply(change_city)
    # change the year to integer
    X['Stay_In_Current_City_Years'] = X['Stay_In_Current_City_Years'].apply(
        change_year)

    #predict age
    # Make y matrix to be the age
    y = np.zeros((N, 1))
    y = X.values[:, 3]
    y = y.astype('int')

    # X_no_age matrix deletes the Age column in the original dataset
    X_no_age = X
    ID = ['User_ID', 'Product_ID', 'Age']
    X_no_age = X_no_age.drop(ID, axis=1)
    #print(X.shape)

    # split the data into training and test set using sklearn build-in function
    # the test_size = 0.2
    # number of test examples = 107516
    # number of training examples = 430061
    X_train, X_test, y_train, y_test = train_test_split(X_no_age,
                                                        y,
                                                        test_size=0.2)

    #  model = KNeighborsClassifier(n_neighbors=5, metric = 'cosine')
    #  model.fit(X_train, y_train)

    #   y_pred = model.predict(X_train)
    #   tr_error = np.mean(y_pred != y_train)

    #   y_pred = model.predict(X_test)
    #   te_error = np.mean(y_pred != y_test)
    #   print("Training error to predict age: %.3f" % tr_error)
    #   print("Testing error to predict age: %.3f" % te_error)

    e_depth = 20
    s_depth = 1

    train_errors = np.zeros(e_depth - s_depth)
    test_errors = np.zeros(e_depth - s_depth)

    for i, d in enumerate(range(s_depth, e_depth)):
        print("\nDepth: %d" % d)

        model = DecisionTreeClassifier(max_depth=d,
                                       criterion='entropy',
                                       random_state=1)
        model.fit(X_train, y_train)

        y_pred = model.predict(X_train)
        tr_error = np.mean(y_pred != y_train)

        y_pred = model.predict(X_test)
        te_error = np.mean(y_pred != y_test)
        print("Training error: %.3f" % tr_error)
        print("Testing error: %.3f" % te_error)

        train_errors[i] = tr_error
        test_errors[i] = te_error

    x_vals = np.arange(s_depth, e_depth)
    plt.title("The effect of tree depth on testing/training error")
    plt.plot(x_vals, train_errors, label="training error")
    plt.plot(x_vals, test_errors, label="testing error")
    plt.xlabel("Depth")
    plt.ylabel("Error")
    plt.legend()

    fname = os.path.join("..", "figs", "trainTest_age.pdf")
    plt.savefig(fname)
    print("\nFigure saved as '%s'" % fname)

    #use decision tree model to predict age
    tree = DecisionTreeClassifier(max_depth=13,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_train)
    tr_error = np.mean(y_pred != y_train)
    #Depth: 13
    #Training error: 0.352
    #Testing error: 0.373
    y_pred = tree.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Training error of predicting occupation: %.3f" % tr_error)
    print("Testing error: %.3f" % te_error)

    #use RandomForestClassifier
    model = RandomForestClassifier(criterion="entropy",
                                   n_estimators=10,
                                   max_features=None)
    model.fit(X_train, y_train)
    print("RandomForest Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("RandomForest Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    #use softmaxClassifier to predict occputation
    model = LogisticRegression(C=1,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))
    # result:
    # k=10: Training error: 0.526 Testing error: 0.630
    # k=3: Training error: 0.405 Testing error: 0.669
    # k=5: Training error: 0.462 Testing error: 0.650

    #----------------------------------------------------------------------------------------------------
    #to predict the occupation
    # Make y matrix to be the occupation
    y_occ = X.values[:, 4]
    y_occ = y_occ.astype('int')
    X_occ = X
    ID = [
        'User_ID', 'Product_ID', 'Occupation', 'Product_Category_1',
        'Product_Category_2', 'Product_Category_3'
    ]
    X_occ.drop(ID, inplace=True, axis=1)
    X_train, X_test, y_train, y_test = train_test_split(X_occ,
                                                        y_occ,
                                                        test_size=0.2)

    model = KNeighborsClassifier(n_neighbors=5)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_train)
    tr_error = np.mean(y_pred != y_train)

    y_pred = model.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Training error of predicting occupation: %.3f" % tr_error)
    print("Testing error of predicting occupation: %.3f" % te_error)

    #use decision tree model
    tree = DecisionTreeClassifier(max_depth=18,
                                  criterion='entropy',
                                  random_state=1)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_train)
    tr_error = np.mean(y_pred != y_train)

    y_pred = tree.predict(X_test)
    te_error = np.mean(y_pred != y_test)
    print("Training error of predicting occupation: %.3f" % tr_error)
    print("Testing error: %.3f" % te_error)

    #use softmaxClassifier to predict occputation
    model = LogisticRegression(C=10000,
                               fit_intercept=False,
                               solver='lbfgs',
                               multi_class='multinomial')
    model.fit(X_train, y_train)
    print("LogisticRegression(softmax) Training error %.3f" %
          utils.classification_error(model.predict(X_train), y_train))
    print("LogisticRegression(softmax) Validation error %.3f" %
          utils.classification_error(model.predict(X_test), y_test))

    # use isomap to visualize the data
    from sklearn.manifold import Isomap
    model = Isomap(n_components=2)
    ID = [
        'User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3',
        'Purchase'
    ]
    X_1 = X
    X_1 = X_1.drop(ID, axis=1)
    fig, ax = plt.subplots()
    Z = model.fit_transform(X_1[:10000])
    ax.scatter(Z[:, 0], Z[:, 1])
    plt.ylabel('z2')
    plt.xlabel('z1')
    plt.title('ISOMAP with 2components')
    fname = os.path.join("..", "figs", "ISOMAP_with_2_components.png")
    plt.savefig(fname)

    model = DBSCAN(eps=1, min_samples=3)
    y = model.fit_predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet", s=5)

    # clustering the 2 dimensional plot
    model = KMeans(n_clusters=5, random_state=0)
    model.fit(Z)
    y = model.predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet")
    plt.ylabel('z2')
    plt.xlabel('z1')
    plt.title('ISOMAP with k_means of 5 clusters')
    plt.show()
    fname = os.path.join("..", "figs", "kmeans.png")
    plt.savefig(fname)
    #compress in 3 dimension
    n_compoents = 3
    model = Isomap(n_components=3)
    Z = model.fit_transform(X_1[:5000])
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(Z[:, 0], Z[:, 1], Z[:, 2], c='b')
    ax.set_zlabel('z3')
    ax.set_ylabel('z2')
    ax.set_xlabel('z1')
    plt.title('ISOMAP with 3')
    fname = os.path.join("..", "figs", "ISOMAP_with_3_components.png")
    plt.savefig(fname)

    #use PCA to study the data
    ID = ['User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3']
    X_1 = X
    X_1 = X_1.drop(ID, axis=1)
    model = PCA(n_components=3, svd_solver='auto')
    Z = model.fit_transform(X_1[:10000])
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.scatter(Z[:, 0], Z[:, 1], Z[:, 2], c='r')
    ax.set_zlabel('z3')
    ax.set_ylabel('z2')
    ax.set_xlabel('z1')
    plt.title('PCA with 3 components')
    plt.show()
    print(model.explained_variance_ratio_)
    fname = os.path.join("..", "figs", "PCA.png")
    plt.savefig(fname)
    #use pca to study the data 2 componetns
    ID = ['User_ID', 'Product_ID', 'Product_Category_2', 'Product_Category_3']
    X_1 = X
    X_1 = X_1.drop(ID, axis=1)
    model = PCA(n_components=2, svd_solver='auto')
    Z = model.fit_transform(X_1[:100000])
    fig = plt.figure()
    plt.title('PCA with 2 components')
    plt.scatter(Z[:, 0], Z[:, 1], c='r', cmap="jet", s=5)
    plt.ylabel('z2')
    plt.xlabel('z1')
    fname = os.path.join("..", "figs", "PCA_with_2_components.png")
    print(model.explained_variance_ratio_)
    plt.savefig(fname)
    #clustering
    ID = ['User_ID', 'Product_ID']
    X_1 = X
    X_1 = X_1.drop(ID, axis=1)
    model = PCA(n_components=2, svd_solver='auto')
    Z = model.fit_transform(X_1[:1000])
    model = DBSCAN(eps=1, min_samples=3)
    y = model.fit_predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet", s=5)
    plt.ylabel('z2')
    plt.xlabel('z1')
    fname = os.path.join("..", "figs", "clustering_from_PCA.png")
    plt.savefig(fname)
    model = KMeans(n_clusters=4, random_state=0)
    model.fit(Z)
    y = model.predict(Z)
    plt.scatter(Z[:, 0], Z[:, 1], c=y, cmap="jet")
    plt.ylabel('z2')
    plt.xlabel('z1')
    plt.title('PCA with NN=%d with k_means from 2 components')
    plt.show()
Exemple #18
0
        y_int = np.int32(y)

        lams = [2, 1.75, 1.5, 1.25, 1]
        test_error = []
        train_error = []

        train_bias = np.ones((X.shape[0], 1))
        test_bias = np.ones((Xtest.shape[0], 1))
        X = np.hstack((train_bias, X))
        Xtest = np.hstack((test_bias, Xtest))

        for lammy in lams:
            model = linear_model.softmaxClassifier(lammy=lammy, epochs=10, alpha=1, batch=5000)
            model.fit(X, y, Y)
            pred = model.predict(Xtest)
            e = utils.classification_error(ytest, pred)
            print("at lambda ", lammy, "validation error is ", e)
            test_error = np.append(test_error, e)
            pred = model.predict(X)
            e = utils.classification_error(y, pred)
            print("at lambda ", lammy, "train error is ", e)
            train_error = np.append(train_error, e)

        plt.plot(lams, test_error, label="validation error")
        plt.plot(lams, train_error, label="training error")
        plt.title("Multi-Class Linear Classifier")
        plt.xlabel("Lambda")
        plt.ylabel("Error")
        fname = os.path.join("..", "figs", "linear.pdf")
        plt.savefig(fname)
        print("\nFigure saved as '%s" % fname)
Exemple #19
0
    parser = argparse.ArgumentParser()
    parser.add_argument("-q", "--question", required=True)
    io_args = parser.parse_args()
    question = io_args.question

    if question == "2":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data["X"], data["y"]
        XBinValid, yBinValid = data["Xvalid"], data["yvalid"]

        model = linear_model.logReg(maxEvals=400)
        model.fit(XBin, yBin)

        print(
            "\nlogReg Training error %.3f"
            % utils.classification_error(model.predict(XBin), yBin)
        )
        print(
            "logReg Validation error %.3f"
            % utils.classification_error(model.predict(XBinValid), yBinValid)
        )
        print("# nonZeros: %d" % (model.w != 0).sum())

    elif question == "2.1":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data["X"], data["y"]
        XBinValid, yBinValid = data["Xvalid"], data["yvalid"]

        model = linear_model.logRegL2(lammy=1.0, maxEvals=400)
        model.fit(XBin, yBin)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--question', required=True)
    io_args = parser.parse_args()
    question = io_args.question

    if question == "2":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logReg(maxEvals=400)
        model.fit(XBin, yBin)

        print("\nlogReg Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logReg Validation error %.3f" %
              utils.classification_error(model.predict(XBinValid), yBinValid))
        print("# nonZeros: %d" % (model.w != 0).sum())

    elif question == "2.1":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logRegL2(maxEvals=400, l=1.0)
        model.fit(XBin, yBin)

        print("\nlogRegL2 Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logRegL2 Validation error %.3f" %
Exemple #21
0
    io_args = parser.parse_args()
    question = io_args.question

    if question == '1.1':
        dataset = utils.load_dataset('citiesSmall')
        X = dataset['X']
        y = dataset['y']
        Xtest = dataset['Xtest']
        ytest = dataset['ytest']
        #model = knn.fit(X,y,3)
        #model = knn.fit(X,y,1)
        model = knn.fit(X, y, 10)

        y_pred_tr = knn.predict(model, X)
        y_pred_te = knn.predict(model, Xtest)
        trerror = utils.classification_error(y_pred_tr, y)
        teerror = utils.classification_error(y_pred_te, ytest)

        print(trerror)
        print(teerror)

        utils.plot_2dclassifier(model, Xtest, ytest)

        # part 1: implement knn.predict
        # part 2: print training and test errors for k=1,3,10 (use utils.classification_error)
        # part 3: plot classification boundaries for k=1 (use utils.plot_2dclassifier)

    if question == '1.2':
        dataset = utils.load_dataset('citiesBig1')
        X = dataset['X']
        y = dataset['y']
Exemple #22
0
        reg_err = utils.classification_error(reg_pred, leave_out_data[1])
        average = average + reg_err
    average = average / 5
    return average


# Sweep every candidate regularization value in `alllam` and record one entry
# per value in each of the three lists below.
lamerrs = []      # value returned by lambdaError (printed as cross-validation error)
lamclasserr = []  # classification error on (X_test, y_test)
lamtesterr = []   # classification error on (X_train, y_train), despite the name

for i in range(len(alllam)):
    lam = alllam[i]
    clf = LogisticRegression(lam)
    clf.fit(X_train, y_train)

    # Predict on both splits first, then score them.
    held_out_pred = clf.predict(X_test)
    fitted_pred = clf.predict(X_train)
    held_out_err = utils.classification_error(held_out_pred, y_test)
    fitted_err = utils.classification_error(fitted_pred, y_train)
    lamclasserr.append(held_out_err)
    lamtesterr.append(fitted_err)

    cv_err = lambdaError(lam, folds)
    print("Training Error: ", fitted_err, "|Test Error: ", held_out_err,
          "|Cross Validation Error: ", cv_err, "|Lambda: ", lam)
    lamerrs.append(cv_err)

print(lamerrs)

training_cumulative = []
class_cumulative = []
datasize = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for i in range(0, len(x_data)):
    logistic.fit(x_data[i], y_data[i])
Exemple #23
0
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('-q', '--question', required=True)
    io_args = parser.parse_args()
    question = io_args.question

    if question == "2":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logReg(maxEvals=400)
        model.fit(XBin, yBin)

        print("\nlogReg Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logReg Validation error %.3f" %
              utils.classification_error(model.predict(XBinValid), yBinValid))
        print("# nonZeros: %d" % (model.w != 0).sum())

    elif question == "2.1":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logRegL2(lammy=1.0, maxEvals=400)
        model.fit(XBin, yBin)

        print("\nlogRegL2 Training error %.3f" %
              utils.classification_error(model.predict(XBin), yBin))
        print("logRegL2 Validation error %.3f" %
Exemple #24
0
    parser = argparse.ArgumentParser()
    parser.add_argument('-q','--question', required = True)
    io_args = parser.parse_args()
    question = io_args.question


    if question == "2":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        model = linear_model.logReg(maxEvals=400, verbose=1)
        model.fit(XBin,yBin)

        print("\nlogReg Training error %.3f" % utils.classification_error(model.predict(XBin), yBin))
        print("logReg Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid))
        print("# nonZeros: %d" % (model.w != 0).sum())

    elif question == "2.1":
        data = utils.load_dataset("logisticData")
        XBin, yBin = data['X'], data['y']
        XBinValid, yBinValid = data['Xvalid'], data['yvalid']

        # Fit logRegL2 model
        model = linear_model.logRegL2(lammy=1.0, maxEvals=400, verbose=1)
        model.fit(XBin,yBin)

        print("\nlogRegL2 Training error %.3f" % utils.classification_error(model.predict(XBin), yBin))
        print("logRegL2 Validation error %.3f" % utils.classification_error(model.predict(XBinValid), yBinValid))
        print("# nonZeros: %d" % (model.w != 0).sum())
        best_batch = 0
        best_alpha = 0

        for m in range(3):
            for a in range(3):
                val_error = []
                # cross validation
                for train, validate in kf.split(X, y):
                    model.fit(X[train],
                              y[train],
                              epoch=100,
                              minibatch=minibatch[m],
                              alpha=alpha[a])

                    # record validation error
                    v_error = utils.classification_error(
                        model.predict(X[validate]), y[validate])
                    val_error.append(v_error)

                avg_val_error = np.average(np.asarray(val_error))
                print("batch size: {0}, alpha: {1}, validation error: {2}".
                      format(minibatch[m], alpha[a], avg_val_error))

                if avg_val_error < min_val_error:
                    min_val_error = avg_val_error
                    best_batch = minibatch[m]
                    best_alpha = alpha[a]

        print("When batch size is {0}, alpha is {1}, test error is {2}".format(
            best_batch, best_alpha,
            utils.classification_error(model.predict(Xtest), ytest)))
Exemple #26
0
    ## LOCAL MODELS
    model1 = logistic_model.logRegL1(XBin[0:cut1,:], yBin[0:cut1], verbose=0, lammy=1, maxEvals=400)
    model2 = logistic_model.logRegL1(XBin[cut1+1:cut2,:], yBin[cut1+1:cut2], verbose=0, lammy=1, maxEvals=400)
    model3 = logistic_model.logRegL1(XBin[cut2+1:cut3,:], yBin[cut2+1:cut3], verbose=0, lammy=1, maxEvals=400)
    model4 = logistic_model.logRegL1(XBin[cut3+1:cut4,:], yBin[cut3+1:cut4], verbose=0, lammy=1, maxEvals=400)
    model5 = logistic_model.logRegL1(XBin[cut4+1:cut5,:], yBin[cut4+1:cut5], verbose=0, lammy=1, maxEvals=400)

    model1.fit()
    model2.fit()
    model3.fit()
    model4.fit()
    model5.fit()

    print("model1 Training error %.3f" % 
        utils.classification_error(model1.predict(XBin[0:cut1,:]), yBin[0:cut1]))
    print("model2 Training error %.3f" % 
        utils.classification_error(model2.predict(XBin[cut1+1:cut2,:]), yBin[cut1+1:cut2]))
    print("model3 Training error %.3f" % 
        utils.classification_error(model3.predict(XBin[cut2+1:cut3,:]), yBin[cut2+1:cut3]))
    print("model4 Training error %.3f" % 
        utils.classification_error(model4.predict(XBin[cut3+1:cut4,:]), yBin[cut3+1:cut4]))
    print("model5 Training error %.3f" % 
        utils.classification_error(model5.predict(XBin[cut4+1:cut5,:]), yBin[cut4+1:cut5]))

    print("model1 Validation error %.3f" % 
        utils.classification_error(model1.predict(XBinValid), yBinValid))
    print("model2 Validation error %.3f" % 
        utils.classification_error(model2.predict(XBinValid), yBinValid))
    print("model3 Validation error %.3f" % 
        utils.classification_error(model3.predict(XBinValid), yBinValid))
Exemple #27
0
    def predict(self, X):
        """Predict a 0/1 label for every row of ``X``.

        A float32 placeholder with the shape of ``X`` is created, the sign of
        its product with ``self.w`` is evaluated in ``self.session``, and the
        signs are turned into labels: -1 becomes 0, any other value becomes 1.
        The result is squeezed before being returned.
        """
        inputs = tf.placeholder(tf.float32, shape=X.shape)
        signs = tf.sign(tf.matmul(inputs, self.w))
        evaluated = self.session.run(signs, feed_dict={inputs: X})
        labels = np.where(evaluated == -1, 0, 1)
        return np.squeeze(labels)

    def compute_cost(self, X, y):
        """Build the regularized logistic-loss tensor for ``X`` and ``y``.

        The data term is ``sum(log(1 + exp(-y * (X @ w))))``. It is evaluated
        with ``tf.nn.softplus``, which is the same function written so that
        ``exp`` cannot overflow; the naive form yields ``inf`` once
        ``-y * (X @ w)`` becomes large.

        The penalty depends on ``self.loss``: ``'l2'`` adds
        ``lammy * sum(w ** 2) / 2``; any other value adds
        ``lammy * sum(|w|)``.

        NOTE(review): the formula presumably expects labels in {-1, +1} whose
        shape broadcasts elementwise against ``X @ self.w`` -- confirm against
        the caller.
        """
        # softplus(z) == log(1 + exp(z)), computed in a numerically stable way.
        data_loss = tf.reduce_sum(tf.nn.softplus(-y * (X @ self.w)))
        if self.loss == 'l2':
            return data_loss + self.lammy * tf.reduce_sum(self.w ** 2) / 2
        # Every non-'l2' setting falls back to the L1 penalty.
        return data_loss + self.lammy * tf.reduce_sum(tf.abs(self.w))


# Script entry point: fit the TensorFlow BinaryClassification model and an
# svm.SVC on the same split and print the test error of each.
if __name__ == '__main__':
    X_train, X_test, y_train, y_test = utils.preprocess_heart()

    # The model is handed the session, so fitting and predicting both happen
    # inside the `with` block, before the session is closed.
    with tf.Session() as sess:
        model = BinaryClassification(sess, verbose=1, loss='l2', learning_rate=0.00001, num_epochs=500)
        model.fit(X_train, y_train)
        pred = model.predict(X_test)

    # `pred` was already fetched above, so it can be scored after the session
    # has exited.
    print("The test error is: ", utils.classification_error(y_test, pred))

    # Comparison run: an SVC with default parameters. `model` and `pred` are
    # rebound here, replacing the TensorFlow results.
    model = svm.SVC()
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    print("The test error from sk-learn SVM is: ", utils.classification_error(y_test, pred))
    model4 = logistic_model.logRegL2(XBin[cut3 + 1:cut4, :],
                                     yBin[cut3 + 1:cut4],
                                     lammy=0.1,
                                     verbose=0,
                                     maxEvals=400)
    model4.fit()

    model5 = logistic_model.logRegL2(XBin[cut4 + 1:cut5, :],
                                     yBin[cut4 + 1:cut5],
                                     lammy=0.1,
                                     verbose=0,
                                     maxEvals=400)
    model5.fit()

    print("model1 Validation error %.3f" %
          utils.classification_error(model1.predict(XBinValid), yBinValid))
    print("model2 Validation error %.3f" %
          utils.classification_error(model2.predict(XBinValid), yBinValid))
    print("model3 Validation error %.3f" %
          utils.classification_error(model3.predict(XBinValid), yBinValid))
    print("model4 Validation error %.3f" %
          utils.classification_error(model4.predict(XBinValid), yBinValid))
    print("model5 Validation error %.3f" %
          utils.classification_error(model5.predict(XBinValid), yBinValid))

    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(XBin, yBin)
    print("sklearn sgd validation error %.3f" %
          utils.classification_error(clf.predict(XBinValid), yBinValid))

    svmclf = LinearSVC()