Example #1
def main():
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        seed=1)

    clf = LogisticRegression()
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the results
    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Logistic Regression",
                      accuracy=accuracy,
                      legend_labels=data.target_names)
Example #2
def digit_recognition():
    print('\nDigit recognition using Logistic Regression\n')
    print('Initiating Data Load...')
    digits = datasets.load_digits()
    X, y = digits.data, digits.target

    pca = PCA()
    X = pca.transform(X, num_components=23)
    y = one_hot_encode(y)

    size = len(X)
    indices = list(range(size))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    print('Constructing classifier...')
    size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(size)
    classifier.fit(X_train, y_train)

    print('Generating test predictions...')
    predictions = classifier.predict(X_test)

    accuracy = np.sum(
        [all(y_true == y_pred)
         for y_true, y_pred in zip(y_test, predictions)]) / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
Example #3
def main():
    # Load dataset
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    clf1 = linear_model.LogisticRegression()
    clf1.fit(X_train, y_train)
    y_pred = clf1.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("sklearn lr Accuracy:", accuracy)

    clf2 = LogisticRegression()
    clf2.fit(X_train, y_train)
    y_pred = clf2.predict(X_test)
    y_pred = np.reshape(y_pred, y_test.shape)

    accuracy = accuracy_score(y_test, y_pred)
    print("Our lr Accuracy:", accuracy)
Example #4
def iris_classification():
    print('\nIris classification using Logistic Regression\n')
    print('Initiating Data Load...')

    iris = datasets.load_iris()
    # X, y = iris.data, iris.target
    # y = one_hot_encode(y)

    X, y = iris.data[iris.target != 2], iris.target[iris.target != 2]
    y = y.reshape(y.shape[0], 1)

    size = len(X)
    indices = list(range(size))
    np.random.shuffle(indices)
    X, y = X[indices], y[indices]

    train_size = int(0.8 * len(X))
    X_train, X_test = X[:train_size], X[train_size:]
    y_train, y_test = y[:train_size], y[train_size:]

    print('Data load complete!')

    print('Constructing classifier...')
    size = (X_train.shape[-1], y_train.shape[-1])
    classifier = LogisticRegression(size)
    classifier.fit(X_train, y_train)

    print('Generating test predictions...')
    predictions = classifier.predict(X_test)

    accuracy = np.sum(
        [all(y_true == y_pred)
         for y_true, y_pred in zip(y_test, predictions)]) / len(predictions) * 100.
    print("Accuracy = {:.2f}%".format(accuracy))
Example #5
def test_logistic_regression():
    X = np.random.normal(size=(100, 2))
    y = np.where(X[:, 0] > 0.5, 1, 0).reshape(-1, 1)
    lr = LogisticRegression()
    lr.fit(X, y)
    pred = 1 if lr.predict(X)[-1] > 0.5 else 0
    assert pytest.approx(pred) == y[-1]
Example #6
def main():

    # Get training matrices for logistic regression model
    x, y = get_train_matrices()

    # Create instance of LogisticRegression with the training matrices
    logistic_regression = LogisticRegression(x, y)

    # Fit with learning rate, no of iterations and regularization(L2) parameter
    logistic_regression.fit(0.01, 100000, 0)

    # Print the learned weights and biases
    print("So, the weights and biases become:\nWeights:\n {}\nBiases:\n {}"
          .format(logistic_regression.w, logistic_regression.c))

    # Validate the model by printing the performance metrics
    logistic_regression.validate()

    # Graph the curve of cost vs no of epochs
    logistic_regression.graph_cost_vs_epochs()

    # Predict for the input data in test folder and save as output.csv in test folder
    x_test = pd.read_csv('test/input.csv').values[:, 1:]
    y_test = logistic_regression.predict(x_test)
    df_predict = pd.DataFrame({'y': y_test.reshape(-1)})
    df_predict.to_csv('test/output.csv')
Example #7
def test_integ_fit():
    test_x = [np.array([[1, 2, 3], [2, 3, 4], [3, 4, 5]])]
    test_y = [np.array([1, 0, 1])]
    expected = [np.array([0.01328192, 0.06222676, 0.1111716])]
    lr_model = LogisticRegression()
    for idx in range(len(test_x)):
        lr_model.fit(test_x[idx], test_y[idx])
        assert pytest.approx(expected[idx], 1e-06) == lr_model.parameters
Example #8
def train():
    fname = sys.argv[1]
    output_fname = sys.argv[2]

    X, y = get_data(data=read_train_csv(fname))
    model = LogisticRegression(iteration=30000)

    model.fit(X, y)

    model.save(output_fname)
Example #9
def runML(meth, itrs, data_train, data_test, labels_train, labels_test):
    print(meth, datetime.now().time())
    model = LogisticRegression(method=meth, max_iters=itrs)
    model.fit(data_train, labels_train)
    print(datetime.now().time())
    prediction = model.predict(data_test)
    tagscores = LogisticRegression.tagAccuracy(labels_test, prediction)
    score = np.mean(tagscores)
    print("  score tags: mean: {}, max: {}, min: {}".format(score, max(tagscores), min(tagscores)))
    print("  error rate: {}".format(1 - score))
    print(datetime.now().time())
Example #10
def standard_lr(x_train, y_train, x_valid, y_valid):
    from sklearn.linear_model import LogisticRegression
    lr = LogisticRegression(penalty='l2', max_iter=500, solver='sag', multi_class='ovr')
    lr.fit(x_train, y_train)
    pre = lr.predict(x_valid)

    correct = 0
    for i in range(len(y_valid)):
        if pre[i] == y_valid[i]:
            correct += 1
    print(correct * 1.0 / len(y_valid))
Example #11
def lambdaError(lam, folds):
    average = 0
    logreg = LogisticRegression(lam)
    for i in range(0, 5):
        leave_out_data, training_data = utils.partition_cross_validation_fold(
            folds, i)
        logreg.fit(training_data[0], training_data[1])
        reg_pred = logreg.predict(leave_out_data[0])
        reg_err = utils.classification_error(reg_pred, leave_out_data[1])
        average = average + reg_err
    average = average / 5
    return average
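# Hypothetical usage: sweep candidate regularization strengths and keep the
# one with the lowest cross-validation error (lambdas and folds assumed):
# lambdas = [0.01, 0.1, 1, 10]
# best_lam = min(lambdas, key=lambda lam: lambdaError(lam, folds))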
Example #12
def repeat(train_X, train_Y, num, feature_list, intercept):
    # Impute missing values with each column's mean
    train_X = train_X.fillna(train_X.mean())

    models = []
    for i in range(3):
        lr = LogisticRegression(fit_intercept=intercept)
        lr.fit(train_X[feature_list], train_Y.values[:, i])
        models.append(lr)
    model = models[num]
    return modelInfo(model, train_X, train_Y.values[:, num], feature_list, intercept)
Example #13
def p02cde(train_path, valid_path, test_path, pred_path):
    """Logistic regression with Newton's Method

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for evaluation.
        test_path: Path to CSV file containing dataset for testing.
        pred_path: Path to save predictions.
    """
    pred_path_c = pred_path.replace(WILDCARD, "c")
    pred_path_d = pred_path.replace(WILDCARD, "d")
    pred_path_e = pred_path.replace(WILDCARD, "e")

    # Part (c)
    # Train classifier
    x_train, y_train = utils.load_dataset(train_path,
                                          label_col="t",
                                          add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    # Validate classifier
    x_test, y_test = utils.load_dataset(valid_path,
                                        label_col="t",
                                        add_intercept=True)
    t_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_c))
    np.savetxt(pred_path_c, t_pred)

    # Part (d)
    x_train, y_train = utils.load_dataset(train_path,
                                          label_col="y",
                                          add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)
    # Validate classifier
    x_test, y_test = utils.load_dataset(test_path,
                                        label_col="t",
                                        add_intercept=True)
    y_pred = model.predict(x_test)
    utils.plot(x_test, y_test, model.theta, "{}.png".format(pred_path_d))
    np.savetxt(pred_path_d, y_pred)

    # Part (e) find corrections
    x_val, y_val = utils.load_dataset(valid_path,
                                      label_col="y",
                                      add_intercept=True)
    x_in_V = x_val[y_val == 1]
    h = model.predict(x_in_V)
    alpha = np.mean(h)
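    # Hypothetical continuation: pred_path_e is defined above but never used
    # in this excerpt. In the positive-only-labels setting, alpha = E[h(x) | t=1]
    # rescales the partial-label predictions, roughly:
    t_pred_corrected = np.clip(y_pred / alpha, 0, 1)
    np.savetxt(pred_path_e, t_pred_corrected)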
Example #14
def test_fit_functional():
    import sklearn.model_selection
    import sklearn.datasets
    import numpy as np

    from logistic_regression import LogisticRegression, accuracy
    X = np.zeros((1000, 3), dtype=np.float32)
    X[:, -1] = 1
    features, targets = sklearn.datasets.make_blobs(n_samples=1000,
                                                    n_features=2,
                                                    centers=2,
                                                    cluster_std=1,
                                                    random_state=1234)
    X[:, [0, 1]] = features
    y = targets[:, np.newaxis]

    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y)
    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(X_train,
                                     y_train,
                                     X_val,
                                     y_val,
                                     num_epochs=20,
                                     batch_size=4,
                                     alpha=0.1,
                                     _lambda=0.0)
    predictions = model.predict(X_val)
    assert accuracy(predictions, y_val) >= 0.99
Example #15
def test_fit_functional():
    import sklearn.model_selection
    import numpy as np

    from logistic_regression import LogisticRegression, accuracy
    X = np.zeros((900, 3), dtype=np.float32)
    num_samples = 30

    xx = np.linspace(-5, 5, num_samples)
    XX, YY = np.meshgrid(xx, xx)
    X[:, 0] = XX.flatten()
    X[:, 1] = YY.flatten()
    X[:, -1] = 1  # a column of 1's for the bias trick
    Z = 0.1 * XX + 0.2 * YY + 0.4
    # Binarize: label 1 where the linear function is positive (assumed
    # thresholding; raw Z values are not valid classification targets)
    y = (Z > 0).astype(np.float32).reshape(-1, 1)
    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(
        X, y)
    model = LogisticRegression(input_dimensions=2)
    train_xent, val_xent = model.fit(X_train,
                                     y_train,
                                     X_val,
                                     y_val,
                                     num_epochs=20,
                                     batch_size=4,
                                     alpha=0.1,
                                     _lambda=0.0)
    predictions = model.predict(X_val)
    assert accuracy(predictions, y_val) >= 0.99
Example #16
def p01b(train_path, eval_path, pred_path):
    """Logistic regression with Newton's Method

    Args:
        train_path: Path to CSV file containing dataset for training.
        eval_path: Path to CSV file containing dataset for evaluation.
        pred_path: Path to save predictions.
    """
    # Train classifier
    x_train, y_train = utils.load_dataset(train_path, add_intercept=True)
    model = LogisticRegression(eps=1e-5)
    model.fit(x_train, y_train)

    # Validate classifier
    x_val, y_val = utils.load_dataset(eval_path, add_intercept=True)
    y_pred = model.predict(x_val)
    utils.plot(x_val, y_val, model.theta, "{}.png".format(pred_path))
    np.savetxt(pred_path, y_pred)
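# The Newton's-Method snippets above assume a from-scratch LogisticRegression.
# As a rough sketch (not the actual class used here), the Newton update on the
# mean negative log-likelihood looks like this:
def newton_fit(x, y, eps=1e-5, max_iter=100):
    n, d = x.shape
    theta = np.zeros(d)
    for _ in range(max_iter):
        h = 1 / (1 + np.exp(-x.dot(theta)))          # sigmoid predictions
        grad = x.T.dot(h - y) / n                    # gradient
        hess = (x.T * (h * (1 - h))).dot(x) / n      # X^T diag(h(1-h)) X / n
        step = np.linalg.solve(hess, grad)           # Newton direction
        theta -= step
        if np.linalg.norm(step, 1) < eps:            # converged
            break
    return theta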
Example #17
def fitting():
    data = pd.read_csv('student_score.txt',
                       names=['Exam1', 'Exam2', 'admission'])
    x = data[['Exam1', 'Exam2']]
    y = data['admission']

    print(x.mean())
    print(x.max() - x.min())

    x = (x - x.mean()) / (x.max() - x.min())

    alpha = 10
    max_iter = 150
    model = LogisticRegression(alpha, max_iter)
    loss, _ = model.fit(x, y)

    # The hard-coded constants are the Exam1/Exam2 means and ranges (the values
    # printed above), normalizing the raw scores the same way as the training data.
    p = model.predict(
        np.array([[
            1, (45.0 - 65.644274) / 69.769035, (85.0 - 66.221998) / 68.266173
        ]]), False)
    print('Predict %.3f when Exam1 equals 45 and Exam2 equals 85' % p)

    plt.subplot(2, 1, 1)
    plt.plot(np.arange(1, max_iter + 1), loss)
    plt.title('Loss Curve')

    plt.subplot(2, 1, 2)
    negative = data[data['admission'] == 0]
    positive = data[data['admission'] == 1]
    plt.plot(negative['Exam1'], negative['Exam2'], 'yo')
    plt.plot(positive['Exam1'], positive['Exam2'], 'k+')

    print(model.w)

    # Decision boundary: w0 + w1*x1_norm + w2*x2_norm = 0, solved for Exam2 and
    # mapped back to raw units via the dataset means and ranges.
    bx = data['Exam1']
    by = (-68.266173 / model.w[2]) * ((
        (bx - 65.644274) / 69.769035) * model.w[1] + model.w[0]) + 66.221998

    x = data[['Exam1', 'Exam2']]
    x = (x - x.mean()) / (x.max() - x.min())

    p = [1 if i >= 0.5 else 0 for i in model.predict(x)]
    tp = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 1])
    tn = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 0])

    fp = sum([1.0 for vp, vy in zip(p, y) if vp == 1 and vy == 0])
    fn = sum([1.0 for vp, vy in zip(p, y) if vp == 0 and vy == 1])

    print(tp, tn, fp, fn)
    print('Accuracy %.2f' % ((tp + tn) / (tp + tn + fp + fn)))
    print('Precision %.2f' % (tp / (tp + fp)))
    print('Recall %.2f' % (tp / (tp + fn)))

    plt.plot(bx, by)

    plt.show()
Example #18
def test_logistic_regression():
    iris = datasets.load_iris()
    X = iris.data[:, [2, 3]]
    y = iris.target
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.3,
                                                        random_state=1,
                                                        stratify=y)
    X_train_01_subset = X_train[(y_train == 0) | (y_train == 1)]
    y_train_01_subset = y_train[(y_train == 0) | (y_train == 1)]
    lr = LogisticRegression(alpha=0.05, n_iter=1000, random_state=1)
    lr.fit(X_train_01_subset, y_train_01_subset)
    lr.plot_decision_regions(X=X_train_01_subset,
                             y=y_train_01_subset,
                             classifier=lr)
    plt.xlabel('Petal Length')
    plt.ylabel('Petal Width')
    plt.legend(loc='upper left')
    plt.show()
Example #19
def test_passing():

    X, Y = datasets.make_classification(n_samples=100, random_state=42)

    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=0.3, random_state=42)

    LR = LogisticRegression(20, seed=42)
    LR.fit(X_train, Y_train)

    Y_pred = LR.predict(X_test)
    Y_pred = (Y_pred > 0.5).clone().detach().type(torch.float32)

    Y_test = torch.tensor(Y_test, dtype=torch.float32)
    Y_test = torch.reshape(Y_test, (-1, 1))

    accuracy = 1 - torch.mean(torch.abs(Y_pred - Y_test)).item()

    assert abs(accuracy - 0.9666) < 0.01
Example #20
def cross_validate_logistic(X, y, alpha, num_iterations):
    num_data_points = len(y)
    one_fifth = math.ceil(num_data_points / 5)
    initial_theta = [0] * len(X.columns)
    sum_accuracy = 0
    for i in range(5):
        x_valid = X.iloc[one_fifth * i:one_fifth * (i + 1)]
        y_valid = y.iloc[one_fifth * i:one_fifth * (i + 1)]
        x_train = X.drop(X.index[one_fifth * i:one_fifth * (i + 1)])
        y_train = y.drop(y.index[one_fifth * i:one_fifth * (i + 1)])
        x_train, y_train = np.array(x_train), np.array(y_train)
        x_valid, y_valid = np.array(x_valid), np.array(y_valid)
        LR = LogisticRegression(initial_theta)
        LR.fit(x_train, y_train, alpha, num_iterations)
        prediction = LR.predict(x_valid)
        sum_accuracy += evaluate_acc(y_valid, prediction)
    return sum_accuracy / 5
Example #21
def sgd(mus, rates, decays, data, labels, data_train, labels_train,
        data_valid, labels_valid, data_test, labels_test):
    print "starting grid search for SGD"
    validation_results = {}
    dicts = []
    for mu in mus:
        for rate in rates:
            for decay in decays:
                print "trying mu={} rate={} decay={}".format(mu, rate, decay)
                model = LogisticRegression(method="sgd", mu=mu,
                                           rate=rate, decay=decay,
                                           random_state=0)
                model.fit(data_train, labels_train)
                prediction = model.predict(data_valid)
                score = accuracy_score(labels_valid, prediction)
                validation_results[(mu, rate, decay)] = score
                print "  score: {}".format(score)
                print "  error rate: {}".format(1 - score)

                d = dict(method="sgd", mu=mu, rate=rate, decay=decay,
                         score=score, lcl=model.lcl_,
                         rlcl=model.rlcl_, test=False)
                dicts.append(d)

    print "evaluating on test set"
    # get hyperparameters for highest accuracy on validation set
    mu, rate, decay = max(validation_results, key=validation_results.get)
    print "Using mu={} rate={} decay={}".format(mu, rate, decay)

    # train on entire train set and predict on test set
    model = LogisticRegression(method="sgd", mu=mu, rate=rate,
                               decay=decay, random_state=0)
    model.fit(data, labels)
    prediction = model.predict(data_test)
    score = accuracy_score(labels_test, prediction)

    print "SGD test score: {}, error rate: {}".format(score, 1 - score)

    d = dict(method="sgd", mu=mu, rate=rate, decay=decay, score=score,
             lcl=model.lcl_, rlcl=model.rlcl_, test=True)
    dicts.append(d)
    return pd.DataFrame(dicts)
Example #22
    def test_fit(self):
        model = LogisticRegression(2,
                                   epochs=1,
                                   update_method=generate_dummy_update(2))
        ws = [
            list(w) for w in model.fit(
                np.array([1, 2, 3])[None].T, np.array([1, 0, 1]))
        ]
        self.assertEqual(len(ws), 3)
        self.assertListEqual(ws[0], ws[2])
        self.assertNotEqual(ws[0], ws[1])
Example #23
def logistic_model(X,
                   y,
                   learning_rate,
                   no_of_iterations,
                   test_split_ratio=0.2):

    # bc = datasets.load_breast_cancer()
    # X, y = bc.data, bc.target

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_split_ratio, random_state=1234)

    regressor = LogisticRegression(learning_rate=learning_rate, n_iters=no_of_iterations)
    regressor.fit(X_train, y_train)
    predictions = regressor.predict(X_test)
    senst, speci, acc = evaluate(y_test, predictions)
    acc = accuracy(y_test, predictions)
    print("LR classification accuracy:", acc)

    return senst, speci, acc
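# Usage sketch mirroring the commented-out lines above (assumes sklearn is
# available; the hyperparameter values are illustrative):
# bc = datasets.load_breast_cancer()
# senst, speci, acc = logistic_model(bc.data, bc.target,
#                                    learning_rate=0.0001,
#                                    no_of_iterations=1000)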
Example #24
def validate_logistic_regression_for_wine_quality():
    num_of_folds = 5
    learning_rate = 0.000001
    max_iterations = 100
    df = pd.read_csv("../data/winequality/winequality-red.csv", sep=";")
    df['classified'] = [1 if x >= 6 else 0 for x in df["quality"]]
    features = [
        'fixed acidity', 'volatile acidity', 'citric acid',
        'residual sugar', 'chlorides', 'free sulfur dioxide',
        'total sulfur dioxide', 'density', 'pH', 'sulphates', 'alcohol'
    ]
    fold_size = int(round(df.shape[0] / num_of_folds))
    for i in range(num_of_folds):
        x_test = df[features][i * fold_size:fold_size + i * fold_size]
        x_train_part_1 = df[features][fold_size + i * fold_size:]
        x_train_part_2 = df[features][:i * fold_size]
        # DataFrame.append was removed in pandas 2.0; use pd.concat instead
        x_train = pd.concat([x_train_part_1, x_train_part_2])
        print(x_train)
        print(x_test)

        y_test = df[['classified']][i * fold_size:fold_size + i * fold_size]
        y_train_part_1 = df[['classified']][fold_size + i * fold_size:]
        y_train_part_2 = df[['classified']][:i * fold_size]
        y_train = pd.concat([y_train_part_1, y_train_part_2])
        print(y_train)
        print(y_test)

        model = LogisticRegression()
        model.fit(learning_rate, max_iterations, np.array(x_train),
                  np.array(y_train))
        y_pred = model.predict(np.array(x_test))
        print(y_pred)
        print("score", model.score(np.array(x_test), np.array(y_test)))
Example #25
def lbfgs(mus, data, labels, data_train, labels_train,
          data_valid, labels_valid, data_test, labels_test):
    print "starting grid search for L-BFGS"
    validation_results = {}
    dicts = []
    for mu in mus:
        print "trying mu={}".format(mu)
        model = LogisticRegression(method="lbfgs", mu=mu)
        model.fit(data_train, labels_train)
        prediction = model.predict(data_valid)
        score = accuracy_score(labels_valid, prediction)
        validation_results[mu] = score
        print "  score: {}".format(score)
        print "  error rate: {}".format(1 - score)

        d = dict(method="lbfgs", mu=mu, rate=-1, decay=-1,
                 score=score, lcl=model.lcl_, rlcl=model.rlcl_,
                 test=False)
        dicts.append(d)

    print "evaluating on test set"

    # get hyperparameters for highest accuracy on validation set
    mu = max(validation_results, key=validation_results.get)

    print "Using mu of {}".format(mu)

    # train on entire train set and predict on test set
    model = LogisticRegression(method="lbfgs", mu=mu)
    model.fit(data, labels)
    prediction = model.predict(data_test)
    score = accuracy_score(labels_test, prediction)

    print "L-BFGS test score: {}, error rate: {}".format(score, 1 - score)

    d = dict(method="lbfgs", mu=mu, rate=-1, decay=-1,
             score=score, lcl=model.lcl_, rlcl=model.rlcl_, test=True)
    dicts.append(d)
    return pd.DataFrame(dicts)
Example #26
    def fit(self, X, y):
        X = np.asarray(X)
        y = np.asarray(y)

        # Convert y to {-1, 1}
        y = self._convert_y(y)

        # Initialise weights for all data points
        row_length = X.shape[0]
        self.weights = np.ones((self.n_estimators, row_length))
        self.alphas = np.zeros((self.n_estimators, 1))
        self.estimators = np.empty((self.n_estimators, 1), dtype=object)

        for time_step in range(self.n_estimators):

            # Use a weak classifier to fit on data
            weak_classifier = LogisticRegression(solver="sgd", epochs=5)
            X_weighted = self.weights[time_step].reshape(-1, 1) * X
            weak_classifier.fit(X_weighted, y)
            pred = weak_classifier.predict(X)

            # Get weighted error using the current round's weights
            weighted_sample_err = (np.sum(
                (pred != y) * self.weights[time_step])) / np.sum(
                    self.weights[time_step])

            # Alpha for current classifer
            alpha_t = 1 / 2 * np.log((
                (1 - weighted_sample_err) / weighted_sample_err) + 1e-16)
            self.alphas[time_step] = alpha_t
            self.estimators[time_step] = weak_classifier

            # Update weights of next time step for all data points
            if time_step == (self.n_estimators - 1):
                break
            self.weights[time_step +
                         1, :] = self.weights[time_step, :] * np.exp(
                             -y * alpha_t * pred)
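    # The excerpt above stores self.alphas and self.estimators but omits
    # prediction. A minimal predict consistent with fit (assuming each weak
    # learner outputs labels in {-1, 1}) could look like:
    def predict(self, X):
        X = np.asarray(X)
        agg = np.zeros(X.shape[0])
        # Weighted vote of the weak learners: sign(sum_t alpha_t * h_t(x))
        for alpha_t, clf in zip(self.alphas.ravel(), self.estimators.ravel()):
            agg += alpha_t * clf.predict(X)
        return np.sign(agg)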
Example #27
def test_logistic_regression():
    data = datasets.load_iris()
    X = normalize(data.data[data.target != 0])
    y = data.target[data.target != 0]
    y[y == 1] = 0
    y[y == 2] = 1

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        seed=1)

    clf = LogisticRegression(gradient_descent=True)
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Reduce dimension to two using PCA and plot the result
    Plot().plot_in_2d(X_test,
                      y_pred,
                      title="Logistic Regression",
                      accuracy=accuracy)
Example #28
def fitting():
    data = pd.read_csv('microchips.txt', names=['test1', 'test2', 'result'])
    x1 = data['test1']
    x2 = data['test2']
    y = data['result']

    x = map_feature(x1.values, x2.values)

    alpha = 0.1
    max_iter = 1500
    model = LogisticRegression(alpha, max_iter, 0)
    loss, _ = model.fit(x, y, False)

    p = model.predict(x, False)
    p = [1 if i > 0.5 else 0 for i in p]
    tp = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 1])
    tn = sum([1.0 for vp, vy in zip(p, y) if vp == vy and vy == 0])
    fp = sum([1.0 for vp, vy in zip(p, y) if vp == 1 and vy == 0])
    fn = sum([1.0 for vp, vy in zip(p, y) if vp == 0 and vy == 1])
    print(tp, tn, fp, fn)
    print('Accuracy %.3f' % ((tp + tn)/(tp + tn + fp + fn)))
    print('Precision %.3f' % (tp/(tp + fp)))
    print('Recall %.3f' % (tp/(tp + fn)))

    plt.figure(figsize=(6, 8))
    plt.subplot(2, 1, 1)
    plt.plot(np.arange(1, max_iter+1), loss)

    plt.subplot(2, 1, 2)
    positive = data[data['result'] == 1]
    negative = data[data['result'] == 0]
    plt.plot(positive['test1'], positive['test2'], 'k+')
    plt.plot(negative['test1'], negative['test2'], 'yo')


    x1, x2 = np.mgrid[-1:1.5:50j, -1:1.5:50j]
    p = np.zeros((50, 50))
    for i in range(50):
        for j in range(50):
            x = map_feature(np.array([x1[i, j]]), np.array([x2[i, j]])).squeeze()
            p[i, j] = x.dot(model.w)
    plt.contour(x1, x2, p, [0])


    plt.show()
Example #29
# Tail of an `evaluate` helper; the signature is inferred from the call below,
# and the metrics are assumed to come from sklearn.metrics.
def evaluate(model, X_test, y_test):
    from sklearn.metrics import accuracy_score, f1_score
    y_pred = model.predict(X_test)
    f1_value = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    base_accuracy = np.mean(y_test == 0)  # baseline: always predict class 0
    print('F1 Score: {:.4f}'.format(f1_value))
    print('Accuracy: {:.2f}%'.format(100 * accuracy))
    print('Baseline Accuracy: {:.2f}%'.format(100 * base_accuracy))


from sklearn.model_selection import train_test_split
# Features and target
X = data.copy()
y = X.pop('target')

# Split into training and testing
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=42)
import sys
sys.path.insert(0, '../')

from logistic_regression import LogisticRegression
lr_ = LogisticRegression(learning_rate=.1, gradient_descent=True)
lr_.fit(X_train, y_train)
evaluate(lr_, X_test, y_test)

# from linear_model import LogisticRegression
# lr_1 = LogisticRegression()
# lr_1.fit(X_train, y_train)
# evaluate(lr_1, X_test, y_test)
Example #30
mlp = MultilayerPerceptron(n_hidden=20)
perceptron = Perceptron()
decision_tree = DecisionTree()
random_forest = RandomForest(n_estimators=150)
support_vector_machine = SupportVectorMachine(C=1, kernel=rbf_kernel)

# ........
#  TRAIN
# ........
print "Training:"
print "\tAdaboost"
adaboost.fit(X_train, rescaled_y_train)
print "\tNaive Bayes"
naive_bayes.fit(X_train, y_train)
print "\tLogistic Regression"
logistic_regression.fit(X_train, y_train)
print "\tMultilayer Perceptron"
mlp.fit(X_train, y_train, n_iterations=20000, learning_rate=0.1)
print "\tPerceptron"
perceptron.fit(X_train, y_train)
print "\tDecision Tree"
decision_tree.fit(X_train, y_train)
print "\tRandom Forest"
random_forest.fit(X_train, y_train)
print "\tSupport Vector Machine"
support_vector_machine.fit(X_train, rescaled_y_train)

# .........
#  PREDICT
# .........
y_pred = {}
Example #31
from sklearn.model_selection import train_test_split

# Read the training data
f = open("../data/train.csv")
reader = csv.reader(f)
next(reader, None)  # skip header
data = [data for data in reader]
f.close()

X = np.asarray([x[1:] for x in data], dtype=np.int16)
y = np.asarray([x[0] for x in data], dtype=np.int16)

X = np.true_divide(X, 255)
# normalize image data to 0-1

del data  # free up the memory
print("loaded training data")

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=RandomState())

lr = LogisticRegression(C=0.35)
lr.fit(X_train, y_train, 10)
guesses = lr.predict(X_test)

score = 0.0
for g in range(guesses.shape[0]):
    if guesses[g] == y_test[g]:
        score += 1

print("Score: ", score / len(guesses))
Example #32
cv = KFold(n_splits=5)

results = []
i = 1
# For each permutation of the parameters
for learning_rate, tol, l2_strength in itertools.product(learning_rates, tols, l2_strengths):
    print(f'{i} of 27 permutations...')
    clf = LogisticRegression(learning_rate=learning_rate, tol=tol, l2_strength=l2_strength, max_iter=500)
    
    scores = [] # to store score from each split
    
    # Compute cross validation score
    for train_index, validation_index in cv.split(X_train):
        #train
        clf.fit(X_train[train_index, :], y_train[train_index])
        #score
        scores.append(clf.score(X_train[validation_index, :], y_train[validation_index]))
    
    # Store the results
    cv_score = np.mean(scores)
    params = {'learning_rate': learning_rate,
              'tol': tol,
              'l2_strength': l2_strength,
              'cv_score': cv_score}
    
    results.append(params)
    i += 1

# Print the parameters that gave the best CV score
best_score = 0
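# The excerpt ends at best_score = 0; a minimal completion that reports the
# best parameter combination from the stored results might be:
best = max(results, key=lambda r: r['cv_score'])
print('Best CV score {cv_score} with learning_rate={learning_rate}, '
      'tol={tol}, l2_strength={l2_strength}'.format(**best))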
Example #33
from util import read_file
from logistic_regression import LogisticRegression


data, labels = read_file('../1571/train.txt')
data_train, data_valid, labels_train, labels_valid = \
    train_test_split(data, labels, test_size=0.3, random_state=0)

mus = list(10 ** x for x in range(-8, 2))

sgd_scores = []
for mu in mus:
    sgd_model = LogisticRegression(method="sgd", mu=mu, rate=0.1,
                                   decay=0.6, random_state=0)
    sgd_model.fit(data_train, labels_train)
    predicted = sgd_model.predict(data_valid)
    sgd_scores.append(accuracy_score(labels_valid, predicted))

pp.figure()
pp.xscale('log')
pp.scatter(mus, sgd_scores)
pp.xlabel('regularization strength')
pp.ylabel('accuracy')
pp.savefig('./sgd_regularization.png')


lbfgs_scores = []
for mu in mus:
    sgd_model = LogisticRegression(method="lbfgs", mu=mu, rate=0.1,
                                   decay=0.6, random_state=0)
Example #34
def evaluate_performance():
    '''
    Evaluate the performance of decision trees, random forests, and logistic
    regression, averaged over repeated trials of 10-fold cross validation.

    Return:
      a matrix giving the performance that will contain the following entries:
      stats[0,0] = mean accuracy of decision tree
      stats[0,1] = std deviation of decision tree accuracy
      stats[1,0] = mean accuracy of random forest
      stats[1,1] = std deviation of random forest accuracy
      stats[2,0] = mean accuracy of logistic regression
      stats[2,1] = std deviation of logistic regression accuracy

    ** Note that your implementation must follow this API**
    '''

    # Load Data
    filename = 'data/SPECTF.dat'
    data = np.loadtxt(filename, delimiter=',')
    X = data[:, 1:]
    y = np.array(data[:, 0])
    n, d = X.shape

    all_accuracies_dt = []
    all_accuracies_lr = []
    all_accuracies_rf = []
    for trial in range(1):
        idx = np.arange(n)
        np.random.shuffle(idx)
        X = X[idx]
        y = y[idx]

        ind = np.arange(X.shape[0])
        classifier_dt = DecisionTree(9)
        classifier_lr = LogisticRegression(max_steps=10000,
                                           epsilon=1e-6,
                                           step_size=1,
                                           l=3)
        classifier_rf = RandomForest(ratio_per_tree=0.5,
                                     num_trees=100,
                                     max_tree_depth=8)
        scores_dt = []
        scores_lr = []
        scores_rf = []
        for i in range(10):
            test_ind = np.random.choice(ind,
                                        int(X.shape[0] / 10),
                                        replace=False)
            ind = np.setdiff1d(np.arange(X.shape[0]), test_ind)
            X_train, Y_train = X[ind], y[ind]
            X_test, Y_test = X[test_ind], y[test_ind]
            # train the decision tree
            classifier_dt.fit(X_train, Y_train)
            accuracy_dt = accuracy_score(
                Y_true=Y_test, Y_predict=classifier_dt.predict(X_test))
            scores_dt.append(accuracy_dt)
            # train the logistic regression (features need a prepended
            # column of ones for the intercept term)
            X_train_1 = np.hstack((np.ones((len(X_train), 1)), X_train))
            X_test_1 = np.hstack((np.ones((len(X_test), 1)), X_test))
            classifier_lr.fit(X_train_1, Y_train)
            accuracy_lr = accuracy_score(
                Y_true=Y_test, Y_predict=classifier_lr.predict(X_test_1))
            scores_lr.append(accuracy_lr)
            # train the random forest
            classifier_rf.fit(X_train, Y_train)
            accuracy_rf = accuracy_score(
                Y_true=Y_test, Y_predict=classifier_rf.predict(X_test)[0])
            scores_rf.append(accuracy_rf)
        all_accuracies_dt.append(np.mean(scores_dt))
        all_accuracies_lr.append(np.mean(scores_lr))
        all_accuracies_rf.append(np.mean(scores_rf))

    # aggregate the mean and standard deviation of the test accuracies
    meanDecisionTreeAccuracy = np.mean(all_accuracies_dt)
    stddevDecisionTreeAccuracy = np.std(all_accuracies_dt)
    meanLogisticRegressionAccuracy = np.mean(all_accuracies_lr)
    stddevLogisticRegressionAccuracy = np.std(all_accuracies_lr)
    meanRandomForestAccuracy = np.mean(all_accuracies_rf)
    stddevRandomForestAccuracy = np.std(all_accuracies_rf)

    # make certain that the return value matches the API specification
    stats = np.zeros((3, 2))
    stats[0, 0] = meanDecisionTreeAccuracy
    stats[0, 1] = stddevDecisionTreeAccuracy
    stats[1, 0] = meanRandomForestAccuracy
    stats[1, 1] = stddevRandomForestAccuracy
    stats[2, 0] = meanLogisticRegressionAccuracy
    stats[2, 1] = stddevLogisticRegressionAccuracy
    return stats
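# Quick usage check, following the stats layout documented in the docstring
# (commented out since it needs data/SPECTF.dat on disk):
# stats = evaluate_performance()
# print("Decision Tree:       {:.3f} +/- {:.3f}".format(stats[0, 0], stats[0, 1]))
# print("Random Forest:       {:.3f} +/- {:.3f}".format(stats[1, 0], stats[1, 1]))
# print("Logistic Regression: {:.3f} +/- {:.3f}".format(stats[2, 0], stats[2, 1]))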
Example #35
fig = plt.figure(figsize=(8, 6))
plt.scatter(X[:,0], X[:,1], c=y_true)
plt.title("Dataset")
plt.xlabel("First Feature")
plt.ylabel("Second Feature")
plt.show()

y_true = y_true[:, np.newaxis]
X_train, X_test, y_train, y_test = train_test_split(X, y_true)

print(f'Shape X_train: {X_train.shape}')
print(f'Shape y_train: {y_train.shape}')
print(f'Shape X_test: {X_test.shape}')
print(f'Shape y_test: {y_test.shape}')

lr = LogisticRegression()
theta, bias, costs = lr.fit(X_train, y_train, n_iter=500, learning_rate=0.008)

fig = plt.figure(figsize=(8,6))
plt.plot(np.arange(500), costs)
plt.title("Development of cost over training")
plt.xlabel("Number of iterations")
plt.ylabel("Cost")
plt.show()

y_p_train = lr.predict(X_train)
y_p_test = lr.predict(X_test)

print(f"train accuracy: {100 - np.mean(np.abs(y_p_train - y_train)) * 100}%")
print(f"test accuracy: {100 - np.mean(np.abs(y_p_test - y_test))}%")
Example #36
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
import matplotlib.pyplot as plt

from logistic_regression import LogisticRegression

bc = datasets.load_breast_cancer()
print(type(bc))
X, y = bc.data, bc.target

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    random_state=42)


def accuracy(y_true, y_pred):
    return np.sum(y_true == y_pred) / len(y_true)


regressor = LogisticRegression(lr=0.001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)

print(f"LR accuracy: {accuracy(y_test, predictions)}")