Example #1
0
    def test_gradient_descent(self):
        scale = Scale.scale_from_data(self.test_dataset)
        # self.logreg = LogisticRegressor(self.train_dataset, self.train_labels, scale=scale)

        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

        gr = GradientDescentRunner(self.logreg.get_gradient(),
                                   len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(),
                                   alpha=1e-8,
                                   max_iter=300)
        weights = gr.run()
        self.logreg.weights = weights

        mismatches = 0
        predictions = [
            self.logreg.predict(test_data) for test_data in self.test_dataset
        ]
        print "All predictions: ", predictions

        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = self.logreg.predict(test_data)
            if prediction != label:
                mismatches += 1
                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (
            100 - float(mismatches) / float(total) * 100), "%"
    def test_gradient_descent(self):
        scale = Scale.scale_from_data(self.test_dataset)
        # self.logreg = LogisticRegressor(self.train_dataset, self.train_labels, scale=scale)

        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

        gr = GradientDescentRunner(self.logreg.get_gradient(), len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(), alpha=1e-8, max_iter=300)
        weights = gr.run()
        self.logreg.weights = weights

        mismatches = 0
        predictions = [self.logreg.predict(test_data) for test_data in self.test_dataset]
        print "All predictions: ", predictions

        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = self.logreg.predict(test_data)
            if prediction != label:
                mismatches +=1
                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (100 - float(mismatches) / float(total) * 100), "%"
def train_model():
    """Train a ``LogisticRegressor`` on the pickled training frame and save it.

    The column labelled ``0`` of the loaded frame supplies the text features
    (lower-cased here); the columns labelled ``1`` through ``6`` are passed,
    in order, as the list of targets. Whatever
    ``LogisticRegressor.train_model`` returns is pickled to
    ``../data/logistic_model.pkl``.
    """
    # NOTE(review): read_pickle runs pickle under the hood; only load trusted files.
    training_data = pd.read_pickle("..//data//lemmatized_train_dataframe.pkl")

    train_features = [feature.lower() for feature in training_data[0]]
    targets = [training_data[column] for column in range(1, 7)]

    logis = LogisticRegressor()
    logis = logis.train_model(train_features, targets)

    # The context manager closes the handle even when pickling raises.
    with open("../data/logistic_model.pkl", "wb") as model_file:
        pickle.dump(logis, model_file)
def predict():
    """Score every test comment and write one CSV line per comment.

    Each comment is lower-cased, stripped of every character that is neither
    a word character nor whitespace, passed through ``remove_stop_words`` and
    ``lemmatize``, and handed to ``LogisticRegressor.predict``. The id and
    the scores (rounded to two decimals) are joined with commas, printed and
    appended to ``final.csv``.
    """
    ids, comments = loadQuestionsFromTestDF()
    logis = LogisticRegressor()
    # Compiled once instead of looking the pattern up on every iteration.
    non_word = re.compile(r'[^\w\s]')

    # The context manager guarantees the submission file is flushed and closed.
    with open("final.csv", "w+") as final_submission:
        for i, text in enumerate(comments):
            row_id = ids[i]
            print(row_id)
            text = non_word.sub('', str(text).lower())
            text = remove_stop_words([text])
            text = lemmatize(text[0])
            results = logis.predict([text])[0]

            # join() avoids the trailing comma the original had to trim off.
            fields = [str(row_id)]
            fields.extend(str(round(res, 2)) for res in results)
            line = ",".join(fields)
            print(line)
            final_submission.write(line + "\n")
class TestLogisticRegression(unittest.TestCase):
    samples, labels = dataparser.parse_dataset("/Users/droy/Downloads/dataset.csv")
    split_index = int(len(samples) * 0.75)

    train_dataset = samples[:split_index]
    train_labels = labels[:split_index]

    test_dataset = samples[split_index:]
    test_labels = labels[split_index:]

    test_data = samples[0]
    def setUp(self):
        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

    def test_gradient(self):
        """Just testing that it doesn't throw any errors"""
        g = self.logreg.get_gradient()
        print g(ones(len(self.samples[0]) + 1))

    def test_gradient_descent(self):
        scale = Scale.scale_from_data(self.test_dataset)
        # self.logreg = LogisticRegressor(self.train_dataset, self.train_labels, scale=scale)

        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

        gr = GradientDescentRunner(self.logreg.get_gradient(), len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(), alpha=1e-8, max_iter=300)
        weights = gr.run()
        self.logreg.weights = weights

        mismatches = 0
        predictions = [self.logreg.predict(test_data) for test_data in self.test_dataset]
        print "All predictions: ", predictions

        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = self.logreg.predict(test_data)
            if prediction != label:
                mismatches +=1
                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (100 - float(mismatches) / float(total) * 100), "%"

    def test_sckikit_performance(self):
        from sklearn import linear_model
        clf = linear_model.LogisticRegression()
        clf.fit(self.train_dataset, self.train_labels)
        predictions = clf.predict(self.test_dataset)
        print "Scikit predictions: ", predictions

        mismatches = 0
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0
        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = clf.predict(test_data)

            if prediction != label:
                mismatches += 1

            if prediction == 1 and label == 0:
                false_positives += 1

            if prediction == 0 and label == 1:
                false_negatives += 1

            if prediction == 1 and label == 1:
                true_positives += 1

            if prediction == 0 and label == 0:
                true_negatives += 1

                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "True positives: ", true_positives
        print "True negatives: ", true_negatives
        print "False negatives: ", false_negatives
        print "False positives: ", false_positives

        print "F1 metric: ", 2 * float(true_positives) \
                             / float(2 * true_positives + false_positives + false_negatives)

        print "MCC: ", float(true_positives * true_negatives - false_positives * false_negatives) \
                    / math.sqrt((true_positives + false_positives) * (true_positives + false_negatives) \
                                * (true_negatives + false_positives) * (true_negatives + false_negatives))

        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (100 - float(mismatches) / float(total) * 100), "%"


    def test_lets_just_look_at_the_outputs(self):
        gr = GradientDescentRunner(self.logreg.get_gradient(), len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(), alpha=1e-8, max_iter=300)

        _, weights = gr.run_once()
        self.logreg.weights = weights

        predictions = [self.logreg.get_probability(d) for d in self.train_dataset]
        import pprint
        pprint.pprint(zip(predictions, self.train_labels))
 def setUp(self):
     """Attach a regressor built from the training split to the test case."""
     features, targets = self.train_dataset, self.train_labels
     self.logreg = LogisticRegressor(features, targets)
Example #7
0
class TestLogisticRegression(unittest.TestCase):
    samples, labels = dataparser.parse_dataset(
        "/Users/droy/Downloads/dataset.csv")
    split_index = int(len(samples) * 0.75)

    train_dataset = samples[:split_index]
    train_labels = labels[:split_index]

    test_dataset = samples[split_index:]
    test_labels = labels[split_index:]

    test_data = samples[0]

    def setUp(self):
        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

    def test_gradient(self):
        """Just testing that it doesn't throw any errors"""
        g = self.logreg.get_gradient()
        print g(ones(len(self.samples[0]) + 1))

    def test_gradient_descent(self):
        scale = Scale.scale_from_data(self.test_dataset)
        # self.logreg = LogisticRegressor(self.train_dataset, self.train_labels, scale=scale)

        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

        gr = GradientDescentRunner(self.logreg.get_gradient(),
                                   len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(),
                                   alpha=1e-8,
                                   max_iter=300)
        weights = gr.run()
        self.logreg.weights = weights

        mismatches = 0
        predictions = [
            self.logreg.predict(test_data) for test_data in self.test_dataset
        ]
        print "All predictions: ", predictions

        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = self.logreg.predict(test_data)
            if prediction != label:
                mismatches += 1
                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (
            100 - float(mismatches) / float(total) * 100), "%"

    def test_sckikit_performance(self):
        from sklearn import linear_model
        clf = linear_model.LogisticRegression()
        clf.fit(self.train_dataset, self.train_labels)
        predictions = clf.predict(self.test_dataset)
        print "Scikit predictions: ", predictions

        mismatches = 0
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0
        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = clf.predict(test_data)

            if prediction != label:
                mismatches += 1

            if prediction == 1 and label == 0:
                false_positives += 1

            if prediction == 0 and label == 1:
                false_negatives += 1

            if prediction == 1 and label == 1:
                true_positives += 1

            if prediction == 0 and label == 0:
                true_negatives += 1

                # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "True positives: ", true_positives
        print "True negatives: ", true_negatives
        print "False negatives: ", false_negatives
        print "False positives: ", false_positives

        print "F1 metric: ", 2 * float(true_positives) \
                             / float(2 * true_positives + false_positives + false_negatives)

        print "MCC: ", float(true_positives * true_negatives - false_positives * false_negatives) \
                    / math.sqrt((true_positives + false_positives) * (true_positives + false_negatives) \
                                * (true_negatives + false_positives) * (true_negatives + false_negatives))

        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (
            100 - float(mismatches) / float(total) * 100), "%"

    def test_lets_just_look_at_the_outputs(self):
        gr = GradientDescentRunner(self.logreg.get_gradient(),
                                   len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(),
                                   alpha=1e-8,
                                   max_iter=300)

        _, weights = gr.run_once()
        self.logreg.weights = weights

        predictions = [
            self.logreg.get_probability(d) for d in self.train_dataset
        ]
        import pprint
        pprint.pprint(zip(predictions, self.train_labels))
Example #8
0
 def setUp(self):
     """Prepare ``self.logreg`` from the training samples and labels."""
     regressor = LogisticRegressor(self.train_dataset, self.train_labels)
     self.logreg = regressor
Example #9
0
import numpy as np
import random
from logistic_regression import LogisticRegressor

# Random feature matrix: 380 rows by 30 columns, values drawn by np.random.rand.
X = np.random.rand(380, 30)

# 300 zeros and 80 ones, shuffled in place before conversion to an array.
labels = [0] * 300
labels.extend([1] * 80)
random.shuffle(labels)
y = np.array(labels)

print("X:\n", X)
print("y:\n", y)

# Build the regressor on the synthetic data and train it with verbose output.
lr = LogisticRegressor(X, y)
lr.train_model(verbose=True)
# Importing libraries and modules
import pandas as pd
from utils import *
from logistic_regression import LogisticRegressor

# Load the headerless feature and ground-truth CSV files as NumPy arrays.
features_frame = pd.read_csv("breast_data.csv", header=None)
X = features_frame.to_numpy()
truth_frame = pd.read_csv("breast_truth.csv", header=None)
y = truth_frame.to_numpy()

# Split into train and test portions (test_size=0.2).
split = train_test_split(X, y, test_size=0.2)
X_train, X_test, y_train, y_test = split

# Create the model and fit it on the training portion.
model = LogisticRegressor(learning_rate=0.0001, n_iterations=100)
model.fit(X_train, y_train)

# Predict the held-out portion to evaluate the model.
predict_test = model.predict(X_test)
test_accuracy = accuracy(y_test, predict_test)

print("Accuracy of model on test data: %2.2f" % test_accuracy)