# Trains the logistic regression model on the lemmatized training dataframe and
# pickles the fitted model. LogisticRegressor is assumed to be importable from the
# project's logistic_regression module, as in the other scripts below.
import pickle

import pandas as pd

from logistic_regression import LogisticRegressor


def train_model():
    training_data = pd.read_pickle("..//data//lemmatized_train_dataframe.pkl")

    # Column 0 holds the comment text; lower-case each entry before training.
    train_features = []
    for feature in training_data[0]:
        train_features.append(feature.lower())

    # Columns 1-6 hold the binary target labels.
    targets = [training_data[i] for i in range(1, 7)]

    logis = LogisticRegressor()
    logis = logis.train_model(train_features, targets)
    with open("../data/logistic_model.pkl", "wb") as model_file:
        pickle.dump(logis, model_file)
# Runs the model over the test comments and writes one CSV row per comment: the id
# followed by the rounded class probabilities. loadQuestionsFromTestDF, remove_stop_words
# and lemmatize are helpers assumed to be defined elsewhere in the project.
import re


def predict():
    ids, comments = loadQuestionsFromTestDF()
    logis = LogisticRegressor()
    with open("final.csv", "w+") as final_submission:
        for i, text in enumerate(comments):
            print(ids[i])
            # Normalise the comment: lower-case, strip punctuation, remove stop words, lemmatize.
            text = str(text).lower()
            text = re.sub(r'[^\w\s]', '', text)
            text = remove_stop_words([text])
            text = lemmatize(text[0])

            results = logis.predict([text])[0]
            line = str(ids[i]) + ","
            for res in results:
                line += str(round(res, 2)) + ","
            line = line[:-1]  # drop the trailing comma
            print(line)
            final_submission.write(line + "\n")
# Unit tests for the hand-rolled LogisticRegressor: one test trains it with
# GradientDescentRunner on a 75/25 split and reports accuracy on the held-out data,
# another runs scikit-learn's LogisticRegression on the same split for comparison
# (accuracy, F1, MCC). Python 2 print statements are kept from the original.
# LogisticRegressor, GradientDescentRunner, Scale and dataparser are assumed to be
# importable from the project's own modules; only standard-library and numpy imports
# are added here.
import math
import unittest

from numpy import ones


class TestLogisticRegression(unittest.TestCase):
    samples, labels = dataparser.parse_dataset("/Users/droy/Downloads/dataset.csv")
    # 75/25 train/test split.
    split_index = int(len(samples) * 0.75)
    train_dataset = samples[:split_index]
    train_labels = labels[:split_index]
    test_dataset = samples[split_index:]
    test_labels = labels[split_index:]
    test_data = samples[0]

    def setUp(self):
        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)

    def test_gradient(self):
        """Just testing that it doesn't throw any errors"""
        g = self.logreg.get_gradient()
        print g(ones(len(self.samples[0]) + 1))

    def test_gradient_descent(self):
        scale = Scale.scale_from_data(self.test_dataset)
        # self.logreg = LogisticRegressor(self.train_dataset, self.train_labels, scale=scale)
        self.logreg = LogisticRegressor(self.train_dataset, self.train_labels)
        gr = GradientDescentRunner(self.logreg.get_gradient(),
                                   len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(),
                                   alpha=1e-8, max_iter=300)
        weights = gr.run()
        self.logreg.weights = weights

        mismatches = 0
        predictions = [self.logreg.predict(test_data) for test_data in self.test_dataset]
        print "All predictions: ", predictions
        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = self.logreg.predict(test_data)
            if prediction != label:
                mismatches += 1
                # print "Mismatch! Predicted ", prediction, ", True ", label
        total = len(self.test_dataset)
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (100 - float(mismatches) / float(total) * 100), "%"

    def test_scikit_performance(self):
        from sklearn import linear_model
        clf = linear_model.LogisticRegression()
        clf.fit(self.train_dataset, self.train_labels)
        predictions = clf.predict(self.test_dataset)
        print "Scikit predictions: ", predictions

        mismatches = 0
        true_positives = 0
        true_negatives = 0
        false_positives = 0
        false_negatives = 0
        for test_data, label in zip(self.test_dataset, self.test_labels):
            prediction = clf.predict(test_data)
            if prediction != label:
                mismatches += 1
            if prediction == 1 and label == 0:
                false_positives += 1
            if prediction == 0 and label == 1:
                false_negatives += 1
            if prediction == 1 and label == 1:
                true_positives += 1
            if prediction == 0 and label == 0:
                true_negatives += 1
            # print "Mismatch! Predicted ", prediction, ", True ", label

        total = len(self.test_dataset)
        print "True positives: ", true_positives
        print "True negatives: ", true_negatives
        print "False negatives: ", false_negatives
        print "False positives: ", false_positives
        print "F1 metric: ", 2 * float(true_positives) \
            / float(2 * true_positives + false_positives + false_negatives)
        print "MCC: ", float(true_positives * true_negatives - false_positives * false_negatives) \
            / math.sqrt((true_positives + false_positives) * (true_positives + false_negatives)
                        * (true_negatives + false_positives) * (true_negatives + false_negatives))
        print "total data: ", total
        print "total mismatch: ", mismatches
        print "percentage success: ", (100 - float(mismatches) / float(total) * 100), "%"

    def test_lets_just_look_at_the_outputs(self):
        gr = GradientDescentRunner(self.logreg.get_gradient(),
                                   len(self.samples[0]) + 1,
                                   self.logreg.get_error_function(),
                                   alpha=1e-8, max_iter=300)
        _, weights = gr.run_once()
        self.logreg.weights = weights
        predictions = [self.logreg.get_probability(d) for d in self.train_dataset]
        import pprint
        pprint.pprint(zip(predictions, self.train_labels))
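# Not part of the original tests: a minimal entry point for running the suite above,
# assuming the class lives in its own module; `python -m unittest <module>` works equally well.
if __name__ == "__main__":
    unittest.main()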
# Quick smoke test: train LogisticRegressor on random features with an imbalanced 0/1 label vector.
import numpy as np
import random

from logistic_regression import LogisticRegressor

X = np.random.rand(380, 30)
y = [0] * 300 + [1] * 80
random.shuffle(y)
y = np.array(y)
# y = np.reshape(y,(380,1))
# print(y)
print("X:\n", X)
print("y:\n", y)

lr = LogisticRegressor(X, y)
lr.train_model(verbose=True)
# Importing libraries and modules
import pandas as pd

from utils import *
from logistic_regression import LogisticRegressor

# Reading the breast-cancer features and labels into NumPy arrays.
X = pd.read_csv("breast_data.csv", header=None).to_numpy()
y = pd.read_csv("breast_truth.csv", header=None).to_numpy()

# Splitting data into train/test sets (80% train, 20% test).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Creating and fitting the model.
model = LogisticRegressor(learning_rate=0.0001, n_iterations=100)
model.fit(X_train, y_train)

# Predicting on the test data to evaluate the model.
predict_test = model.predict(X_test)
print("Accuracy of model on test data: %2.2f" % accuracy(y_test, predict_test))
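# The train_test_split() and accuracy() helpers above come from the project's utils
# module (pulled in by the star import). If that module isn't at hand, minimal
# stand-ins along these lines match how they are called here; these are hypothetical
# sketches, not the project's actual implementations.
import numpy as np

def accuracy(y_true, y_pred):
    # Fraction of predictions that match the true labels.
    return float(np.mean(np.asarray(y_true).ravel() == np.asarray(y_pred).ravel()))

def train_test_split(X, y, test_size=0.2, seed=0):
    # Shuffle row indices once, then slice off the last `test_size` fraction as the test set.
    rng = np.random.default_rng(seed)
    idx = rng.permutation(len(X))
    cut = int(len(X) * (1 - test_size))
    return X[idx[:cut]], X[idx[cut:]], y[idx[:cut]], y[idx[cut:]]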