## Script used to analyze the sentiment of Amazon baby product reviews with logistic regression, with and without L2 regularization
# import sklearn as sk
import pandas
# import sframe
import json
import numpy as np
from LogisticClassifierHelpers import (get_numpy_data, logistic_regression,
                                       logistic_regression_with_L2, make_coefficient_plot)
products = pandas.read_csv(r'C:\Users\Isaac\Course 3/amazon_baby_subset.csv')
print '# of positive reviews =', len(products[products['sentiment']==1])
print '# of negative reviews =', len(products[products['sentiment']==-1])
# Reading the json file of important words
with open(r'C:\Users\Isaac\Course 3/important_words.json', 'r') as f:
    important_words = json.load(f)
important_words = [str(s) for s in important_words]
products = products.fillna({'review':''})
# Cleaning the review column
def remove_punctuation(text):
    import string
    return text.translate(None, string.punctuation)
products['review_clean'] = products['review'].apply(remove_punctuation)
# Iterating over the words in important_words
for word in important_words:
    products[word] = products['review_clean'].apply(lambda s: s.split().count(word))
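# e.g. a cleaned review "perfect gift absolutely perfect" would get
# products['perfect'] == 2 and products['gift'] == 1 for that row
# (assuming both words appear in important_words).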
# number of reviews with the word perfect
sum(products['perfect']>0) # 2955
feature_matrix, sentiment = get_numpy_data(products, important_words, 'sentiment')
feature_matrix.shape  # (num_reviews, 194): 193 word-count features plus an intercept column
# Computing the optimal coefficients
coefficients = logistic_regression(feature_matrix, sentiment, initial_coefficients=np.zeros(194),
                                   step_size=1e-7, max_iter=301)
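# For reference, a minimal sketch of the gradient ascent that logistic_regression
# is assumed to perform (the real implementation lives in LogisticClassifierHelpers);
# it is defined here only for illustration and is never called by this script.
def _logistic_regression_sketch(feature_matrix, sentiment, initial_coefficients, step_size, max_iter):
    coefficients = np.array(initial_coefficients)
    indicator = (sentiment == +1)  # 1 where the review is positive, 0 otherwise
    for itr in xrange(max_iter):
        # P(y = +1 | x, w) = 1 / (1 + exp(-w^T x))
        predictions = 1.0 / (1.0 + np.exp(-np.dot(feature_matrix, coefficients)))
        errors = indicator - predictions
        # Gradient ascent step on the log-likelihood: w <- w + step_size * X^T (1[y=+1] - P)
        coefficients = coefficients + step_size * np.dot(np.transpose(feature_matrix), errors)
    return coefficients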
# Computing the scores for those coefficients
scores = np.dot(feature_matrix, coefficients)
# classifying the scores
negatives = (scores <= 0).astype(int)*(-1) #27946
positives = (scores > 0).astype(int) #25126
predictions = negatives+positives
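# Equivalently: predictions = np.where(scores > 0, +1, -1)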
# Computing accuracy
num_correct = sum(sentiment == predictions)
accuracy = (num_correct + 0.0) / len(sentiment)
print "-----------------------------------------------------"
print '# Reviews correctly classified =', num_correct
print '# Reviews incorrectly classified =', len(products) - num_correct
print '# Reviews total =', len(products)
print "-----------------------------------------------------"
print 'Accuracy = %.2f' % accuracy
#0.75
# Finding the most positive words
coefficients = list(coefficients[1:]) # exclude intercept
word_coefficient_tuples = [(word, coefficient) for word, coefficient in zip(important_words, coefficients)]
word_coefficient_tuples = sorted(word_coefficient_tuples, key=lambda x:x[1], reverse=True)
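# e.g. word_coefficient_tuples[:10] lists the ten most positive words and
# word_coefficient_tuples[-10:] the ten most negative ones.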
# Loading the precomputed train/validation split indices
with open(r'C:\Users\Isaac\Course 3/module-4-assignment-train-idx.json', 'r') as f:
    train_idx = [int(i) for i in json.load(f)]
with open(r'C:\Users\Isaac\Course 3/module-4-assignment-validation-idx.json', 'r') as f:
    validation_idx = [int(i) for i in json.load(f)]
train_data = products.loc[train_idx]
validation_data = products.loc[validation_idx]
feature_matrix_train, sentiment_train = get_numpy_data(train_data, important_words, 'sentiment')
feature_matrix_valid, sentiment_valid = get_numpy_data(validation_data, important_words, 'sentiment')
# Analysis of the benefits of using L2 regularization
step_size = 5e-6
max_iter = 501
initial_coefficients = np.zeros(194)
coefficients_0_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 0, step_size, max_iter)
coefficients_4_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 4, step_size, max_iter)
coefficients_10_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 10, step_size, max_iter)
coefficients_1e2_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 1e2, step_size, max_iter)
coefficients_1e3_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 1e3, step_size, max_iter)
coefficients_1e5_penalty = logistic_regression_with_L2(feature_matrix_train, sentiment_train, initial_coefficients, 1e5, step_size, max_iter)
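# logistic_regression_with_L2 is assumed to follow the same gradient ascent as the
# sketch above, but with each non-intercept derivative shrunk by the penalty:
#   w_j <- w_j + step_size * (X_j^T (1[y=+1] - P(y=+1|x,w)) - 2 * l2_penalty * w_j)
# with the intercept w_0 conventionally left unpenalized.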
top_5=np.sort(coefficients_0_penalty)[-5:]
bottom_5=np.sort(coefficients_0_penalty)[:5]
table = pandas.DataFrame({'word': ['(intercept)'] + important_words})
def add_coefficients_to_table(coefficients, column_name):
    table[column_name] = coefficients
    return table
add_coefficients_to_table(coefficients_0_penalty, 'coefficients [L2=0]')
add_coefficients_to_table(coefficients_4_penalty, 'coefficients [L2=4]')
add_coefficients_to_table(coefficients_10_penalty, 'coefficients [L2=10]')
add_coefficients_to_table(coefficients_1e2_penalty, 'coefficients [L2=1e2]')
add_coefficients_to_table(coefficients_1e3_penalty, 'coefficients [L2=1e3]')
add_coefficients_to_table(coefficients_1e5_penalty, 'coefficients [L2=1e5]')
table[table['coefficients [L2=0]'].isin(top_5)]  # rows whose L2=0 coefficient is among the five largest
sorted_table=table.sort_values('coefficients [L2=0]')
positive_words=sorted_table[-5:]['word'].tolist()
negative_words=sorted_table[:5]['word'].tolist()
ind_neg=[106,97,114,113,169]
ind_pos=[3,34,8,23,4]
make_coefficient_plot(table, positive_words, negative_words, l2_penalty_list=[0, 4, 10, 1e2, 1e3, 1e5])
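# get_classification_accuracy is used below but is not imported from
# LogisticClassifierHelpers above; a minimal sketch, assuming it scores the
# feature matrix, thresholds at zero, and returns the fraction of correct predictions.
def get_classification_accuracy(feature_matrix, sentiment, coefficients):
    scores = np.dot(feature_matrix, coefficients)
    predictions = np.where(scores > 0, +1, -1)
    return float(np.sum(predictions == sentiment)) / len(sentiment)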
train_accuracy = {}
train_accuracy[0] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_0_penalty)
train_accuracy[4] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_4_penalty)
train_accuracy[10] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_10_penalty)
train_accuracy[1e2] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e2_penalty)
train_accuracy[1e3] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e3_penalty)
train_accuracy[1e5] = get_classification_accuracy(feature_matrix_train, sentiment_train, coefficients_1e5_penalty)
validation_accuracy = {}
validation_accuracy[0] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_0_penalty)
validation_accuracy[4] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_4_penalty)
validation_accuracy[10] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_10_penalty)
validation_accuracy[1e2] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e2_penalty)
validation_accuracy[1e3] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e3_penalty)
validation_accuracy[1e5] = get_classification_accuracy(feature_matrix_valid, sentiment_valid, coefficients_1e5_penalty)
# Build a simple report
for key in sorted(validation_accuracy.keys()):
print "L2 penalty = %g" % key
print "train accuracy = %s, validation_accuracy = %s" % (train_accuracy[key], validation_accuracy[key])
print "--------------------------------------------------------------------------------"