-
Notifications
You must be signed in to change notification settings - Fork 0
/
OpinionMiner.py
116 lines (96 loc) · 3.22 KB
/
OpinionMiner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
import pyprind
import pandas as pd
import numpy as np
import os
import re
from sklearn.linear_model import SGDClassifier
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import pickle
from vectorizer import vect
import sqlite3
# NLTK English stop-word list, filtered out of tokens in tokenizer() below.
stop = stopwords.words('english')
def process_file():
    """Read the aclImdb test/train pos/neg review files into one shuffled
    DataFrame and cache it as ./movie_data.csv.

    Returns:
        DataFrame with columns ['review', 'sentiment'], where sentiment
        is 1 for 'pos' reviews and 0 for 'neg' reviews.
    """
    pbar = pyprind.ProgBar(50000)
    labels = {'pos': 1, 'neg': 0}
    rows = []
    for split in ('test', 'train'):
        for label in ('pos', 'neg'):
            path = './aclImdb/%s/%s' % (split, label)
            for fname in os.listdir(path):
                # aclImdb review files are UTF-8; be explicit so the read
                # does not depend on the platform's default encoding.
                with open(os.path.join(path, fname), 'r', encoding='utf-8') as infile:
                    rows.append([infile.read(), labels[label]])
                pbar.update()
    # Build the DataFrame once from a list: per-row DataFrame.append was
    # O(n^2) and has been removed from pandas 2.0 entirely.
    df = pd.DataFrame(rows, columns=['review', 'sentiment'])
    # Shuffle with a fixed seed so the cached csv row order is reproducible.
    np.random.seed(0)
    df = df.reindex(np.random.permutation(df.index))
    print('writing file to csv')
    df.to_csv('./movie_data.csv', index=False)
    return df
def load_df():
    """Load the cached review DataFrame from ./movie_data.csv, rebuilding
    it from the raw aclImdb files when the cache is missing.

    Returns:
        DataFrame with columns ['review', 'sentiment'].
    """
    try:
        df = pd.read_csv('./movie_data.csv')
    except FileNotFoundError:
        # Narrower than the original OSError: only a *missing* cache file
        # should trigger the expensive rebuild, not e.g. a permission error.
        print('File does not exist.\n Loading files from tar.')
        df = process_file()
    print(df.head(3))
    return df
def preprocessor(text):
    """Normalize a raw review: strip HTML tags, lowercase, collapse runs of
    non-word characters to single spaces, and append any emoticons (with
    their '-' noses removed) so their sentiment signal survives cleaning.
    """
    # Remove HTML markup such as <br /> tags. Raw strings throughout:
    # the original non-raw patterns ('\W', '\)') are invalid escape
    # sequences and warn on modern Python.
    text = re.sub(r'<[^>]*>', '', text)
    # Capture emoticons like :-) ;( =D before punctuation is stripped.
    emoticons = re.findall(r'(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)
    # Lowercase, collapse non-word chars to spaces, re-append emoticons.
    text = re.sub(r'[\W]+', ' ', text.lower()) + ' '.join(emoticons).replace('-', '')
    return text
def tokenizer_porter(text):
    """Split *text* on whitespace and reduce each word to its Porter stem."""
    stemmer = PorterStemmer()
    return list(map(stemmer.stem, text.split()))
def tokenizer(text):
    """Clean *text* and return its stemmed tokens minus English stop words."""
    cleaned = preprocessor(text)
    return [token for token in tokenizer_porter(cleaned) if token not in stop]
def stream_docs(path):
    """Lazily yield (review_text, label) pairs from the csv at *path*.

    Relies on the fixed movie_data.csv layout: each data line ends in
    ',<label>\\n' where <label> is a single digit.
    """
    with open(path, 'r') as handle:
        next(handle)  # skip the header row
        for row in handle:
            # Text is everything before the trailing ',<label>\n';
            # the label is the second-to-last character.
            yield row[:-3], int(row[-2])
def get_minibatch(doc_stream, size):
    """Pull up to *size* (text, label) pairs from *doc_stream*.

    Returns:
        (docs, labels) lists. When the stream ends mid-batch the partial
        batch is returned instead of being discarded (the original dropped
        up to size-1 documents here); (None, None) signals an empty stream.
    """
    docs, y = [], []
    try:
        for _ in range(size):
            text, label = next(doc_stream)
            docs.append(text)
            y.append(label)
    except StopIteration:
        # Only an *empty* read means exhaustion for the caller's
        # `if not X_train: break` check; keep any partial batch.
        if not docs:
            return None, None
    return docs, y
def mine():
    """Train a logistic-regression SGD classifier on streamed movie reviews,
    report hold-out accuracy, and pickle the model and stop-word list
    under ./pkl_objects for later reuse.
    """
    print("Starting")
    # loss='log' makes this logistic regression (probabilistic output).
    # NOTE(review): n_iter was renamed max_iter in newer scikit-learn;
    # confirm the pinned sklearn version before upgrading.
    clf = SGDClassifier(loss='log', random_state=1, n_iter=1)
    print('Create/Load Classifier')
    doc_stream = stream_docs(path='./movie_data.csv')
    print('Fitting data')
    classes = np.array([0, 1])
    # 45 minibatches x 1000 docs = 45k training docs; the next 5000
    # drawn below serve as the hold-out test set.
    for _ in range(45):
        X_train, y_train = get_minibatch(doc_stream, size=1000)
        if not X_train:
            break
        X_train = vect.transform(X_train)
        clf.partial_fit(X_train, y_train, classes=classes)
    print('Finished Fitting')
    X_test, y_test = get_minibatch(doc_stream, size=5000)
    X_test = vect.transform(X_test)
    print('Accuracy: %.3f' % clf.score(X_test, y_test))
    print('create pickle objects')
    dest = os.path.join('', 'pkl_objects')
    if not os.path.exists(dest):
        os.makedirs(dest)
    # Context managers ensure the pickle files are flushed and closed even
    # if dump() raises — the original open(...) calls leaked both handles.
    with open(os.path.join(dest, 'stopwords.pkl'), 'wb') as f:
        pickle.dump(stop, f, protocol=4)
    with open(os.path.join(dest, 'classifier.pkl'), 'wb') as f:
        pickle.dump(clf, f, protocol=4)
#mine()