-
Notifications
You must be signed in to change notification settings - Fork 0
/
DataSet.py
314 lines (228 loc) · 9.23 KB
/
DataSet.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
# Defines a dataset.
# This class defines helper functions for datasets, such as:
# 1) number of lines
# 2) line by line data
# 3) getting class information
# 4) in the future: n-folds
# An object of this type should be fed directly into feature-grabbing functions
# TODO: should we feed it directly into a learning / inference function?
# that means we'd hold features in this class.
import csv
import numpy as np
import os
import LanguageUtils
import nltk
import cPickle as pickle
from nltk.collocations import *
# OTHER_DISTS maps (essay_set, domain_id) to the names of the additional grade
# columns available for that essay set, i.e. every grade column *except* the
# resolved score of the domain being predicted.
_DOMAIN1_RATER_COLS = ['rater1_domain1', 'rater2_domain1']
_DOMAIN2_RATER_COLS = ['rater1_domain2', 'rater2_domain2']

# Single-domain essay sets only carry the two domain-1 rater columns.
# Essay set 8 also has a 'rater3_domain1' column, but it is incomplete and is
# therefore deliberately left out.
OTHER_DISTS = dict(
    ((essay_set, 1), list(_DOMAIN1_RATER_COLS))
    for essay_set in (1, 3, 4, 5, 6, 7, 8)
)

# Essay set 2 is graded in two domains: each domain lists both domains' rater
# columns plus the resolved score of the *other* domain.
OTHER_DISTS[(2, 1)] = _DOMAIN1_RATER_COLS + ['domain2_score'] + _DOMAIN2_RATER_COLS
OTHER_DISTS[(2, 2)] = _DOMAIN1_RATER_COLS + ['domain1_score'] + _DOMAIN2_RATER_COLS
class DataSet:
    """One essay dataset loaded from a tab-separated file.

    After importData() the object holds, for the selected essay set, the raw
    essay texts, essay ids, and (when the matching columns exist) the resolved
    grades and prediction ids.  It also offers lazily computed, pickle-cached
    word and POS n-gram features.
    """

    def __init__(self, trainSetFlag):
        """Create an empty dataset; call importData() to fill it.

        trainSetFlag -- truthy for a training set; importData() then also
            collects the extra grade columns named in OTHER_DISTS.
        """
        self.colNames = list()
        self.ds_size = -1
        self.textOnly = list()  # just a list of the text
        self.trainSetFlag = trainSetFlag
        self.domain_id = 1
        self.grades = list()
        self.prediction_ids = list()
        self.essay_ids = list()
        self.essay_set = None
        self.file_name = ''
        self.pos_tags = list()
        self.bigram_pos_tags = list()
        self.trigram_pos_tags = list()
        self.gensim_corpus = ()
        # Initialised here so getGensimPOSCorpus() works before the setter
        # has been called (it used to raise AttributeError).
        self.gensim_pos_corpus = ()
        self.other_dists_grades = list()

    def getFilename(self):
        """Return the base name of the file passed to importData()."""
        return self.file_name

    # ------------------------------------------------------------------
    # Loading
    # ------------------------------------------------------------------
    def importData(self, filename, essay_set=-1, domain_id=1):
        """Load a tab-separated data file into this object.

        filename  -- path of the file; its first row is the column header.
        essay_set -- essay set to keep; if essay_set=-1, then we use all essays.
        domain_id -- 1 or 2; selects the 'domain%d_score' and
            'domain%d_predictionid' columns.

        Raises ValueError (a subclass of Exception) for any other domain_id.
        """
        if domain_id != 1 and domain_id != 2:
            raise ValueError("Unknown Domain.")
        # 'rb' is the mode the Python 2 csv module expects.  The with-block
        # closes the handle, which was previously leaked.
        with open(filename, 'rb') as handle:
            self.file_name = os.path.basename(filename)
            self.domain_id = domain_id
            self.essay_set = essay_set
            reader = csv.reader(handle, delimiter='\t', quotechar=None)
            self._loadRows(reader)
        return

    def _loadRows(self, rows):
        """Fill this object from an iterable of rows (lists of strings).

        The first row is the header.  self.essay_set and self.domain_id must
        already be set, since they choose which rows and columns are kept.
        """
        rowmap = None     # key = col index, value = col_header_name
        datamap = dict()  # key = col_header_name, value = list of data
        for row in rows:
            if rowmap is None:
                rowmap = dict(enumerate(row))
                # Remember the header so dumpColNames() has something to show.
                self.colNames = list(row)
                for col in row:
                    datamap[col] = list()
                continue
            for i, col in enumerate(row):
                name = rowmap[i]
                if name == 'essay':
                    datamap[name].append(col.strip('"'))
                elif col:
                    datamap[name].append(int(col))
                else:
                    # Empty cell: keep a placeholder so that every column
                    # stays aligned with the row index.
                    datamap[name].append('**GARBAGE**')

        # get indices that match essay_set
        wanted = self.getEssaySet()
        inds = [i for i, row_set in enumerate(datamap['essay_set'])
                if wanted == -1 or row_set == wanted]
        self.ds_size = len(inds)

        # get master grades (left untouched when the file has no such column)
        domain_col_name = 'domain%d_score' % self.getDomain()
        if domain_col_name in datamap:
            self.grades = [datamap[domain_col_name][i] for i in inds]

        # get textOnly
        self.textOnly = [datamap['essay'][i] for i in inds]

        # get essay_ids
        self.essay_ids = [datamap['essay_id'][i] for i in inds]

        # get prediction_ids (left untouched when the file has no such column)
        pred_col_name = 'domain%d_predictionid' % self.getDomain()
        if pred_col_name in datamap:
            self.prediction_ids = [datamap[pred_col_name][i] for i in inds]

        # get other_dists
        if self.isTrainSet():
            # .get(): combinations missing from OTHER_DISTS (for instance
            # essay_set == -1, "all essays") have no extra columns defined;
            # a plain lookup used to raise KeyError for them.
            other_dists_cols = OTHER_DISTS.get(
                (self.getEssaySet(), self.getDomain()), [])
            self.other_dists_grades = [
                [datamap[dist_col_name][i] for i in inds]
                for dist_col_name in other_dists_cols]
        else:
            self.other_dists_grades = list()

    def getOtherDistsGrades(self):
        """Return one list of grades per OTHER_DISTS column (train sets only)."""
        return self.other_dists_grades

    # ------------------------------------------------------------------
    # Pickle cache helpers
    # ------------------------------------------------------------------
    def _loadCache(self, fname):
        """Return the object pickled in fname, or None if it cannot be loaded."""
        # NOTE(review): unpickling can execute arbitrary code; the cache
        # directory must only contain files written by this program.
        try:
            with open(fname, 'rb') as f:
                return pickle.load(f)
        except Exception:
            # Best effort: a missing or unreadable cache just means the caller
            # recomputes.  Unlike the former bare "except:", this does not
            # swallow KeyboardInterrupt or SystemExit.
            return None

    def _saveCache(self, fname, obj):
        """Pickle obj into fname."""
        # Binary mode matches the 'rb' used for reading (the file used to be
        # opened in text mode), and the with-block flushes and closes it.
        with open(fname, 'wb') as f:
            pickle.dump(obj, f)

    # ------------------------------------------------------------------
    # Features
    # ------------------------------------------------------------------
    def getAllBoW(self):
        """Return unigram, bigram words as bag of words."""
        fname = 'cache/word_ngrams.%s.set%d.dom%d.pickle' % (
            self.getFilename(), self.getEssaySet(), self.getDomain())
        bow = self._loadCache(fname)
        if bow is not None:
            return bow

        bigram_measures = nltk.collocations.BigramAssocMeasures()
        bow = list()
        for line in self.getRawText():
            cur = LanguageUtils.tokenize(line)
            finder = BigramCollocationFinder.from_words(cur)
            scored = finder.score_ngrams(bigram_measures.pmi)
            # The essay's bag is its tokens followed by its bigram tuples.
            cur.extend(bigram for bigram, score in scored)
            bow.append(cur)
        self._saveCache(fname, bow)
        return bow

    def getAllPOS(self):
        """Return unigram, bigram, trigram POS tags as bag of words."""
        fname = 'cache/pos_ngrams.%s.set%d.dom%d.pickle' % (
            self.getFilename(), self.getEssaySet(), self.getDomain())
        all_tags = self._loadCache(fname)
        if all_tags is not None:
            return all_tags

        uni = self.getPOS()
        bi = self.getBigramPOS()
        tri = self.getTrigramPOS()
        all_tags = [uni[i] + bi[i] + tri[i] for i in range(self.size())]
        self._saveCache(fname, all_tags)
        return all_tags

    def getPOS(self):
        """Return, per essay, the list of POS tags of its tokens.

        The result is kept on the instance and in a pickle cache file; it is
        computed with the Hunpos tagger only when neither is available.
        """
        if len(self.pos_tags) > 0:
            return self.pos_tags

        fname = 'cache/pos.%s.set%d.pickle' % (self.file_name, self.essay_set)
        cached = self._loadCache(fname)
        if cached is not None:
            self.pos_tags = cached
            return self.pos_tags

        pos_lines = list()
        tot_ln = self.size()
        prog = 0
        hunpos = nltk.tag.HunposTagger("en_wsj.model")
        for line in self.getRawText():
            tokens = LanguageUtils.punkt_tokenize(line)
            pos_tags = hunpos.tag(tokens)
            pos_lines.append([tag for w, tag in pos_tags])
            prog += 1
            if prog % 100 == 0:
                print("POS Tagging %d of %d" % (prog, tot_ln))
        self.pos_tags = pos_lines
        self._saveCache(fname, self.pos_tags)
        return self.pos_tags

    def _posNgrams(self, finder_class, measure):
        """Return, per essay, the POS n-grams found by finder_class.

        N-grams are listed in the order finder.score_ngrams(measure) yields
        them; the scores themselves are dropped.
        """
        # getPOS() rather than self.pos_tags: the tags may not have been
        # computed yet, in which case the result used to be silently empty.
        ngram_tags = list()
        for tokens in self.getPOS():
            finder = finder_class.from_words(tokens)
            scored = finder.score_ngrams(measure)
            ngram_tags.append([ngram for ngram, score in scored])
        return ngram_tags

    def getBigramPOS(self):
        """Return, per essay, its POS-tag bigrams (computed once, then kept)."""
        if len(self.bigram_pos_tags) > 0:
            return self.bigram_pos_tags
        bigram_measures = nltk.collocations.BigramAssocMeasures()
        self.bigram_pos_tags = self._posNgrams(
            BigramCollocationFinder, bigram_measures.pmi)
        return self.bigram_pos_tags

    def getTrigramPOS(self):
        """Return, per essay, its POS-tag trigrams (computed once, then kept)."""
        if len(self.trigram_pos_tags) > 0:
            return self.trigram_pos_tags
        trigram_measures = nltk.collocations.TrigramAssocMeasures()
        self.trigram_pos_tags = self._posNgrams(
            TrigramCollocationFinder, trigram_measures.pmi)
        return self.trigram_pos_tags

    # ------------------------------------------------------------------
    # Simple accessors
    # ------------------------------------------------------------------
    def setTrainSet(self, val):
        """Mark this dataset as a training set (truthy) or not (falsy)."""
        self.trainSetFlag = val
        return

    def isTrainSet(self):
        """Return the train-set flag."""
        return self.trainSetFlag

    def size(self):
        """Return the number of selected essays (-1 before importData())."""
        return self.ds_size

    def dumpColNames(self):
        """Print the column names of the imported file, one per line."""
        print('\n'.join(self.colNames))

    def setGensimCorpus(self, mm):
        """Store the gensim corpus built for this dataset's words."""
        self.gensim_corpus = mm

    def getGensimCorpus(self):
        """Return the stored gensim corpus (an empty tuple if never set)."""
        return self.gensim_corpus

    def setGensimPOSCorpus(self, mm):
        """Store the gensim corpus built for this dataset's POS tags."""
        self.gensim_pos_corpus = mm

    def getGensimPOSCorpus(self):
        """Return the stored gensim POS corpus (an empty tuple if never set)."""
        return self.gensim_pos_corpus

    def getRawText(self):
        """Return the list of essay texts."""
        return self.textOnly

    def getEssaySet(self):
        """Return the essay set chosen in importData() (-1 means all)."""
        return self.essay_set

    def getDomain(self):
        """Return the domain id (1 or 2)."""
        return self.domain_id

    def outputKaggle(self, grades, fd):
        """Output standard Kaggle validation set format. file will be appended to.

        grades -- one integer grade per essay, in dataset order.
        fd     -- object with a write() method; one tab-separated line is
            written per essay.
        """
        # Essay set 2 is written with prediction weight 0.5; every other set
        # with weight 1.
        predweight = '0.5' if self.essay_set == 2 else '1'
        for i in range(self.size()):
            line = "%d\t%d\t%d\t%s\t%d\n" % (
                self.prediction_ids[i], self.essay_ids[i], self.essay_set,
                predweight, grades[i])
            fd.write(line)

    def getGrades(self):
        """Return a numpy array of the grades."""
        # TODO throw exception if grades is empty
        return np.asarray(self.grades)

    def setGrades(self, grades):
        """Replace the stored grades."""
        self.grades = grades