data_importer_mine2.py

import numpy as np
import re
import itertools
from collections import Counter
import csv
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer

numbers = [0, 9, 8, 7, 6, 5, 4, 3, 2, 1]  ## Digits stripped from sentences in text_cleaner_and_tokenizer

def write_predictions(output):
    """
    Writes (id, real_code, predicted_code) rows to predictions.csv.
    """
    with open("predictions.csv", 'w', newline='') as f:  ## newline='' keeps the csv module from inserting blank rows
        writer = csv.writer(f)
        ids = [x[0] for x in output]  ## Open up sentences
        ids = [a.replace('\n', '') for a in ids]  ## Strips out linebreaks
        real_codes = [b[1] for b in output]  ## Open up real_codes in similar format
        pred_codes = [c[2] for c in output]
        results = zip(ids, real_codes, pred_codes)  ## Zip it up! Ready to write
        writer.writerows(results)

def split_master_data_into_seperate_files():
    """
    Takes the original datafile and uses pandas to save each column to a separate csv
    for easier handling and cleaning in later steps.
    """
    data = pd.read_csv('./data/agendas_data.csv')
    data.content_coding.to_csv('./data/content_coding.csv', index=False)
    data.id.to_csv('./data/id.csv', index=False)
    data.dataset.to_csv('./data/dataset.csv', index=False)
    data.tekst.to_csv('./data/tekst.csv', index=False)
    data.var1.to_csv('./data/var1.csv', index=False)
    data.var3.to_csv('./data/var3.csv', index=False)
    data.var4.to_csv('./data/var4.csv', index=False)
    data.var5.to_csv('./data/var5.csv', index=False)
    data.var6.to_csv('./data/var6.csv', index=False)
    data.var7.to_csv('./data/var7.csv', index=False)

def clean_str(string):
    """
    Tokenization/string cleaning for all datasets except for SST.
    Original taken from https://github.com/yoonkim/CNN_sentence/blob/master/process_data.py
    """
    string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
    string = re.sub(r"\'s", " \'s", string)
    string = re.sub(r"\'ve", " \'ve", string)
    string = re.sub(r"n\'t", " n\'t", string)
    string = re.sub(r"\'re", " \'re", string)
    string = re.sub(r"\'d", " \'d", string)
    string = re.sub(r"\'ll", " \'ll", string)
    string = re.sub(r",", " , ", string)
    string = re.sub(r"!", " ! ", string)
    string = re.sub(r"\(", " \( ", string)
    string = re.sub(r"\)", " \) ", string)
    string = re.sub(r"\?", " \? ", string)
    string = re.sub(r"\s{2,}", " ", string)
    return string.strip().lower()
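
## Illustrative example of what clean_str returns (hypothetical input string):
##   clean_str("The MP's proposal, from 2015")  ->  "the mp 's proposal , from 2015"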

def load_data_and_labels():
    """
    Generalized load function. Reads texts, cleans them and reads codes.
    Right now used in train binary.py
    """
    texts = list(open('./data/tekst.csv').readlines())
    texts = [clean_str(sentence) for sentence in texts]
    print("removed punctuation")
    texts = [s.strip() for s in texts]
    x_text = texts
    codes = list(open('./data/var7.csv').readlines())
    codes = [s.strip() for s in codes]
    global real_codes
    real_codes = codes
    global dictionary_of_codes
    dictionary_of_codes_reverse = {}
    token_codes = []
    for code in codes:  ## Assign each distinct code a 1-based integer index the first time it appears
        token_codes.append(dictionary_of_codes_reverse.setdefault(code, len(dictionary_of_codes_reverse) + 1))
    dictionary_of_codes = dict((v, k) for k, v in dictionary_of_codes_reverse.items())  ## .items() instead of the Python 2 .iteritems()
    token_codes_vector = np.eye(5)[token_codes].tolist()  ## One-hot vectors: row i of the identity matrix encodes index i
    y = token_codes_vector
    return [x_text, y, real_codes, dictionary_of_codes]
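
## Illustrative sketch of the one-hot step above (hypothetical codes): with codes = ['3', '5', '3'],
## dictionary_of_codes_reverse becomes {'3': 1, '5': 2}, token_codes becomes [1, 2, 1], and
## np.eye(5)[token_codes] picks rows of the 5x5 identity matrix, e.g. [0., 1., 0., 0., 0.] for index 1.
## Because the indices are 1-based, np.eye(5) accommodates at most 4 distinct codes.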

def quick_load_data_and_labels():
    """
    Custom load routine :D
    Loads from already stemmed & padded sentences
    """
    texts = list(open('./data/preprocessed_text.csv').readlines())
    print("read text file")
    #texts = [clean_str(sentence) for sentence in texts]
    texts = [s.strip() for s in texts]
    print("stripping texts")
    x_text = texts
    codes = list(open('./data/var7.csv').readlines())
    print("Read codes files")
    codes = [s.strip() for s in codes]
    global real_codes
    real_codes = codes
    global dictionary_of_codes
    print("Reading files done!")
    dictionary_of_codes_reverse = {}
    token_codes = []
    for code in codes:  ## Same 1-based code indexing as in load_data_and_labels
        token_codes.append(dictionary_of_codes_reverse.setdefault(code, len(dictionary_of_codes_reverse) + 1))
    print("token_codes created!")
    dictionary_of_codes = dict((v, k) for k, v in dictionary_of_codes_reverse.items())  ## .items() instead of the Python 2 .iteritems()
    token_codes_vector = np.eye(5, dtype=np.int16)[token_codes].tolist()
    print("Created one_hot vectors")
    y = token_codes_vector
    return [x_text, y, real_codes, dictionary_of_codes]

def text_cleaner_and_tokenizer(texts):
    """
    Takes a list of sentences, removes punctuation, numbers and stopwords, and stems the rest.
    Then joins everything back together and returns the filtered texts as a list of strings.
    :param texts: list of unprocessed strings
    :return: list of cleaned, stemmed strings
    """
    i = 0
    stopword_list = set(stopwords.words('danish'))
    stemmer = SnowballStemmer("danish", ignore_stopwords=False)
    filtered_texts = []
    for sentence in texts:
        # for symbol in punctuation:
        #     sentence = sentence.replace(symbol, '')
        for num in numbers:
            sentence = sentence.replace(str(num), '')
        sentence = sentence.lower()  ## Python 3 strings are already unicode, so no .decode('utf-8') is needed
        words_in_sentence = word_tokenize(sentence, language='danish')
        filtered_sentence = []
        for word in words_in_sentence:
            if word not in stopword_list:
                stem_word = stemmer.stem(word)
                filtered_sentence.append(stem_word)
        sentence = ' '.join(filtered_sentence)
        filtered_texts.append(sentence)
        i += 1
        if i % 1000 == 0:
            print(i)
    print('Done :D!')
    return filtered_texts

def pad_sentences(sentences):
    """
    Pads all sentences to a fixed length of 16 tokens.
    Sentences are first truncated to 15 tokens, so every sentence gets at least one pad token.
    Returns padded sentences.
    """
    sequence_length = 16  ## max(len(x) for x in sentences)
    padded_sentences = []
    for i in range(len(sentences)):
        sentence = sentences[i]
        sentence = re.sub(r"[^\w]", " ", sentence).split()  ## ADAM: Proper sentence split
        sentence = sentence[:15]  ## ADAM: Limit sentence length
        num_padding = sequence_length - len(sentence)
        for j in range(num_padding):  ## ADAM: Proper padding
            sentence.append(' <P> ')
        ## new_sentence = sentence + [padding_word] * num_padding
        sentence = ' '.join(sentence)
        padded_sentences.append(sentence)
        if i % 1000 == 0:  ## ADAM: See what is printed :D
            print('Padding sentence: ' + str(i))
    return padded_sentences
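
## Illustrative example (hypothetical sentence): a 3-token input like "skat bolig reform"
## is padded with 13 ' <P> ' tokens to reach 16 tokens, while anything longer than
## 15 tokens is truncated to 15 and receives a single pad token.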

def build_vocab(sentences):
    """
    Builds a vocabulary mapping from word to index based on the sentences.
    Returns vocabulary mapping and inverse vocabulary mapping.
    """
    # Build vocabulary
    print("Starting vocabulary")
    word_counts = Counter(itertools.chain(*sentences))
    # Mapping from index to word
    vocabulary_inv = [x[0] for x in word_counts.most_common()]
    # Mapping from word to index
    vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
    print("Vocabulary done!")
    return [vocabulary, vocabulary_inv]

def build_input_data(sentences, labels, vocabulary):
    """
    Maps sentences and labels to vectors based on a vocabulary.
    """
    x = np.array([[vocabulary[word] for word in sentence] for sentence in sentences])  ## Might be useful if I ever need to revert back to sentences
    y = np.array(labels)
    return [x, y]
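
## Illustrative example (hypothetical tokens): for sentences = [['skat', '<P>'], ['bolig', 'skat']],
## build_vocab might return vocabulary = {'skat': 0, '<P>': 1, 'bolig': 2} (most frequent word first),
## and build_input_data then maps the sentences to x = [[0, 1], [2, 0]].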

def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    sentences, labels, real_codes, dictionary_of_codes = load_data_and_labels()  ## Actual loading of data
    sentences_stemmed = text_cleaner_and_tokenizer(sentences)
    sentences_padded = pad_sentences(sentences_stemmed)  ## Runs padding
    sentences_padded = [a.split() for a in sentences_padded]  ## Transforms padded sentences back into lists of words. Needed for efficient vocab building
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)  ## Builds vocabulary
    x, y = build_input_data(sentences_padded, labels, vocabulary)  ## Constructs the array
    id = list(open('./data/id.csv').readlines())
    with open('./data/preprocessed_text.csv', 'w') as file:
        sentences_padded = [' '.join(a) for a in sentences_padded]
        file.write('\n'.join(sentences_padded))
    print("Updated preprocessed texts :D!")
    return [id, x, y, vocabulary, vocabulary_inv, real_codes, dictionary_of_codes]

def quick_load_data():
    """
    Loads pre-padded and stemmed sentences to save time
    """
    sentences, labels, real_codes, dictionary_of_codes = quick_load_data_and_labels()  ## Actual loading of data
    print("Successfully loaded preprocessed data")
    sentences = [a.split() for a in sentences]  ## Transforms padded sentences back into lists of words. Needed for efficient vocab building
    print("Split padding done!")
    vocabulary, vocabulary_inv = build_vocab(sentences)  ## Builds vocabulary
    x, y = build_input_data(sentences, labels, vocabulary)  ## Constructs the array
    id = list(open('./data/id.csv').readlines())
    return [id, x, y, vocabulary, vocabulary_inv, real_codes, dictionary_of_codes]

def get_non_missing(ids, x, y, real_codes):
    """
    Takes lists of the data and removes missing data!
    :param ids: list of document ids
    :param x: array of tokenized, index-mapped sentences
    :param y: one-hot code vectors
    :param real_codes: original code strings ('""' marks missing)
    :return: [id_clean, text_clean, code_clean, real_codes_clean]
    """
    dataset = np.array(list(zip(ids, x, y, real_codes)), dtype=object)  ## list() is needed because zip is lazy in Python 3
    non_miss = dataset[dataset[:, 3] != '""']  ## Keeps rows whose real_code is not missing
    id_clean = non_miss[:, 0].tolist()  ## Takes first column of the non-missing matrix and writes it to a list
    text_clean = non_miss[:, 1]
    code_clean = non_miss[:, 2]
    real_codes_clean = non_miss[:, 3].tolist()
    real_codes_clean = [float(i) for i in real_codes_clean]  ## Turns real_codes into floats for memory efficiency
    real_codes_clean = np.array(real_codes_clean)
    text_clean = np.stack(text_clean, axis=0)  ## Makes everything a 2D array instead of an array of arrays
    code_clean = np.stack(code_clean, axis=0)
    return [id_clean, text_clean, code_clean, real_codes_clean]

def get_missing(ids, x, y, real_codes):
    """
    Takes lists of the data, and returns the lists with only missing data.
    Useful for serious prediction work!
    :param ids: list of document ids
    :param x: array of tokenized, index-mapped sentences
    :param y: one-hot code vectors
    :param real_codes: original code strings ('""' marks missing)
    :return: [id_miss, text_miss, code_miss, real_codes_miss]
    """
    dataset = np.array(list(zip(ids, x, y, real_codes)), dtype=object)  ## list() is needed because zip is lazy in Python 3
    miss = dataset[dataset[:, 3] == '""']  ## Keeps only rows whose real_code is missing
    id_miss = miss[:, 0].tolist()
    text_miss = miss[:, 1]
    code_miss = miss[:, 2]
    real_codes_miss = miss[:, 3].tolist()
    return [id_miss, text_miss, code_miss, real_codes_miss]

def batch_iter(data, batch_size, num_epochs):
    """
    Generates a batch iterator for a dataset.
    FOR THE BINARY CASE!!!
    Balances the batches by drawing a limited random sample of 0-cases matching the number of 1-cases.
    Means I train on all 1-cases each epoch, and a limited number of 0-cases.
    """
    data = np.array(list(data), dtype=object)  ## list() + dtype=object keeps the ragged (text, label, code) rows as a 2D object array
    data1 = data[data[:, 2] == 1.0]  ## Uses real_codes to split into 1-cases and 0-cases
    data0 = data[data[:, 2] == 0.0]
    sample_size = len(data1)
    data0random_sample = data0[np.random.randint(data0.shape[0], size=sample_size)]  ## Samples random 0-cases
    data = np.vstack((data1, data0random_sample))  ## Combines all 1-cases with the 0-case sample
    data = np.delete(data, 2, axis=1)  ## Removes real_codes column
    np.random.shuffle(data)
    data_size = len(data)  ## Now we have a new (balanced) dataset!
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        shuffle_indices = np.random.permutation(np.arange(data_size))
        shuffled_data = data[shuffle_indices]
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
    print("YAY! and stop")