from __future__ import division
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from preprocess import preprocess, lemmatize, tokenize
from nltk.corpus import stopwords
import operator
import cPickle as pickle
from math import log10
from collections import Counter
stopwords = set(stopwords.words('english'))
features = ['tfidf', 'first_occurrence', 'entropy', 'length', 'num_tokens']
# Takes as input a list of raw document strings and a list of gold keyphrase lists (one per doc)
# For instance, with two docs: ['doc 1 string', 'doc 2 string']
def extract_features(docs, keys):
#print "calculating entropy for phrases"
#entropy_all = [get_phrase_entropy(doc, phrase_list) for doc in docs]
tfidf_matrix, phrase_list, first_occurrence_all, entropy_all, idf_vec = get_tfidf_matrix(docs)
X, y = get_feature_matrix(tfidf_matrix, phrase_list, keys, first_occurrence_all, entropy_all)
return X, y, phrase_list, idf_vec
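# A minimal usage sketch (assumes the preprocess/lemmatize/tokenize helpers and the pickled
# noun-phrase set loaded in learn_vocabulary are available); docs are raw strings and keys holds
# one list of gold keyphrases per doc:
#   docs = ['first document text ...', 'second document text ...']
#   keys = [['keyphrase one', 'keyphrase two'], ['another keyphrase']]
#   X, y, phrase_list, idf_vec = extract_features(docs, keys)
#   # X: (num_candidates, len(features)) array, y: parallel list of 0/1 labels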
# extract candidates from a single doc
# phrase_list and idf_vec are from training set
def extract_candidates_doc(doc, phrase_list, idf_vec, training_size = 450):
#vocab = set(phrase_list)
idf_dic = {}
#print "phrase list len", len(phrase_list)
#print "len idf_vec", len(idf_vec)
for i, phrase in enumerate(phrase_list):
idf_dic[phrase] = idf_vec[i]
    print "--extracting NP"
    noun_phrases = set([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
analyzer = vectorizer.build_analyzer()
phrases = list(set([phrase for phrase in analyzer(doc) if valid_ngram(phrase, noun_phrases)]))
doc = preprocess(doc)
#print "candidate phrases", phrases
#tfidf = []
#first_occurrence = []
#entropy = []
#length = []
doc_len = len(doc)
entropy = get_entropy_doc(doc, phrases)
# get feature vectors
features = []
for i, phrase in enumerate(phrases):
        first_occurrence = max(doc.find(phrase), 0) / doc_len  # find returns -1 for an unseen phrase; clamp to 0
tf = doc.count(phrase)
if phrase in idf_dic:
tfidf = tf * idf_dic[phrase]
else:
tfidf = tf * log10(training_size)
feature_vec = get_feature_vector(phrase, tfidf, first_occurrence, entropy[i])
features.append(feature_vec)
return phrases, features
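# Sketch of scoring a single unseen document against a trained vocabulary (phrase_list and
# idf_vec come from a previous extract_features run; new_doc is a hypothetical raw string):
#   phrases, feats = extract_candidates_doc(new_doc, phrase_list, idf_vec)
#   # phrases[i] is a candidate ngram and feats[i] its feature vector in the order of `features`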
# Takes as input a list of raw document strings and a list of gold keyphrase lists (one per doc)
# For instance, with two docs: ['doc 1 string', 'doc 2 string']
def extract_features_test(docs, keys):
tfidf_matrix, phrase_list, first_occurrence_all, entropy_all, idf_vec = get_tfidf_matrix(docs)
#entropy_all = [get_phrase_entropy(doc, phrase_list) for doc in docs]
features_doc, labels_doc, phrase_idx_doc, phrase_list = get_candidates_for_docs(tfidf_matrix, phrase_list, keys, first_occurrence_all, entropy_all)
return features_doc, labels_doc, phrase_idx_doc, phrase_list
# extract noun phrases from text
# implemented by Burton DeWilde, http://bdewilde.github.io/blog/2014/09/23/intro-to-automatic-keyphrase-extraction/
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
import itertools, nltk, string
# exclude candidates that are stop words or entirely punctuation
punct = set(string.punctuation)
# tokenize, POS-tag, and chunk using regular expressions
chunker = nltk.chunk.regexp.RegexpParser(grammar)
tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
for tagged_sent in tagged_sents))
# join constituent chunk words into a single chunked phrase
candidates = [' '.join(word for word, pos, chunk in group).lower()
for key, group in itertools.groupby(all_chunks, lambda (word,pos,chunk): chunk != 'O') if key]
return [cand for cand in candidates \
if cand not in stopwords \
and not all(char in punct for char in cand)]
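# For illustration, the default grammar matches sequences of optional adjectives followed by nouns,
# optionally joined through a preposition, so on a sentence like "Automatic keyphrase extraction
# from scientific articles is studied" it would typically yield a chunk such as
# "automatic keyphrase extraction from scientific articles" (exact output depends on the POS tagger),
# with stopword-only and punctuation-only candidates filtered out.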
# input: doc string, number of chunks N
# output: a list of N evenly split chunks (the last chunk absorbs any remainder)
def get_chunks(doc, N=10):
    chunks = []
    chunk_size = len(doc) // N
    for i in range(N - 1):
        chunks.append(doc[i * chunk_size:(i + 1) * chunk_size])
    chunks.append(doc[(N - 1) * chunk_size:])
    return chunks
# input: doc as a sequence of elements, number of chunks N
# output: a Counter of element frequencies for each of the N chunks
def get_chunk_counts(doc, N=10):
    return [Counter(x) for x in np.array_split(doc, N)]
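# For illustration: get_chunks('abcdefghij', N=3) -> ['abc', 'def', 'ghij'], and
# get_chunk_counts(['a', 'b', 'a', 'c'], N=2) -> [Counter({'a': 1, 'b': 1}), Counter({'a': 1, 'c': 1})].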
# input a phrase, and doc in chunks, return entropy
def get_entropy(phrase, chunks):
p = 0
tf_c = []
for chunk in chunks:
tf_c.append(chunk.count(phrase))
tf = sum(tf_c)
if tf == 0:
return 0
else:
for val in tf_c:
            if val != 0:
p += (-1) * (val / tf) * log10(val / tf)
return p
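# Worked example (base-10 entropy over chunks): if a phrase occurs twice in the first chunk and
# twice in the last of 4 chunks, tf_c = [2, 0, 0, 2], tf = 4, and
# p = -0.5*log10(0.5) - 0.5*log10(0.5) ~= 0.301.
# All occurrences in a single chunk give 0; an even spread over the default 10 chunks gives log10(10) = 1.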
# input: one doc, list of phrases
# output: list of entropy, in given order of phrases
def get_entropy_doc(doc, phrases, N=10):
entropy = []
# split doc into N chunks
chunks = get_chunks(doc, N)
for phrase in phrases:
entropy.append(get_entropy(phrase, chunks))
return entropy
def get_phrase_entropy(doc, phrases, N=10):
entropy = {}
doc = preprocess(doc)
# split doc into N chunks
chunks = get_chunk_counts(doc, N)
for phrase in phrases:
p = 0
tf = sum([c[phrase] for c in chunks])
if tf == 0:
continue
for chunk in chunks:
tf_c = chunk[phrase]
if tf_c != 0:
p += (-1) * (tf_c / tf) * log10(tf_c / tf)
entropy[phrase] = p
return entropy
# filter candidate ngrams: drop single-character strings, ngrams that start or end with a stopword,
# and (when a noun-phrase set is given) anything that is not a known noun phrase
def valid_ngram(ngram, noun_phrases):
if len(ngram) <= 1:
return False
grams = ngram.split()
if grams[0] in stopwords or grams[-1] in stopwords:
return False
# other heuristics for filtering go here...
if noun_phrases:
if ngram not in noun_phrases:
return False
return True
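# For instance (with an empty noun_phrases set, so only the heuristics apply):
#   valid_ngram('the cat', set())        -> False  (leading stopword)
#   valid_ngram('quality of', set())     -> False  (trailing stopword)
#   valid_ngram('x', set())              -> False  (single character)
#   valid_ngram('neural network', set()) -> True
# With a non-empty noun_phrases set, the ngram must additionally appear in that set.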
# learn the vocabulary from a list of docs, computing first-occurrence and entropy scores in the same pass
# candidates are filtered by two kinds of criteria: noun-phrase membership and heuristics
def learn_vocabulary(docs, only_noun_phrases=True):
first_occurrence_all = []
entropy_all = []
#docs = [doc.decode('utf8', 'ignore') for doc in docs]
'''
noun_phrases = set()
if only_noun_phrases:
for i, doc in enumerate(docs):
print "--extracting NP from doc", i
#doc = doc.decode('utf8', 'ignore')
noun_phrases.update([lemmatize(phrase) for phrase in extract_candidate_chunks(doc)])
with open('./semeval_train_docs_noun_phrases.set', 'w') as f:
pickle.dump(noun_phrases, f)
'''
print "loading pre-extracted set of noun_phrases"
noun_phrases = set()
with open('./semeval_train_docs_noun_phrases.set', 'r') as f:
noun_phrases = pickle.load(f)
vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
analyzer = vectorizer.build_analyzer()
vocab = set()
print "--learning vocabulary"
for i, doc in enumerate(docs):
print "--learning doc", i
first_occurrence = {}
entropy = {}
phrases = analyzer(doc) # all phrases from doc
doc = preprocess(doc)
doc_length = len(doc)
chunks = get_chunks(doc)
        for phrase in phrases:
            if valid_ngram(phrase, noun_phrases) and phrase not in first_occurrence:
                # str.find returns -1 (rather than raising) when the phrase is absent
                pos = doc.find(phrase)
                if pos == -1:
                    print "--phrase: '{}' not found".format(phrase)
                    continue
first_occurrence[phrase] = pos / doc_length
# calculate entropy
entropy[phrase] = get_entropy(phrase, chunks)
vocab.add(phrase)
first_occurrence_all.append(first_occurrence)
entropy_all.append(entropy)
print "--size of vocabulary: ", len(vocab)
return vocab, first_occurrence_all, entropy_all
# input: list of docs as strings: ['doc 1 string', 'doc 2 string']
# output: tfidf matrix (each row a doc, each col a phrase, each cell a tfidf score),
# the vocabulary list in the same order as the matrix columns,
# per-doc records of first occurrence and entropy for each valid ngram, and the idf vector
def get_tfidf_matrix(docs):
vocab, first_occurrence_all, entropy_all = learn_vocabulary(docs)
vectorizer = TfidfVectorizer(decode_error='ignore', preprocessor=preprocess, ngram_range=(1, 3), tokenizer=tokenize)
print "--transforming tfidf matrix"
    # restrict the vectorizer to the vocabulary learned above before fitting
    vectorizer.vocabulary = list(vocab)
X = vectorizer.fit_transform(docs)
# get list of phrases in the order of the feature vector
vocab_list = [phrase for phrase, idx in sorted(vectorizer.vocabulary_.items(), key=operator.itemgetter(1))]
assert(len(vocab_list) == X.shape[1])
return X, vocab_list, first_occurrence_all, entropy_all, vectorizer.idf_.tolist()
#return preprocessor, tokenizer, analyze
# Input: a candidate phrase and its tfidf, first-occurrence, and entropy scores
# Output: feature vector for a single keyphrase of size len(features)
def get_feature_vector(phrase, tfidf, first_occurrence, entropy):
#feature_vec = np.zeros((1, len(features)))
feature_vec = []
for f in features:
if f == 'tfidf':
feature_vec.append(tfidf)
elif f == 'first_occurrence':
feature_vec.append(first_occurrence)
elif f == 'entropy':
#if entropy == 0:
#print "phrase {} entropy 0".format(phrase)
feature_vec.append(entropy)
elif f == 'length':
feature_vec.append(len(phrase))
elif f == 'num_tokens':
feature_vec.append(len(phrase.split()))
return feature_vec
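# Example with the default feature order ['tfidf', 'first_occurrence', 'entropy', 'length', 'num_tokens']:
#   get_feature_vector('machine learning', 0.42, 0.05, 0.3) -> [0.42, 0.05, 0.3, 16, 2]
# (16 characters, 2 tokens).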
# input: tfidf_matrix, list of all phrases in vocab, set of all true keywords for each doc
# output: feature matrix (np.array): [[feature vector1], [feature vector2], ...], labels:[0, 1, ...]
def get_feature_matrix(tfidf_matrix, phrase_list, true_keys, first_occurrence, phrase_entropy):
#X = np.empty((0, len(features)))
#y = np.empty(0)
X = []
y = []
doc_tfidf_vecs = tfidf_matrix.toarray().tolist() # tfidf matrix
    # normalize true keywords with the same preprocessing applied to the docs
    true_keys = [[preprocess(key) for key in key_list] for key_list in true_keys]
for doc_id, tfidf_vec in enumerate(doc_tfidf_vecs):
# traverse the doc vector
print "--extracting features from doc {}".format(doc_id)
for i, tfidf in enumerate(tfidf_vec):
            if tfidf != 0:  # a nonzero tfidf means the phrase actually occurs in this doc, i.e. it is a candidate here
feature_vec = get_feature_vector(phrase_list[i], tfidf, first_occurrence[doc_id][phrase_list[i]], phrase_entropy[doc_id][phrase_list[i]])
#X = np.append(X, feature_vec, axis=0)
X.append(feature_vec)
#if feature_vec[2] == 0:
#print "phrase {} entropy 0 in doc {}".format(phrase_list[i], doc_id)
                y.append(1 if phrase_list[i] in true_keys[doc_id] else 0)
return np.array(X), y
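# Shape sketch: candidates are pooled across all docs (one row per phrase with a nonzero tfidf in a
# given doc), so X has shape (num_candidates, len(features)) and y is a parallel list of 0/1 labels.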
def get_candidates_for_docs(tfidf_matrix, phrase_list, true_keys, first_occurrence, entropy_all):
doc_tfidf_vecs = tfidf_matrix.toarray().tolist() # tfidf matrix
# lower true keywords
true_keys = [[key.lower() for key in key_list] for key_list in true_keys]
features_doc = []
labels_doc = []
phrase_idx_doc = []
for doc_id, tfidf_vec in enumerate(doc_tfidf_vecs):
#X = np.empty((0, len(features)))
#y = np.empty(0)
X = []
y = []
phrase_idx = []
# traverse the doc vector
print "--extracting features from doc {}".format(doc_id)
for i, tfidf in enumerate(tfidf_vec):
            if tfidf != 0:  # a nonzero tfidf means the phrase actually occurs in this doc, i.e. it is a candidate here
feature_vec = get_feature_vector(phrase_list[i], tfidf, first_occurrence[doc_id][phrase_list[i]], entropy_all[doc_id][phrase_list[i]])
#X = np.append(X, feature_vec, axis=0)
X.append(feature_vec)
                y.append(1 if phrase_list[i] in true_keys[doc_id] else 0)
phrase_idx.append(i)
features_doc.append(np.array(X))
labels_doc.append(y)
phrase_idx_doc.append(phrase_idx)
return features_doc, labels_doc, phrase_idx_doc, phrase_list
def get_vec_differences_train(X_vec, y_vec):
print "calculating vector difference"
X = np.empty((0, np.size(X_vec, axis=1)))
for i in range(len(X_vec)):
if y_vec[i] == 1:
for j in range(len(X_vec)):
print "difference", i, j
if y_vec[i] > y_vec[j]:
X = np.append(X, [X_vec[i] - X_vec[j]], axis=0)
y = np.ones(X.shape[0])
X = np.append(X, np.multiply(X, -1), axis=0)
y = np.append(y, np.zeros(y.shape[0]))
return X, y
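# This mirrors the pairwise transformation used in ranking approaches such as RankSVM: every
# (keyphrase, non-keyphrase) pair yields one positive difference vector, and the negated copies of
# those differences are appended as the negative class. Small illustration:
#   X_vec = np.array([[1., 2.], [0., 1.], [3., 0.]]); y_vec = [1, 0, 0]
#   X, y = get_vec_differences_train(X_vec, y_vec)
#   # X -> [[ 1.,  1.], [-2.,  2.], [-1., -1.], [ 2., -2.]],  y -> [1., 1., 0., 0.]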
# origin_idx is a matrix, first column is the index for first vector
# second column is the index for second vector
def get_vec_differences(X_vec, y_vec):
## old version
X = np.empty((0, np.size(X_vec, axis=1)))
y = np.empty(0)
origin_idx = np.empty((0,2))
for i in range(len(X_vec)):
if y_vec[i] == 1:
for j in range(len(X_vec)):
print "difference", i, j
if y_vec[i] == y_vec[j]:
continue
elif y_vec[i] > y_vec[j]:
X = np.append(X, [X_vec[i] - X_vec[j]], axis=0)
y = np.append(y, 1)
                    origin_idx = np.append(origin_idx, [[i, j]], axis=0)
elif y_vec[i] < y_vec[j]:
X = np.append(X, [X_vec[i] - X_vec[j]], axis=0)
y = np.append(y, 0)
                    origin_idx = np.append(origin_idx, [[i, j]], axis=0)
return X, y, origin_idx
# def construct_feature_vectors(train_docs):
# for i in range(len(train_docs)):
# # Dict of {index of gram : list of words in gram}
# train_grams = get_grams(train_docs[i])
# for g in train_grams:
# if g in train_keys: # train_keys needs to be normalized I think
# is_keyword = True
# else:
# is_keyword = False
# train_data.append(extract_features(g, train_grams, is_keyword))
# for i in range(len(test_docs)):
# # Dict of {index of gram : list of words in gram}
# test_grams = get_grams(test_docs[i])
# for g in test_grams:
# if g in test_keys: # test_keys needs to be normalized I think
# is_keyword = True
# else:
# is_keyword = False
# test_data.append(extract_features(g, test_grams, is_keyword))
# # train_vec should probably be the first element of test_data, and the label
# # can be the second element of test_data. both are output by extract_features
# # above in the loop