/
analysis.py
420 lines (364 loc) · 15.7 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
'''
Topic Analysis functionality for the Always Remember project.
Dan Morris 11/3/14 - 11/20/14
'''
import pickle
import numpy as np
import pandas as pd
from mongo_stuff import just_clean_text
from collections import Counter
import simplejson as json
class TopicAnalyzer(object):
'''
Loads the vectorizer and H matrix necessary for document topic analysis.
Performs large-scale analysis on the corpus.
INPUT: filelike - vec_file vectorizer, filelike - H topic-term matrix,
np array - topic_filter booleans
'''
def __init__(self, vec_file, H_file, topic_filter=None):
self.vectorizer = pickle.load(open(vec_file))
self.topic_filter = topic_filter
H = pickle.load(open(H_file))
if topic_filter is not None:
self.H = H[topic_filter]
else:
self.H = H
self.num_topics = self.H.shape[0]
def topic_freq_by_date_range(self, table, start_date, end_date,
n_articles=1, topic_freq_threshold=.1):
'''
Get topic frequencies for all records in a date range. Also returns
the highest-matching document(s) if that topic's relative
frequency is above the topic_freq_threshold.
INPUT: mongo-collection - table, string - start_date,
string - end_date, int - n_articles,
float - topic_freq_threshold
OUTPUT: list - (topic index, topic frequency, example
article(s)) tuples
'''
q = {'pub_date': {'$gte': start_date, '$lte': end_date}}
docs = just_clean_text(table, q)
article_ids = np.array([d[0] for d in docs])
X = self.vectorizer.transform([d[1] for d in docs])
doc_topic_freqs = X.dot(self.H.T)
total_topic_freqs = _normalize_frequencies(doc_topic_freqs.sum(axis=0))
output = [None] * self.num_topics
for t in range(self.num_topics):
if total_topic_freqs[t] > topic_freq_threshold:
tops = np.argsort(doc_topic_freqs[:, t])[::-1][:n_articles]
output[t] = (t, total_topic_freqs[t], article_ids[tops])
else:
output[t] = (t, total_topic_freqs[t], None)
return output
def topic_count_by_date_range(self, table, start_date, end_date,
doc_topic_threshold=.1,
only_best_match=True):
'''
Returns a count of articles that match each topic above a certain
threshold of similarity. More granular and human-interpretable
than topic_freq_by_date_range. If only_best_match: counts
articles for which that topic is the best match. Else: counts
any article above that threshold per topic.
INPUT: mongo-collection - table, string - start_date,
string - end_date, float - doc_topic_threshold,
bool - only_best_match
OUTPUT: np array - count of matching articles per topic
'''
q = {'pub_date': {'$gte': start_date, '$lte': end_date}}
docs = just_clean_text(table, q)
article_ids = np.array([d[0] for d in docs])
texts = [d[1] for d in docs]
article_lengths = _get_article_lengths(texts)
X = self.vectorizer.transform(texts)
doc_topic_freqs = X.dot(self.H.T) / article_lengths
if only_best_match:
best_matches = Counter(doc_topic_freqs.argmax(axis=1))
return np.array([best_matches[i] for i in range(self.num_topics)])
matches = doc_topic_freqs > doc_topic_threshold
return matches.sum(axis=0)
def current_events_analysis(self, table, n_days=7):
'''
Finds just articles from the last n_days for special analysis/output
'''
#TODO
pass
def empire_plot_counts(self, table, start_date='2001-10',
end_date='2014-11', verbose=False, **kwargs):
'''
Gets topic frequencies for every month in range. Output designed
to build a stacked area chart.
INPUT: mongo-collection - table, string - start_date,
string - end_date, bool - verbose,
**kwargs for topic_freq_by_date_range
OUTPUT: dict - freq_table of topic counts keyed by year-month
'''
# build date list
dates = [start_date]
while dates[-1] != _next_month(end_date):
dates.append(_next_month(dates[-1]))
freq_table = {d: [0] * self.num_topics for d in dates}
for d in range(len(dates) - 1):
if verbose:
print 'getting frequencies for ', dates[d]
freq_table[dates[d]] = self.topic_count_by_date_range(table,
dates[d], dates[d+1], **kwargs)
return freq_table
def bake_empire_csv(self, freq_table, csv_file, topic_names=None):
'''
Creates a CSV from the empire_plot_counts output. Easy to plug
into D3 viz!
INPUT: dict - freq_table, filepath - csv_file, list - topic_names
OUTPUT: None
'''
df = pd.DataFrame.from_dict(data=freq_table, orient='index').sort()
#TODO: bake in topic names!
df.to_csv(open(csv_file, 'w'), index_label='date')
def store_topic_weights(self, table, model_name, normalize='linear',
min_doc_length=None, verbose=False):
'''
Calculates topic weights for each record in the table, storing them
back into the record for easy future access. Normalize takes
word count into account:
'linear' - divide by word count
'sqrt' - divide by sqrt of word count
'none' - don't normalize
INPUT: mongo-collection - table, string - model_name,
string - normalizing rule, boolean - verbose
OUTPUT: None
'''
query = {'clean_text': {'$exists': True, '$ne': ''},
model_name: {'$exists': False}}
cursor = table.find(query)
i = 0
for record in cursor:
if verbose:
i += 1
if i % 500 == 0:
print 'updating topics for record ', i
# push through model to get weights
doc = record['clean_text']
L = len(doc.split())
if min_doc_length is not None and L < min_doc_length:
continue
x = self.vectorizer.transform([doc])
dtf = x.dot(self.H.T)
# normalize
if normalize == 'linear':
dtf /= L
elif normalize == 'sqrt':
dtf /= np.sqrt(L)
# store weights
table.update({'_id': record['_id']},
{'$set': {model_name: list(dtf[0])}})
def smooth_time_series(table, model_name, topic_names, output_csv,
                       ranked=True, rank_number=3, topic_threshold=.001,
                       month_interval=3, normalize=False):
    '''
    Time-series topic analysis; counts articles per topic-month which either:
        1) are in the n highest-ranked topics for an article
        2) exceed the given threshold
    Computes a rolling mean of some number of months, and produces a CSV
        which can be plugged into the D3 front-end for visualizing these
        trends over time.
    Normalize divides each time-series by the total number of articles per
        month to get relative frequency rather than count.
    INPUT: mongo-collection - table, string - model_name, list - topic_names,
                string - output_csv, bool - ranked, int - rank_number,
                float - topic_threshold, int - month_interval, bool - normalize
    OUTPUT: None
    '''
    # Start the query a few months before Oct 2001 so the rolling window is
    # already "warm" at the start of the plotted range.
    # NOTE(review): '2001-0' + str(startmonth) only yields a valid date
    # string for month_interval between 1 and 9 — confirm callers never
    # pass a larger interval.
    startmonth = 10 - month_interval
    query = {model_name: {'$exists':True},
             'pub_date': {'$gt': '2001-0' + str(startmonth)},
             'type_of_material':'News'}
    n = table.find(query).count()
    # infer topic count from any one record's stored weight vector
    num_topics = len(table.find_one(query)[model_name])
    cursor = table.find(query)
    # preallocate; filled positionally while walking the cursor once
    ids = [None] * n
    pubdates = [None] * n
    weights = np.zeros((n, num_topics))
    for i, record in enumerate(cursor):
        ids[i] = record['_id']
        pubdates[i] = record['pub_date'][:10]
        weights[i] = record[model_name]
    if ranked:
        # bw is a binary doc-topic matrix: 1 where the topic is among the
        # article's rank_number highest-weighted topics
        tops = np.argsort(weights, axis=1)[:,-rank_number:]
        bw = np.zeros(weights.shape)
        for i, row in enumerate(tops):
            for j in row:
                bw[i, j] = 1
        bdf = pd.DataFrame(bw, index=pd.DatetimeIndex(pubdates))
    else:
        # threshold mode: any topic above topic_threshold counts
        bdf = pd.DataFrame(weights > topic_threshold,
                           index=pd.DatetimeIndex(pubdates))
    bts = [None] * num_topics
    # shift each smoothed series back by ~half the rolling window
    # (presumably to center the mean on its window — confirm against viz)
    offset = pd.offsets.Week(month_interval * 2)
    # NOTE(review): pd.TimeSeries, pd.rolling_mean and resample(how=...)
    # are legacy pandas APIs (removed in modern pandas); this module is
    # pinned to the 2014-era stack.
    if normalize:
        # determine articles per month for scaling
        all_articles_ts = pd.TimeSeries(1, pd.DatetimeIndex(sorted(pubdates)))
        abm = pd.rolling_mean(all_articles_ts.resample('M', how='count'),
                              month_interval)
        for i in range(num_topics):
            # monthly sums per topic, smoothed, then scaled to a fraction
            # of all articles that month
            bts[i] = pd.rolling_mean(pd.TimeSeries(data=bdf[i],
                         index=bdf.index).resample('M', how='sum'),
                         month_interval) / abm
            bts[i].index = bts[i].index - offset
    else:
        for i in range(num_topics):
            bts[i] = pd.rolling_mean(pd.TimeSeries(data=bdf[i],
                         index=bdf.index).resample('M', how='sum'),
                         month_interval)
            bts[i].index = bts[i].index - offset
    # one column per topic; NaNs from the rolling-mean warmup become 0
    outputdf = pd.concat([s for s in bts], axis=1).fillna(0)
    outputdf.columns = topic_names
    outputdf.to_csv(output_csv, index_label='date')
def get_best_articles_overall(table, model_name, topic_names,
                              start_date='2001-09', end_date='2014-11',
                              top_count=25):
    '''
    Finds the highest-weighed articles for each topic using a specified
        model. Returns a dict for further processing.
    INPUT: mongo-collection - table, string - model_name,
                list - topic_names, string - start_date, string - end_date,
                int - top_count
    OUTPUT: dict - lists of article ids keyed by topic
    '''
    query = {model_name: {'$exists':True}, 'type_of_material':'News',
             'pub_date': {'$gt': start_date, '$lt': end_date}}
    n_topics = len(topic_names)
    n_articles = table.find(query).count()
    # gather ids and weight vectors in a single pass over the cursor
    ids = [None] * n_articles
    weight_matrix = np.zeros((n_articles, n_topics))
    for row, record in enumerate(table.find(query)):
        ids[row] = record['_id']
        weight_matrix[row] = record[model_name]
    ids = np.array(ids)
    # for each topic column, take the top_count highest-weighted articles,
    # strongest first
    bests = {}
    for col, topic in enumerate(topic_names):
        ranked = np.argsort(weight_matrix[:, col])[::-1]
        bests[topic] = ids[ranked[:top_count]]
    return bests
def compile_overall_best_article_json(table, model_name, best_articles,
                                      topic_names, outputfile):
    '''
    Takes best_articles dict from get_best_articles_overall, gets extra
        article information from the table, and creates a JSON file that
        the D3 front-end can use to display tooltip articles.
    INPUT: mongo-collection - table, string - model_name,
                dict - best_articles, list - topic_names, string - outputfile
    OUTPUT: None
    '''
    topic_dict = {name:[] for name in topic_names}
    # .items() (not py2-only .iteritems()); the index from the old
    # enumerate was never used, so iterate the articles directly
    for topic, articles in best_articles.items():
        for a in articles:
            # pull display fields for the front-end tooltip
            record = table.find_one({'_id': a})
            d = {'pub_date': record['pub_date'][:10],
                 'lead_paragraph': record['lead_paragraph'],
                 'headline': record['headline'],
                 'web_url': record['web_url']}
            topic_dict[topic].append(d)
    # context manager closes the output file (the original leaked it)
    with open(outputfile, 'w') as f:
        json.dump(topic_dict, f)
def get_best_articles_per_month(table, model_name, start_date='2001-09',
                                end_date='2014-11', verbose=False):
    '''
    Finds the highest-weighed article every month for each topic,
        using a specified model. Returns a dict for further processing.
    INPUT: mongo-collection - table, string - model_name,
                string - start_date, string - end_date, bool - verbose
    OUTPUT: dict - best_articles keyed by month; each value is a list of
                (article _id, weight) tuples, one per topic
    '''
    query = {model_name: {'$exists':True}, 'type_of_material':'News'}
    # topic count inferred from any one stored weight vector
    num_topics = len(table.find_one(query)[model_name])
    # month boundaries from start_date through the month after end_date
    # (the last entry is only used as an exclusive upper bound)
    dates = [start_date]
    while dates[-1] != _next_month(end_date):
        dates.append(_next_month(dates[-1]))
    best_articles = {}
    for d in range(len(dates) - 1):
        if verbose:
            print 'selecting best articles for ', dates[d]
        # NOTE(review): if no article in the month beats weight 0.0 for a
        # topic, the (None, 0.0) placeholder survives into the output —
        # downstream lookups by _id should guard against None
        best_this_month = [(None, 0.0)] * num_topics
        # the base query dict is reused and mutated: only the pub_date
        # window changes per iteration
        query['pub_date'] = {'$gte': dates[d], '$lt': dates[d + 1]}
        cursor = table.find(query)
        for record in cursor:
            w = record[model_name]
            # keep a running per-topic argmax over the month's articles
            for i, v in enumerate(w):
                if v > best_this_month[i][1]:
                    best_this_month[i] = (record['_id'], v)
        best_articles[dates[d]] = best_this_month
    return best_articles
def compile_best_article_json(table, model_name, best_articles, topic_list,
                              outputfile):
    '''
    Takes best_articles dict from get_best_articles_per_month, gets extra
        article information from the table, and creates a JSON file that
        the D3 front-end can use to display tooltip articles.
    INPUT: mongo-collection - table, string - model_name,
                dict - best_articles, list - topic_list, string - outputfile
    OUTPUT: None
    '''
    num_topics = len(topic_list)
    # collect per-topic article lists keyed by topic INDEX first; the
    # keys are swapped for topic names after the loop
    topic_dict = {i:[] for i in range(num_topics)}
    for month, topics in best_articles.iteritems():
        # topics is a list of (article _id, weight) tuples, one per topic
        for i, t in enumerate(topics):
            # NOTE(review): t[0] can be None when a month had no matching
            # article (see get_best_articles_per_month); find_one would
            # then return None and the field accesses below would raise
            record = table.find_one({'_id': t[0]})
            d = {'pub_date': record['pub_date'][:10],
                 'lead_paragraph': record['lead_paragraph'],
                 'headline': record['headline'],
                 'web_url': record['web_url'],
                 'weight': t[1],
                 '_id': t[0],
                 # total weight across all topics for this article
                 'weights_sum': sum(record[model_name])}
            topic_dict[i].append(d)
    # rename the integer keys to human-readable topic names in place
    for i, t in enumerate(topic_list):
        topic_dict[t] = topic_dict.pop(i)
    json.dump(topic_dict, open(outputfile, 'w'))
def filter_best_article_json(filename=None, jsond=None, threshold=.001):
    '''
    Filters out articles below the threshold. Input either a filename for
        a JSON file or the json dictionary itself (jsond is used only
        when filename is None).
    INPUT: string - filename, dict - jsond, float - threshold
    OUTPUT: dict - filtered json
    RAISES: ValueError if neither filename nor jsond is provided
    '''
    if filename is not None:
        # context manager closes the file (the original leaked the handle)
        with open(filename) as f:
            baj = json.load(f)
    else:
        baj = jsond
    if baj is None:
        # explicit error instead of the original's opaque AttributeError
        # on None when both arguments were omitted
        raise ValueError('provide either filename or jsond')
    # .items() works on both py2 and py3 (original used py2-only
    # .iteritems()); keep only articles at or above the threshold
    return {topic: [a for a in articles if a['weight'] >= threshold]
            for topic, articles in baj.items()}
def _normalize_frequencies(f):
'''
Normalizes and returns array f so that it sums to 1.
'''
return f / sum(f)
def _get_article_lengths(docs):
'''
Determines the length of each document in docs for normalizing TFIDF
INPUT: list length n - documents
OUTPUT: n x 1 np array - length of docs
'''
L = np.zeros((len(docs), 1))
for i, d in enumerate(docs):
L[i] = len(d.split())
return L
def _next_month(d):
'''
Given a year-month string, returns a string for the next month.
INPUT: string - d ('YYYY-MM')
OUTPUT: string - d ('YYYY-MM')
'''
y = int(d[:4])
m = int(d[-2:])
if m == 12:
return str(y + 1) + '-01'
elif m < 9:
return str(y) + '-0' + str(m + 1)
else:
return str(y) + '-' + str(m + 1)