/
newsanalysis.py
340 lines (316 loc) · 12 KB
/
newsanalysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
from collections import defaultdict as dd
import datetime as dt
import newsutils as nu
import textutils as tu
import os
import sys
import re
from itertools import chain
import pymongo as pm
def boul(x):
    '''Maps any truthy value to 1 and any falsy value to 0.'''
    if x:
        return 1
    return 0


def tcmp(x, y):
    '''cmp-style comparator that orders two sequences by their item at index 1.

    Returns -1 when x[1] < y[1], 1 when y[1] < x[1] and 0 otherwise.
    '''
    if x[1] < y[1]:
        return -1
    return boul(y[1] < x[1])


def strtodate(date):
    '''Parses a 'YYYYMMDD' string into a datetime.datetime.'''
    return dt.datetime.strptime(date, '%Y%m%d')


def datetostr(date):
    '''Formats a datetime.datetime as a 'YYYY-MM-DD' string.'''
    return dt.datetime.strftime(date, '%Y-%m-%d')
def find_edges_with_entity(entity, unique = False, filehandle='', dbhandle=''):
    '''Returns all edges in the keywords graph that contain the entity.

    Only one backend is consulted; filehandle takes precedence over dbhandle.

    File backend: every line is one comma-separated edge. An edge matches
    when the line starts with entity or when its second field starts with
    entity. Matches are returned as lists of fields, first-field matches
    before second-field matches. Lines without a second field are ignored.

    DB backend: entity is used as a regular expression (re.search semantics,
    not escaped) against 'keyword1' and 'keyword2' of the 'edges' collection.
    Each match is returned as a dict with keys keyword1, keyword2, source,
    pid and date, oriented so that 'keyword1' is the keyword that matched
    entity and 'keyword2' is the co-occurring keyword.

    @param entity keyword (file backend: prefix, DB backend: regex) to find
    @param unique if True, return set of distinct co-occurring keywords
                  (only honoured by the DB backend)
    @param filehandle path of a CSV edge file
    @param dbhandle database object indexable by collection name
    @return list of edges, a set of keywords when unique is set on the DB
            backend, or [] when neither backend is given
    '''
    if filehandle:
        with open(filehandle) as edge_file:
            all_edges = [line.strip() for line in edge_file]
        edges_with_entity = [edge.split(',') for edge in all_edges
                             if edge.startswith(entity)]
        for edge in all_edges:
            fields = edge.split(',')
            # Blank or single-field lines have no second keyword to test.
            if len(fields) > 1 and fields[1].startswith(entity):
                edges_with_entity.append(fields)
        return edges_with_entity

    if not dbhandle:
        return []

    kwe = dbhandle['edges']
    pattern = re.compile(entity)

    def oriented(edge, matched, other):
        # Put the keyword that matched the entity under 'keyword1'.
        return {'keyword1': edge[matched],
                'keyword2': edge[other],
                'source': edge['source'],
                'pid': edge['pid'],
                'date': edge['date']}

    edges_with_entity = [oriented(edge, 'keyword2', 'keyword1')
                         for edge in kwe.find({'keyword2': pattern})]
    edges_with_entity.extend(oriented(edge, 'keyword1', 'keyword2')
                             for edge in kwe.find({'keyword1': pattern}))
    if unique:
        # Dicts are unhashable, so a set of whole edges cannot be built;
        # return the distinct co-occurring keywords instead.
        return set(edge['keyword2'] for edge in edges_with_entity)
    return edges_with_entity
def test_find_edges_with_entity_dbhandle():
    '''DB smoke test: prints a histogram of keywords co-occurring with 'irs'.

    Connects with pymongo's default settings, reads the 'news' database and
    prints one 'keyword: count' line per co-occurring keyword, ordered by
    ascending count.
    '''
    news_db = pm.Connection()['news']
    # Non-unique lookup: every matching edge contributes to the count.
    histogram = dd(int)
    for edge in find_edges_with_entity(entity='irs', dbhandle=news_db):
        histogram[edge['keyword2']] += 1
    by_count = sorted(histogram.items(), key=lambda pair: pair[1])
    print('\n'.join('%s: %d' % (keyword, count) for keyword, count in by_count))
def test_find_edge_timeseries_dbhandle():
    '''DB smoke test: prints the edges containing 'immigration' by date.

    Each printed line is 'source,date,keyword1,keyword2' with every field
    encoded as UTF-8; lines are ordered by the edge's date field.
    '''
    news_db = pm.Connection()['news']
    rows = []
    for edge in find_edges_with_entity(entity='immigration', dbhandle=news_db):
        rows.append((edge['source'], edge['date'],
                     edge['keyword1'], edge['keyword2']))
    # Index 1 of each row is the date.
    rows.sort(key=lambda row: row[1])
    lines = []
    for row in rows:
        lines.append(','.join([str(field.encode('utf8')) for field in row]))
    print('\n'.join(lines))
def plot_cooccurring_entitytimeseries_dbhandle():
    '''Placeholder: only obtains a handle on the 'news' database.

    Nothing is plotted or returned yet.
    '''
    dbhandle = pm.Connection()['news']
def plot_multiple_entitytimeseries_dbhandle(entities = None, output_file = 'immigration.html'):
    '''Plots timeseries for multiple entities.

    For every entity, counts the edges in the 'news' database that mention
    it per day and adds that series to a single nvd3 line chart, which is
    then written to output_file as HTML.

    @param entities iterable of entity names; None or empty draws no series
    @param output_file path of the HTML file to (over)write
    '''
    # Local imports: only this function needs them.
    import time
    from nvd3 import lineChart

    db_conn = pm.Connection()
    try:
        dbhandle = db_conn['news']
        chart = lineChart(name='lineChart', height=400, width=700, date=True)
        for entity in entities or []:
            edges_with_entity = find_edges_with_entity(entity=entity, dbhandle=dbhandle)
            mentions = dd(int)
            for edge in edges_with_entity:
                day = strtodate(edge['date'])
                # Local-time epoch seconds; time.mktime replaces the
                # non-portable strftime('%s') extension.
                mentions[int(time.mktime(day.timetuple()))] += 1
            stamps = sorted(mentions)
            xdata = [stamp * 1000 for stamp in stamps]  # the chart takes milliseconds
            ydata = [mentions[stamp] for stamp in stamps]
            extra_serie = {"tooltip": {"y_start": "", "y_end": " mentions"}}
            chart.add_serie(name=entity, y=ydata, x=xdata, extra=extra_serie)
        chart.buildhtml()
    finally:
        db_conn.close()
    # Open the output only once the chart exists, so a failure above does
    # not leave an empty file behind.
    with open(output_file, 'w') as of:
        of.write(chart.htmlcontent)
def test_plot_multiple_entitytimeseries_dbhandle():
    '''Smoke test: charts 'oklahoma' and 'sandy' mentions into viz.html.'''
    entities = ['oklahoma', 'sandy']
    plot_multiple_entitytimeseries_dbhandle(entities, output_file='viz.html')
def find_cooccurring_entities(entity1, entity2, filehandle = '', dbhandle = ''):
    '''Finds edges with (entity1, entity2) or (entity2, entity1).

    Both entities are used as regular expressions (re.search semantics, not
    escaped) against the 'edges' collection. The title of each edge's
    article is looked up in the 'articles' collection by integer pid.

    @param entity1 regex for one keyword of the edge
    @param entity2 regex for the other keyword of the edge
    @param filehandle not supported yet; a truthy value raises
    @param dbhandle database object indexable by collection name
    @return list of (keyword1, keyword2, source, date, pid, title) tuples;
            title is '' when no article (or no title) is stored for the pid.
            Returns [] when no dbhandle is given.
    @raise NotImplementedError when filehandle is given
    '''
    if filehandle:
        # This branch used to fall through to an unbound local; fail with an
        # explicit error instead.
        raise NotImplementedError(
            'find_cooccurring_entities does not support filehandle yet')
    if not dbhandle:
        return []

    kwe = dbhandle['edges']
    articles = dbhandle['articles']
    pattern1 = re.compile(entity1)
    pattern2 = re.compile(entity2)
    query = {'$or': [{'keyword1': pattern1, 'keyword2': pattern2},
                     {'keyword2': pattern1, 'keyword1': pattern2}]}
    matching_edges = []
    for item in kwe.find(query):
        # int() promotes to long on Python 2 when needed, so it replaces long().
        article = articles.find_one({'pid': int(item['pid'])})
        title = (article or {}).get('title', '')
        matching_edges.append((item['keyword1'], item['keyword2'],
                               item['source'], item['date'], item['pid'],
                               title))
    return matching_edges
def test_find_cooccurring_entities():
    '''DB smoke test: prints edges that pair 'irs' with 'rubio'.

    Each edge tuple is printed as one comma-joined line.
    '''
    news_db = pm.Connection()['news']
    edges = find_cooccurring_entities(entity1='irs', entity2='rubio',
                                      dbhandle=news_db)
    rows = [','.join(item) for item in edges]
    print('\n'.join(rows))
def find_entity_timeseries(entity, filehandle = '', dbhandle = ''):
    '''Retrieves the chronological list of occurrences of an entity.

    The entity is lower-cased and passed through tu.strip_whitespace before
    the lookup. Each element of the result is an
    (entity, date, pid, source) tuple where date is a datetime and pid an
    int. Duplicates are removed and items are sorted in order of (date, pid).

    File edges are read positionally as (k1, k2, date, source, pid); DB
    edges are read by their 'date', 'source' and 'pid' keys.

    @param entity
    @param filehandle
    @param dbhandle
    @return sorted list of (entity, date, pid, source) tuples
    '''
    entity = tu.strip_whitespace(entity.lower())
    edges_with_entity = find_edges_with_entity(entity, filehandle=filehandle, dbhandle = dbhandle)
    occurrences = set()  # a set drops duplicate occurrences
    for edge in edges_with_entity:
        if isinstance(edge, dict):
            # DB backend yields dicts; positional indexing would raise KeyError.
            date, source, pid = edge['date'], edge['source'], edge['pid']
        else:
            date, source, pid = edge[2], edge[3], edge[4]
        occurrences.add((entity, strtodate(date), int(pid), source))
    # Sort by date, then pid, so the order is deterministic.
    return sorted(occurrences, key=lambda item: (item[1], item[2]))
def test_find_entity_timeseries():
    '''Smoke test: prints the 'immigration' timeseries read from total.csv.

    One line per occurrence: 'immigration:<YYYY-MM-DD>, <pid>, <entity>'.
    '''
    ts = find_entity_timeseries('immigration', filehandle='total.csv')
    lines = []
    for item in ts:
        lines.append('immigration:%s, %d, %s' %
                     (datetostr(item[1]), item[2], item[0]))
    print('\n'.join(lines))
def find_numerical_sentences(precontext=1, postcontext=1, dbhandle = '', filename = '', pid = '', date='', source=''):
    '''Retrieves all sentences with numerical values in them, together with
    the sentences around them.

    The file is read as HTML, reduced to its body with tu.get_body and split
    with tu.get_sentences. A sentence is numerical when it contains an
    integer or decimal number (one or more digits, optionally followed by
    '.' and more digits).

    Each result is a dict with keys:
      'sentence'  the numerical sentence
      'num'       always [] for now (see TODO below)
      'prolog'    up to precontext preceding sentences, nearest first
      'epilog'    up to postcontext following sentences, nearest first

    When dbhandle is given, one document
    {'date', 'source', 'pid', 'numbers': <results>} is inserted into its
    'trends' collection.

    @param precontext number of sentences before numerical sentence.
    @param postcontext number of sentences after numerical sentence.
    @param dbhandle optional database object indexable by collection name
    @param filename path of the HTML file to analyse
    @param pid stored verbatim in the 'trends' document
    @param date stored verbatim in the 'trends' document
    @param source stored verbatim in the 'trends' document
    @return list of context dicts, in document order
    '''
    with open(filename) as html_file:
        html = html_file.read()
    sentences = tu.get_sentences(tu.get_body(html=html))
    # The decimal part is optional, so single-digit numbers match too.
    npat = re.compile(r'[0-9]+(?:\.[0-9]+)?')
    total = len(sentences)
    numerical_sentences = []
    for index, sentence in enumerate(sentences):
        if not npat.search(sentence):
            continue
        context = {}
        context['sentence'] = sentence
        #TODO: [{entity:(number, measurement unit (%, km, Rs?), category (time?))}]
        context['num'] = []
        # Explicit bounds: index 0 is a valid neighbour, and negative
        # indices must not wrap around to the end of the document.
        context['prolog'] = [sentences[index - 1 - b] for b in range(precontext)
                             if index - 1 - b >= 0]
        context['epilog'] = [sentences[index + 1 + a] for a in range(postcontext)
                             if index + 1 + a < total]
        numerical_sentences.append(context)
    if dbhandle:
        trends_coll = dbhandle['trends']
        trends_coll.insert({'date': date, 'source': source, 'pid': pid,
                            'numbers': numerical_sentences})
    return numerical_sentences
def test_find_numerical_sentences():
    '''Smoke test: prints the numerical sentences found in foo.html.

    A sentence is printed with its nearest preceding and following sentence
    when it has both, and on its own otherwise.
    '''
    # Still to cover: boundary conditions, the db path.
    # sanity
    numerical_sentences = find_numerical_sentences(filename='foo.html')
    for item in numerical_sentences:
        # Logical 'and': a bitwise & of the two lengths is 0 for e.g. 1 & 2.
        if item['prolog'] and item['epilog']:
            print('num: %s\nBEFORE: %s\nAFTER: %s\n' %
                  (item['sentence'], item['prolog'][0], item['epilog'][0]))
        else:
            print('num: %s\n' % item['sentence'])
def find_frequent_coentities(entity, threshold = 50, filehandle = '', dbhandle = ''):
    '''Retrieves commonly occurring entities along with a given entity.

    The entity is lower-cased and passed through tu.strip_whitespace first.
    filehandle takes precedence over dbhandle.

    File backend: assumes CSV lines whose first two fields are the two
    keywords of an edge. An edge counts once towards the keyword on the
    other side of the field that starts with entity; lines with fewer than
    two fields are ignored.

    DB backend: uses find_edges_with_entity, whose dicts carry the
    co-occurring keyword under 'keyword2'.

    @param entity
    @param threshold cut-off for the result size; the slice keeps
                     threshold + 1 items
                     NOTE(review): looks like an off-by-one - confirm with
                     callers before changing
    @param filehandle
    @param dbhandle
    @return list of (coentity, count) pairs, most frequent first, or None
            when neither filehandle nor dbhandle is given
    '''
    entity = tu.strip_whitespace(entity.lower())
    coentities = dd(int)
    if filehandle:
        with open(filehandle) as edge_file:
            for line in edge_file:
                fields = line.strip().split(',')
                if len(fields) < 2:
                    continue  # no keyword pair on this line
                if fields[0].startswith(entity):
                    coentities[fields[1]] += 1
                elif fields[1].startswith(entity):
                    coentities[fields[0]] += 1
    elif dbhandle:
        edges_with_entity = find_edges_with_entity(entity, dbhandle=dbhandle,
                                                   unique=False)
        for edge in edges_with_entity:
            # 'keyword1' is the entity itself; count its partner.
            coentities[edge['keyword2']] += 1
    else:
        return None
    ranked = sorted(coentities.items(), key=lambda pair: pair[1])
    ranked.reverse()
    return ranked[:(threshold+1)]
def test_find_frequent_coentities():
    '''Smoke test on total.csv (file backend, no db).

    Prints the merged timeseries of 'economic policy' and of each of its
    frequent co-entities, ordered by date, one 'entity,date,pid,source'
    line per occurrence.
    '''
    edge_csv = 'total.csv'
    seed = 'economic policy'
    coentities = find_frequent_coentities(entity=seed, filehandle=edge_csv)
    # Start from the seed's own timeseries, then append each co-entity's.
    ts = find_entity_timeseries(entity=seed, filehandle=edge_csv)
    for coentity in coentities:
        ts.extend(find_entity_timeseries(entity=coentity[0], filehandle=edge_csv))
    ts.sort(key=lambda item: item[1])
    lines = ['%s,%s,%d,%s' % (item[0], datetostr(item[1]), item[2], item[3])
             for item in ts]
    print('\n'.join(lines))
def update_pid_db(news_sources = None, logfile = '/tmp/dbstate.txt',
                  main_path = '/home/shankar/work/data/news/'):
    '''Updates the pid-title db with articles downloaded since the last state.

    For every source directory under main_path, reads the first file whose
    name ends in '.log.txt'. Each line is expected to be tab-separated:
    pid, date ('%a %b %d %H:%M:%S %Y'), title. Lines after the last
    processed one are inserted into the 'articles' collection as
    {'source', 'pid', 'date' (YYYYMMDD), 'title'}, and the number of lines
    seen is stored per source in the 'state' collection under
    'last_logged_line'.

    @param news_sources source directory names; all entries of main_path
                        when empty or None
    @param logfile unused; kept so existing callers keep working
    @param main_path directory holding one sub-directory per source
    '''
    if not news_sources:
        news_sources = os.listdir(main_path)
    db_conn = pm.Connection()
    try:
        dbhandle = db_conn['news']
        for source in news_sources:
            source_dir = os.path.join(main_path, source)
            source_logs = [f for f in os.listdir(source_dir)
                           if f.endswith('.log.txt')]
            if not source_logs:
                continue  # nothing downloaded for this source yet
            source_state = dbhandle['state'].find_one({'source': source})
            last_logged_line = (source_state or {}).get('last_logged_line', 0)
            with open(os.path.join(source_dir, source_logs[0])) as log:
                loglines = log.readlines()
            for line in loglines[last_logged_line:]:
                fields = line.strip().split('\t')
                try:
                    pid = int(fields[0])
                    date = dt.datetime.strptime(fields[1], '%a %b %d %H:%M:%S %Y')
                    title = fields[2]
                except (ValueError, IndexError):
                    continue  # malformed log line: skip it
                dbhandle['articles'].insert({'source': source,
                                             'pid': pid,
                                             'date': date.strftime('%Y%m%d'),
                                             'title': title})
            if source_state is None:
                dbhandle['state'].insert({'source': source,
                                          'last_logged_line': len(loglines)})
            else:
                dbhandle['state'].update({'source': source},
                                         {"$set": {'last_logged_line': len(loglines)}})
    finally:
        db_conn.close()
def macro_get_articles_from_db(keyword=''):
    '''Macro function only to be used from command line. Prints and returns
    a sorted list of (date, pid, source, title) items for the articles
    containing keyword either as an edge keyword or in the title.

    keyword is used as a regular expression (not escaped); the title match
    ignores case, the keyword match does not.

    @param keyword
    @return the sorted list, or None (after printing a notice) when keyword
            is empty
    '''
    if not keyword:
        print('No keyword given.')
        return None

    c = pm.Connection()
    try:
        db = c.news
        edges = db.edges
        articles = db.articles
        pattern = re.compile(keyword)
        # Normalise pids to int before de-duplicating so that '7' and 7
        # cannot both survive and produce the same article twice.
        pids = set()
        for item in edges.find({'$or': [{'keyword1': pattern},
                                        {'keyword2': pattern}]}):
            pids.add(int(item['pid']))
        for item in articles.find({'title': re.compile(keyword, re.IGNORECASE)}):
            pids.add(int(item['pid']))
        found = [articles.find_one({'pid': pid}) for pid in pids]
    finally:
        c.close()
    # Edges may reference pids with no stored article; those are skipped.
    l = sorted((i['date'], i['pid'], i['source'], i['title'])
               for i in found if i)
    print('\n'.join(['%s,%d,%s: %s' % i for i in l]))
    return l
if __name__ == '__main__':
    # Ad-hoc driver: enable exactly one of the calls below to run it.
    #sys.settrace(build_keyword_graph_single_folder)
    #test_build_keyword_graph_single_folder()
    #build_keyword_graph_single_folder(path='/home/shankar/work/data/news/reuters/', of='reuters-graph.csv')
    #test_find_frequent_coentities()
    #test_find_entity_timeseries()
    #init_keyword_db_wrapper()
    #update_keyword_db()
    #test_find_edges_with_entity_dbhandle()
    #test_find_edge_timeseries_dbhandle()
    #test_find_cooccurring_entities()
    #test_plot_multiple_entitytimeseries_dbhandle()
    #update_pid_db(news_sources=['firstpostin', 'reuters', 'nytimes'])
    test_find_numerical_sentences()