# -*- coding: utf-8 -*-
"""
general_functions.py
General functions used within project
"""
def load_pickle(filename, silent=False):
    """
    Loads a pickle from disk and prints progress to screen
    """
    import pickle
    if not silent:
        print "Loading pickle (%s)" % (filename)
    try:
        with open(filename, "rb") as f:
            return pickle.load(f)
    except (IOError, EOFError, pickle.UnpicklingError) as e:
        print "Error loading pickle: %s" % (e)
def save_pickle(data, filename, silent=False):
    """
    Saves a pickle to disk and prints progress to screen
    """
    import pickle
    if not silent:
        print "Saving pickle (%s)" % (filename)
    with open(filename, "wb") as f:
        pickle.dump(data, f)
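# A minimal usage sketch for the two pickle helpers above; the path
# 'data/example.p' is hypothetical and only for illustration:
#
#   records = {'a': 1, 'b': 2}
#   save_pickle(records, 'data/example.p')    # prints: Saving pickle (data/example.p)
#   assert load_pickle('data/example.p') == records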
def write_to_python_module(data, variable_name, filename, silent=False):
    """
    Saves a variable to a python module as a single assignment statement.
    'import datetime' is written first so that datetime reprs inside the
    data can be evaluated when the module is imported.
    """
    with open(filename, 'w') as f:
        f.write('import datetime\n')
        # repr() (rather than str()) keeps strings quoted so the output stays valid Python
        f.write('%s = %s' % (variable_name, repr(data)))
    if not silent:
        print "Saving file (%s)" % (filename)
def export_dict_to_json(data, path):
    """
    Write a python element to JSON
    """
    import json
    with open(path, 'w') as file_to_write:
        json.dump(data, file_to_write)
    print "Exported JSON (%s)" % (path)
def convert_str_to_date(string):
    """
    Converts a string of format u'2013-06-24T23:06:02Z' into a datetime
    at midnight (the time portion is ignored)
    """
    from datetime import datetime
    return datetime.strptime(string[0:10], "%Y-%m-%d")
def convert_datetime_to_str(dt):
    """
    Converts datetime to str in format 2014-7-4 (no zero padding)
    """
    return "%s-%s-%s" % (dt.year, dt.month, dt.day)
def create_REST_request(main, parameters=[]):
    """
    Appends a list of parameters to a URL, separating them all with & signs.
    Output is something like this:
    http://beta.content.guardianapis.com/world?api-key=explorer&page-size=76&order-by=oldest&use-date=published&show-tags=keyword&show-elements=all&show-story-package=true&show-related=true&show-most-viewed=true&show-fields=standfirst%2Cthumbnail%2Cbyline%2Cheadline%2Cbody
    """
    # join() avoids chopping a character off `main` when parameters is empty
    request = main + "&".join(parameters)
    print "Generated the following request: %s" % (request)
    return request
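# Minimal sketch of building a request; 'test' is the public demo key shown
# in the Guardian query example further below:
#
#   url = create_REST_request(
#       'http://content.guardianapis.com/search?',
#       ['api-key=test', 'page-size=10', 'q=germany'])
#   # -> 'http://content.guardianapis.com/search?api-key=test&page-size=10&q=germany'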
def create_stopword_list(extra_words):
    """
    Creates stopword list (adds extra words to the original English set)
    """
    from sklearn.feature_extraction.text import TfidfVectorizer
    original = list(TfidfVectorizer(stop_words='english').get_stop_words())
    if extra_words:
        return frozenset(original + extra_words)
    else:
        return frozenset(original)
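# Illustrative usage; the extra words are arbitrary examples:
#
#   stopwords = create_stopword_list(['cameron', 'minister'])
#   'the' in stopwords       # True (from sklearn's built-in English set)
#   'cameron' in stopwords   # True (added)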
def convert_dict_to_list(dictionary):
    """
    Returns the dictionary's values as a list
    """
    return dictionary.values()
def find_articles_by_tag(articles, tag_list, all_tags_required=True):
    """
    Given articles (list of dict) and a list of tags, returns the list of
    articles which match all tags (default) or any tag
    """
    return_list = []
    # Convert articles to a list if they are currently in dict form
    if isinstance(articles, dict):
        articles = convert_dict_to_list(articles)
    # Convert a string into a list, assuming the string is one tag
    if isinstance(tag_list, basestring):
        tag_list = [tag_list]
        print "Tag list was provided as a string: assuming just one tag. Please supply a list if there is more than one tag."
    # Loop through articles and add each one that meets the conditions
    for a in articles:
        try:
            article_tags = a['tags']
        except (TypeError, KeyError):
            print "ERROR - this should be an article:"
            print a
            continue  # skip malformed entries rather than reusing stale tags
        # Results in a list like [True, False, True, True], one flag per tag provided
        crossover = [x in article_tags for x in tag_list]
        if all_tags_required:
            if sum(crossover) == len(tag_list):
                return_list.append(a)
        elif sum(crossover) > 0:
            return_list.append(a)
    return return_list
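# A small usage sketch (the article dicts below are illustrative only):
#
#   arts = [{'tags': ['World news', 'Iran']}, {'tags': ['UK news']}]
#   find_articles_by_tag(arts, ['World news', 'Iran'])   # -> [first article]
#   find_articles_by_tag(arts, 'UK news')                # string wrapped into a list
#   find_articles_by_tag(arts, ['Iran', 'UK news'], all_tags_required=False)  # -> both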
def print_current_status():
    """
    Print current status of the data on the path defined by options.current_articles_path
    """
    import options
    reload(options)
    print "======"
    print "Status for %s" % options.current_articles_path
    print "======"
    articles = load_pickle(filename=options.current_articles_path)
    cosine_similarities = load_pickle(filename=options.current_articles_path_cosine_similarites)
    # Number of articles crawled
    print "\nArticles crawled:", len(articles)
    # Date range for articles
    print "\nFirst article date:", min([articles[x]['date'] for x in articles])
    print "Latest article date:", max([articles[x]['date'] for x in articles])
    # Number of articles analysed for cosine similarity
    print "\nCosine similarities calculated:", len(cosine_similarities)
    def print_monthly_counts(title, condition):
        """Prints per-month counts of the articles matching condition"""
        dates = [str(articles[x]['date'].year) + '-' + str(articles[x]['date'].month).zfill(2)
                 for x in articles if condition(articles[x])]
        print "\n%s" % title
        for d in sorted(set(dates)):
            print "%s: %s articles" % (d, dates.count(d))
    # Number of articles by month
    print_monthly_counts("Articles crawled by month", lambda a: True)
    # Number of World articles by month
    print_monthly_counts("WORLD articles crawled by month", lambda a: 'World news' in a['tags'])
    # Number of UK articles by month
    print_monthly_counts("UK articles crawled by month", lambda a: 'UK news' in a['tags'])
    # FB shares pulled by month
    print_monthly_counts("FB shares crawled by month",
                         lambda a: type(a['facebook']['snapshot_of_total']) == int)
def search_guardian_by_query(
        query,
        articles,
        pickle_or_database='pickle',
        start_date='2012-01-01',
        end_date='2015-01-01',
        section='world',
        guardian_page_size=100,
        return_number=10):
    """
    Given a query, a start date string, an end date string and a section (i.e. 'world').
    Also given an articles dict of all articles (to check if we have each result saved),
    e.g. load_pickle('../open/data/articles.p').
    Returns a list of articles (and their headlines, etc.) for which we have data saved.
    Format is:
    [{'date': u'2014-02-13T18:47:52Z',
      'headline': u'Execution of Arab Iranian poet Hashem Shaabani condemned by rights groups',
      'id': u'world/2014/feb/13/iran-middleeast',
      'standfirst': u'Poet and leading member of banned cultural organisation run by Ahwazi Arab minority reported hanged after public confession'},
     {'date': u'2014-02-18T19:21:00Z',
      'headline': u"Iran won't discuss military programme, say officials",
      'id': u'world/2014/feb/18/mohammad-javad-zarif-iran-political-will-final-nuclear-agreement',
      'standfirst': u"Iran's foreign minister and senior negotiator say only nuclear issues will be discussed as next round of talks begins in Vienna"}]
    Process:
    1. Forms the query string
    2. Requests a search from the Guardian API and receives results
    3. Filters the results for ones that we have within our database/pickle
    4. Picks out X (return_number) according to the Guardian's relevance ordering
    5. Reorders by date
    """
    from urllib import quote_plus as clean_url
    import urllib2
    import json
    # Query format:
    # "http://content.guardianapis.com/search?api-key=test&page-size=100&q=germany%20football&section=world&from-date=2014-01-01&to-date=2014-10-27"
    main = 'http://content.guardianapis.com/search?'
    parameters = [
        'api-key=uu4qhyqqknrkhrsmahfwr2qs',
        'page-size=%s' % guardian_page_size,
        'q=%s' % clean_url(query),
        'from-date=%s' % start_date,
        'to-date=%s' % end_date,
        # 'show-fields=standfirst'
    ]
    if section:
        parameters.append('section=%s' % section)
    request = create_REST_request(main, parameters)
    response = urllib2.urlopen(request).read()
    page_data = json.loads(response)
    try:
        article_set = page_data['response']['results']
    except KeyError:
        print "Failed to retrieve result set."
        return None
    print "Received %s results from Guardian." % page_data['response']['total']
    print article_set
    # Check if each article is in the articles pickle / in the database (TODO: currently pickle only)
    if pickle_or_database == 'pickle':
        filtered_article_set = [x for x in article_set
                                # Only show if in the articles file
                                if x['id'] in articles
                                # Only show if 2+ articles in the future_articles list
                                if len(articles[x['id']]['f']) >= 2]
    elif pickle_or_database == 'database':
        # TODO: database lookup not yet implemented; fall back to an empty set
        # rather than hitting an undefined filtered_article_set below
        filtered_article_set = []
    print "\nThese are the articles we have saved data for previously:"
    print filtered_article_set
    # Reduce to the number that we want to return to the user, if we have more than this
    short_article_set = filtered_article_set[0:return_number]
    # Sort by date
    sorted_article_set = sorted(short_article_set, key=lambda x: x['webPublicationDate'])
    print "\nThis is the article set sorted by date:"
    print sorted_article_set
    # Return in the correct format, unescaping HTML entities in headlines
    return [{
        'id': x['id'],
        'headline': x['webTitle'].replace('&amp;', '&').replace('&apos;', "'").replace('&quot;', '"'),
        'date': x['webPublicationDate']
    } for x in sorted_article_set]
# articles = load_pickle('../open/data/articles.p')
# r = search_guardian_by_query(
#     query='china democracy',
#     articles=articles,
#     start_date='2013-01-01',
#     end_date='2014-03-01',
#     guardian_page_size=100
# )