-
Notifications
You must be signed in to change notification settings - Fork 0
/
cron.py
103 lines (84 loc) · 2.9 KB
/
cron.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
import datetime
import json
import re
import sys
import time
import urllib
import urllib2
import summarize as sz
import re
import constants
def get_twfy_birthday(mp_extra_info):
    """Return the MP's date of birth from TheyWorkForYou extra info.

    The value is the raw "YYYY-MM-DD" string from the API, validated with
    time.strptime; returns None when the key is missing or malformed.

    Side effect: writes a progress dot to stdout (this runs once per MP
    in a long batch, so the dots act as a progress indicator).
    """
    sys.stdout.write('.')
    sys.stdout.flush()
    try:
        dob_str = mp_extra_info["date_of_birth"]
        # Parse purely to validate the format; the raw string is returned.
        time.strptime(dob_str, '%Y-%m-%d')
        return dob_str
    except (ValueError, KeyError):
        return None
def get_wiki_birthday(mp_extra_info):
    """Scrape the MP's date of birth from their Wikipedia infobox.

    Fetches the page at mp_extra_info['wikipedia_url'], pulls the
    "Date of birth" table cell, and validates it parses as "D Month YYYY".
    Returns the raw date string, or None when there is no wikipedia_url,
    the page has no matching cell, or the date fails to parse.
    """
    try:
        wiki_url = mp_extra_info['wikipedia_url']
        opener = urllib2.build_opener()
        # Wikipedia blocks the default urllib2 User-Agent, so spoof a browser.
        opener.addheaders = [('User-agent', 'Mozilla/5.0')]
        infile = opener.open(wiki_url)
        try:
            page = infile.read()
        finally:
            infile.close()  # don't leak the HTTP connection
        m = re.search('<td.*?Date of birth</td>\n<td>(.*?)</td>', page)
        dob_str = m.group(1)  # AttributeError (caught below) if no match
        # Parse purely to validate the format; the raw string is returned.
        time.strptime(dob_str, "%d %B %Y")
        return dob_str
    except (ValueError, KeyError, AttributeError):
        return None
MAX_LEN = 1000  # character budget for an MP summary before it gets abridged
def mp_sz(wiki_url):
    """Build a cleaned, possibly abridged summary of the MP's Wikipedia page.

    Summarizes the page at wiki_url, strips wiki artifacts from each
    summary sentence, joins them, and truncates at the first word break
    past MAX_LEN characters. Returns None when no URL is supplied.
    """
    if not wiki_url:
        return None

    def _strip_artifacts(text):
        # Remove "(born ...)" parentheticals, [bracketed] footnote markers,
        # and a leading space.
        return re.sub("(\s\(born.*?\d{4}\))|(\[.*?\])|(^\s)", "", text)

    def _abridge(text):
        # Cut at the first space at or after MAX_LEN and append an ellipsis;
        # if no space follows, keep the text whole.
        pos = MAX_LEN
        while pos < len(text):
            if text[pos] == " ":
                return text[:(pos + 1)] + "..."
            pos += 1
        return text

    cleaned = [_strip_artifacts(s) for s in sz.summarize_page(wiki_url).summaries]
    joined = " ".join(cleaned)
    return _abridge(joined) if len(joined) > MAX_LEN else joined
# Fetch the current list of MPs from the TheyWorkForYou API.
members_resp = urllib2.urlopen("http://www.theyworkforyou.com/api/getMPs?" + urllib.urlencode({
    "key": constants.TWFY_API_KEY,
}))
all_members = json.loads(members_resp.read().decode("latin-1").encode("utf-8"))
# Comma-separated list of every MP's person_id for the bulk info request.
all_ids_str = ','.join(member['person_id'] for member in all_members)
# Fetch the extra info (birthday, wikipedia URL, interests, ...) for all
# MPs in a single keyed-by-person_id JSON object.
info_url = "http://www.theyworkforyou.com/api/getMPsInfo?" + urllib.urlencode({
    "key": constants.TWFY_API_KEY,
    "id": all_ids_str,
})
info_resp = urllib2.urlopen(info_url)
all_mp_extra_info = json.loads(info_resp.read().decode("latin-1").encode("utf-8"))
# Combine the basic listing with the extra info into one record per MP.
all_data = []
for member in all_members:
    extra = all_mp_extra_info[member['person_id']]
    all_data.append({
        'twfy_id': member['person_id'],
        'name': member['name'],
        'constituency': member['constituency'],
        'interests': extra.get('wrans_subjects', None),
        'twfy_dob': get_twfy_birthday(extra),
        'wiki_dob': get_wiki_birthday(extra),
        'wiki_url': extra.get('wikipedia_url', None),
        'summary': mp_sz(extra.get('wikipedia_url', None))
    })
# Persist the combined dataset for the web front end to serve.
with open('data-latest.json', 'w') as outfile:
    json.dump(all_data, outfile)