import datetime
import gc
import logging
import os
import re  # regular expressions, used to filter the links on the index page
import shutil
import sys

import pandas as pd  # data analysis package, used to build and write the CSVs
import requests  # sends the HTTP requests
from bs4 import BeautifulSoup  # parses HTML/XML; there are faster alternatives but this one is familiar

from my_website import upload_log
logger = logging.getLogger(__name__)
def download_csvs(csv_file_directory):
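    """Download every FHRS open-data XML file and write one CSV per local authority.

    csv_file_directory: directory the dated CSV files are written to. Any existing
    CSVs in that directory whose names contain "__" are deleted first.
    """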
# shutil.rmtree(csv_file_directory)
# os.makedirs(csv_file_directory)
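    # Clear out CSVs left over from a previous run (their names contain "__")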
files = os.listdir(csv_file_directory)
files = [f for f in files if ".csv" in f]
files = [os.path.join(csv_file_directory,f) for f in files if "__" in f]
for f in files:
os.remove(f)
    url = 'http://ratings.food.gov.uk/open-data/en-GB'
    # Try to download the index page, making at most 10 attempts before giving up.
    r = None
    for attempt in range(10):
        try:
            r = requests.get(url)
            break
        except Exception as e:
            logger.error(str(e))
    if r is None:
        logger.error("Could not download the index page at " + url)
        sys.exit()
    data = r.text  # HTML of the index page, which lists every XML download link
    soup = BeautifulSoup(data, "html.parser")  # parse the HTML so we can pull out the links
    # Get every hyperlink on the page that is in English and points at FHRS data.
    # re.compile is used to filter on the link text and the href attribute.
    all_links = soup.find_all("a", text=re.compile('English'), href=re.compile('FHRS'))
del r
del data
del soup
logger.debug(str(len(all_links)) + " links were found")
    # Pull the href (the XML file URL) out of each link tag
    links = [l["href"] for l in all_links]
    if len(links) < 350:
        logger.error("Fewer than 350 XML files were found, so something has gone wrong")
        sys.exit()
    # links now contains the URL of every XML file we want to visit and download
#links = [link for link in links if "324" in link]
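    # Use a set so that failed links can be added back and retried without creating duplicates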
links_to_do = set(links)
#this is a list of fields that we want in our final table of data
fieldslist = ["FHRSID",
"LocalAuthorityBusinessID",
"BusinessName",
"BusinessType",
"BusinessTypeID",
"RatingValue",
"RatingKey",
"RatingDate",
"LocalAuthorityCode",
"LocalAuthorityName",
"LocalAuthorityWebSite",
"LocalAuthorityEmailAddress",
"Hygiene",
"Structural",
"ConfidenceInManagement",
"SchemeType",
"Longitude",
"Latitude",
"AddressLine1",
"AddressLine2",
"AddressLine3",
"PostCode",
"AddressLine4",
"RightToReply",
"NewRatingPending"
]
#convert to lowercase
fieldslist = [x.lower() for x in fieldslist]
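    # Note: html.parser lowercases element names as it parses, so the field names are lowercased to match what find() will see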
    # Date stamp used to prefix every output CSV file name
    date_string = datetime.date.today().strftime("%Y%m%d")
    # Counters used to track progress and to decide when to give up entirely
    counter_for_done = 0
    counter_for_error = 0
all_links_len = len(links_to_do)
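    # Track how many times each link has come back empty so we can give up after repeated failures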
failed_count_dict = {link:0 for link in links_to_do}
while len(links_to_do)>0:
        if counter_for_done % 10 == 0:
logger.debug("completed " + str(counter_for_done) + " xml downloads")
upload_log()
if counter_for_error > all_links_len/3:
logger.error("Even after retrying the downloads, we were unable to download all the links. Exiting")
sys.exit()
this_link = links_to_do.pop()
#download data
        try:
            r = requests.get(this_link)
        except Exception as e:
            logger.error(str(e))
            sys.exit()
if "Internal Server Error" in r.text:
links_to_do.add(this_link)
logger.debug("Internal server error on link: " + this_link)
continue
#parse data
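        # The response appears to be UTF-8 bytes labelled as Latin-1, so re-decode it to recover accented characters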
try:
unicode_text = r.text.encode("latin1").decode("utf-8")
        except Exception:
            logger.debug("Can't re-decode the text response from latin1 to utf-8 on link: " + this_link)
            continue
try:
soup = BeautifulSoup(unicode_text)
del r
        except Exception:
#If this goes wrong put link back into pile
links_to_do.add(this_link)
logger.debug("Can't convert to soup on link: " + this_link)
continue
#find list of establishments
try:
est = soup.find_all("establishmentdetail")
        except Exception:
links_to_do.add(this_link)
logger.debug("Can't find establishmentdetail in link: " + this_link)
continue
        # Some responses parse correctly but contain no establishments; retry those a few times before giving up
        if len(est) < 1:
            failed_count_dict[this_link] += 1
if failed_count_dict[this_link] > 3:
#Give up on this one
counter_for_error +=1
logger.debug("Can't find any establishmentdetails in link even after 3 attempts: " + this_link)
continue
else:
#Try again
links_to_do.add(this_link)
continue
        # finalarr will hold one dictionary per establishment; each dictionary becomes a row in the output CSV
        finalarr = []
        # For each establishment, find the text of each field and add it to a dictionary
for i in est:
this_dict = {}
for j in fieldslist:
te = None
try:
te = i.find(j).text
            except Exception:
                pass
this_dict[j] = te
finalarr.append(this_dict) #add dictionary to array
#Check that the csv looks ok:
df = pd.DataFrame(finalarr)
        # Does it have at least one row?
        if df.shape[0] < 1:
            links_to_do.add(this_link)
            logger.debug("Can't find any premises in link: " + this_link)
continue
#Now write this to csv file
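        # Output file name: <YYYYMMDD>__<XML file id>.csv, with the id taken from the download URL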
file_name = this_link.replace(r"http://ratings.food.gov.uk/OpenDataFiles/","").replace("en-GB.xml","")
file_name = os.path.join(csv_file_directory, date_string+"__"+file_name+".csv")
df.to_csv(file_name, encoding="utf-8", index=False)
counter_for_done +=1
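        # Drop the large objects and force a collection to keep memory use down between files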
del df
del finalarr
gc.collect()
    for link, fail_count in failed_count_dict.items():
        if fail_count > 3:
            logger.warning("the file " + link + " contained no establishments")
logger.info("completed successfully")
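

# Example usage (a sketch; it assumes the CSV directory already exists and that
# basic logging configuration is appropriate for your setup):
# if __name__ == "__main__":
#     logging.basicConfig(level=logging.DEBUG)
#     download_csvs("csv_files")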