/
tallriksskrapan.py
260 lines (224 loc) · 8.69 KB
/
tallriksskrapan.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
# -*- coding: utf-8 -*-
import requests
import urllib.request
import io
import json
from helpers import utf8text
from lxml import html
from pdfminer.pdfparser import PDFParser, PDFDocument
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LAParams, LTTextBox, LTTextLine
from docx import Document
# Current week number. Starts as the int 0; parse_vecka() replaces it with the
# text of a <time> element (a string), and the parsers below concatenate it
# into the strings they search for.
week_number = 0
# Previous week's number as a string; parse_sodersKalla() searches for it to
# detect a menu that has not been updated yet.
lastWeek = "0"
def parse_vecka():
    """Look up the current week number on vecka.nu and publish it globally.

    The text of the last non-empty ``<time>`` element on the page is stored
    in the module-level ``week_number``, and the week before it is stored in
    the module-level ``lastWeek`` (both as strings). When no usable element
    is found, both globals are left untouched.

    Returns:
        A Swedish status message containing the current ``week_number``.

    Raises:
        ValueError: if the scraped text is not an integer.
    """
    # Both names must be declared global: assigning ``lastWeek`` without the
    # declaration only creates a local and the module value never changes.
    global week_number, lastWeek

    answer = requests.get('http://www.vecka.nu')
    root = html.fromstring(answer.text)

    scraped = None
    for child in root.xpath('//time'):
        text = (child.text or "").strip()
        if text:
            scraped = text

    if scraped is not None:
        week_number = scraped
        # NOTE(review): week 1 is assumed to follow week 52; years with 53
        # ISO weeks are not handled here.
        lastWeek = "52" if scraped == "1" else str(int(scraped) - 1)

    return 'Det är nu vecka %s' % week_number
def parse_kompassen():
    """Return the Kompassen header followed by the text found after "fredag".

    Walks the menu ``div`` elements in document order; the first one whose
    text mentions "fredag" acts as a marker, and the text of every later
    non-empty ``div`` is appended to the result.
    """
    response = requests.get('http://www.restaurangkompassen.se/index.php?option=com_content&view=article&id=64&Itemid=66')
    tree = html.fromstring(response.text)

    pieces = ["### KOMPASSEN ###\n"]
    past_friday = False
    for node in tree.xpath('//div[@class="screen"]/div/div/div'):
        text = node.text
        if not text:
            # Elements without leading text neither match nor contribute.
            continue
        if past_friday:
            pieces.append(text)
        elif "fredag" in text.lower():
            past_friday = True
    return "".join(pieces)
def parse_teknikparken():
    """Return the Teknikparken header followed by the text found after "fredag".

    Walks the menu ``div`` elements in document order; the first one whose
    text mentions "fredag" acts as a marker, and the text of every later
    non-empty ``div`` is appended to the result.
    """
    response = requests.get('http://www.restaurangteknikparken.se/index.php?option=com_content&view=article&id=46')
    tree = html.fromstring(response.text)

    pieces = ["### TEKNIKPARKEN ###\n"]
    past_friday = False
    for node in tree.xpath('//div[@class="screen"]/div/div/div'):
        text = node.text
        if not text:
            # Elements without leading text neither match nor contribute.
            continue
        if past_friday:
            pieces.append(text)
        elif "fredag" in text.lower():
            past_friday = True
    return "".join(pieces)
def parse_gs():
    """Return the Gourmetservice header followed by two menu paragraphs.

    The second and third ``<p>`` elements inside the ``left_holder`` div are
    used; their full text content is concatenated without a separator.
    """
    response = requests.get('http://www.geflegourmetservice.se/lunch.php')
    tree = html.fromstring(response.text)
    paragraphs = tree.xpath('//div[@class="left_holder"]/p')[1:3]
    body = "".join(paragraph.text_content() for paragraph in paragraphs)
    return "### Gourmetservice ###\n" + body
def parse_hemlingby():
    """Return the Hemlingby header followed by Friday's food from the PDF menu.

    Finds the first link whose text contains both "meny vecka" and the
    current ``week_number``, downloads the PDF it points to via
    ``parse_pdf`` and extracts Friday's dishes with ``getFoodFromPDFArray``.

    Returns:
        The header plus the dishes, or the header plus
        "Oops something went wrong" when no matching link exists.
    """
    ret = "### HEMLINGBY ###" + "\n"
    answer = requests.get('http://www.gavle.se/Uppleva--gora/Idrott-motion-och-friluftsliv/Friluftsliv-och-motion/Hemlingby-friluftsomrade/Hemlingbystugan/Fika-och-ata/')
    root = html.fromstring(answer.text)

    # week_number is the int 0 until parse_vecka() has run; str() keeps the
    # substring test below from raising TypeError in that case.
    week = str(week_number)

    hemlingby_link = None
    for child in root.xpath('//a'):
        text = (child.text or "").lower()
        href = child.get('href')
        if "meny vecka" in text and week in text and href:
            hemlingby_link = 'http://www.gavle.se' + href
            break

    # Without this guard a missing link surfaced as UnboundLocalError.
    if hemlingby_link is None:
        return ret + "Oops something went wrong"

    textAsArray = parse_pdf(hemlingby_link)
    ret += getFoodFromPDFArray(textAsArray)
    return ret
def parse_pdf(pdf_url):
    """Download a PDF and return its text grouped by page and text box.

    Args:
        pdf_url: URL of the PDF file to fetch.

    Returns:
        A list with one entry per page. Each page is a list of text boxes
        (only those containing non-whitespace text), and each text box is
        the list of lines obtained by splitting its text on newlines.
    """
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(pdf_url) as response:
        remote_file = response.read()
    memory_file = io.BytesIO(remote_file)

    parser = PDFParser(memory_file)
    doc = PDFDocument()
    parser.set_document(doc)
    # NOTE(review): the original author saw occasional warnings at this
    # step, possibly caused by errors in the PDF itself.
    doc.set_parser(parser)
    doc.initialize('')

    rsrcmgr = PDFResourceManager()
    laparams = LAParams()
    device = PDFPageAggregator(rsrcmgr, laparams=laparams)
    interpreter = PDFPageInterpreter(rsrcmgr, device)

    ret = []
    # Process each page contained in the document.
    for page in doc.get_pages():
        page_boxes = []
        interpreter.process_page(page)
        layout = device.get_result()
        for lt_obj in layout:
            if not isinstance(lt_obj, (LTTextBox, LTTextLine)):
                continue
            text = lt_obj.get_text()
            if text.strip():
                page_boxes.append(text.splitlines())
        ret.append(page_boxes)
    return ret
def getFoodFromPDFArray(pdfArray):
    """Extract Friday's dishes for the current week from parsed PDF text.

    Args:
        pdfArray: list of pages as produced by ``parse_pdf``; each page is a
            list of text boxes and each text box is a list of lines.

    Returns:
        The dishes, one per line and each terminated by a newline, or
        "Oops something went wrong" when the current week or its Friday
        entry cannot be located.
    """
    # str() tolerates week_number still being the int 0 (parse_vecka() not
    # run yet) instead of raising TypeError on concatenation.
    week_marker = "vecka " + str(week_number) + ":"
    failure = "Oops something went wrong"

    correctWeek = False
    for page in pdfArray:
        for idx, line in enumerate(page):
            if not line:
                continue
            heading = line[0]

            # Check if this text box starts the current week's section.
            if week_marker in heading:
                correctWeek = True
                continue
            if not (correctWeek and "fredag" in heading.lower()):
                continue

            if len(line) > 1:
                # The box holds 'fredag' followed by the food lines.
                dishes = line[1:]
            elif idx + 1 < len(page):
                # The box only held 'fredag'; the food is in the next box.
                dishes = page[idx + 1]
            else:
                # 'fredag' was the last box on the page: nothing to read.
                return failure
            return "".join(dish + "\n" for dish in dishes)
    return failure
def parse_gustafsbro():
    """Return the Gustafsbro header followed by Friday's dishes.

    Each weekday is a table on the page: the first row holds the day name
    and the second row a list of dishes. The first table whose day name
    contains "fredag" is used.

    Returns:
        The header plus one dish per line, or the header plus
        "Oops something went wrong" when no Friday table is found.
    """
    ret = "### Gustafsbro ###" + "\n"
    answer = requests.get('http://www.gavlelunch.se/gustafsbro.asp')
    root = html.fromstring(answer.text)

    # Remember the matching table itself. Breaking out of the day-name loop
    # alone does not stop the table loop, so the loop variable cannot be
    # trusted to still point at Friday's table afterwards.
    friday_table = None
    for weekdayTable in root.xpath('//body/font/table/tr[1]/td[1]/div/table'):
        day_names = weekdayTable.xpath('tr[1]/td[1]/font/strong')
        if any(day.text and "fredag" in day.text.lower() for day in day_names):
            friday_table = weekdayTable
            break

    if friday_table is None:
        return ret + "Oops something went wrong"

    for food in friday_table.xpath('tr[2]/td[1]/font/ul/li'):
        # An <li> that starts with a child element has text None.
        dish = (food.text or "").strip()
        if dish:
            ret += dish + "\n"
    return ret
def parse_sodersKalla():
    """Return the Söders källa header followed by Friday's food.

    Looks for a link labelled "lunchmeny v<week_number>", downloads the
    Word document it points to and returns the paragraph that follows the
    one mentioning "fredag".

    Returns:
        The header plus the food line. When only last week's menu link is
        present the header is followed by a notice that the menu has not
        been updated; when nothing usable is found it is followed by
        "Oops something went wrong".
    """
    ret = "### Söders källa ###" + "\n"
    answer = requests.get('http://www.soderskalla.se/restaurangen/')
    root = html.fromstring(answer.text)

    # str() tolerates week_number still being the int 0.
    current_label = "lunchmeny v" + str(week_number)
    stale_label = "lunchmeny v" + str(lastWeek)

    # Get url for menu
    url = ""
    stale_menu_seen = False
    for child in root.xpath('//a'):
        text = (child.text or "").lower()
        if current_label in text:
            url = child.get('href') or ""
            break
        # Check if menu has been updated from the week before
        if stale_label in text:
            stale_menu_seen = True

    # Without a link there is nothing to download; requesting an empty URL
    # would only raise inside requests.
    if not url:
        if stale_menu_seen:
            return ret + "Menyn har ännu inte blivit uppdaterad"
        return ret + "Oops something went wrong"

    # Fetch document
    answer = requests.get(url)
    memory_file = io.BytesIO(answer.content)
    paragraphs = Document(memory_file).paragraphs

    # Parse document and look for fredag, food is in the paragraph after it.
    # The final paragraph is skipped because nothing follows it.
    food = ""
    for idx, para in enumerate(paragraphs[:-1]):
        if "fredag" in para.text.lower():
            food = paragraphs[idx + 1].text + "\n"

    if food:
        ret += food
    else:
        ret += "Oops something went wrong"
    return ret
def parse_koket():
    """Return the Köket header followed by Friday's dishes.

    Scans the ``<p><span>`` elements in document order. Collection starts at
    the span mentioning "fredag" and stops at the first span after it that
    has no text.

    Returns:
        The header plus one dish per line, or the header plus
        "Oops something went wrong" when nothing was collected.
    """
    ret = "### Köket ###" + "\n"
    answer = requests.get('http://koketlunch.se/meny.html')
    root = html.fromstring(answer.text)

    friday_found = False
    food = ""
    for line in root.xpath('//p/span'):
        # A span that starts with a child element has text None; treat it
        # like an empty span instead of crashing on .strip().
        raw = line.text or ""

        # Get friday from table
        if "fredag" in raw.lower():
            friday_found = True
        if not friday_found:
            continue
        if not raw.strip():
            # First blank span after Friday ends the day's menu.
            break

        # Fix encodings once per span.
        text = utf8text(raw)
        lowered = text.lower()
        if "fredag" in lowered:
            # The "fredag" heading itself is not printed, to look more like
            # the other printouts.
            continue
        if "stängt" in lowered:
            food += text + "\n"
        else:
            # Remove the '-' in the beginning of the different foods.
            food += text[1:] + "\n"

    if food:
        ret += food
    else:
        ret += "Oops something went wrong"
    return ret
def parse_kryddan():
    """Return the Kryddan header followed by the lines listed after "fredag".

    The text content of the ``veckans`` div is split on newlines; every
    non-blank line after the first line mentioning "fredag" is collected.

    Returns:
        The header plus the collected lines, or the header plus
        "Oops something went wrong" when nothing was collected.
    """
    ret = "### Kryddan ###" + "\n"
    answer = requests.get('http://www.kryddan35.se/hem/')
    root = html.fromstring(answer.text)

    friday_found = False
    food = ""
    for child in root.xpath('//div[@id="veckans"]'):
        for line in child.text_content().split("\n"):
            if friday_found:
                # Test the line itself. Testing child.text (the div's
                # leading text) dropped every dish whenever the div began
                # with a child element.
                if line.strip():
                    food += line + "\n"
            elif "fredag" in line.lower():
                friday_found = True

    if food:
        ret += food
    else:
        ret += "Oops something went wrong"
    return ret
def get_json_encode():
    """Run every parser and return the results as a JSON object string.

    ``parse_vecka`` runs first so that ``week_number`` is refreshed before
    it is read; the restaurant parsers then run in the same order as their
    keys appear in the output.
    """
    parse_vecka()
    menus = {'vecka': week_number}
    menus['teknikparken'] = parse_teknikparken()
    menus['kompassen'] = parse_kompassen()
    menus['hemlingby'] = parse_hemlingby()
    menus['gs'] = parse_gs()
    menus['gustafsbro'] = parse_gustafsbro()
    menus['sodersKalla'] = parse_sodersKalla()
    menus['koket'] = parse_koket()
    menus['kryddan'] = parse_kryddan()
    return json.dumps(menus)
def main():
    """Refresh the week number, then print every restaurant's Friday menu.

    The output starts with a blank line and the sections are separated by a
    single newline.
    """
    parse_vecka()
    sections = [
        parse_teknikparken(),
        parse_kompassen(),
        parse_hemlingby(),
        parse_gs(),
        parse_gustafsbro(),
        parse_sodersKalla(),
        parse_koket(),
        parse_kryddan(),
    ]
    print("\n" + "\n".join(sections))


if __name__ == '__main__':
    main()