# restaurant_getter.py
import re
import csv
from fetch import fetch
from BeautifulSoup import BeautifulSoup
def ParseRestaurant(rest_page_url):
    """Scrape one UrbanSpoon restaurant page into a flat dict.

    Pulls the name, street address, suburb, postcode, vote count and
    percentage rating out of the page. The text fields are encoded from
    unicode to UTF-8 so they can be written to CSV afterwards. The HTML
    class names and ids used for the lookups may change on the site.

    Returns a dict with the keys "name", "street", "suburb", "postcode",
    "number_voted" and "rating". An empty dict is returned when any lookup
    other than the rating raises IndexError (e.g. an expected element is
    not on the page).
    """
    page = fetch(rest_page_url)
    try:
        # presumably page[1] is the page markup -- confirm against fetch()
        soup = BeautifulSoup(page[1])

        def first_utf8(tag, css_class):
            # Text of the first matching element as UTF-8; raises
            # IndexError when nothing matches.
            return soup.findAll(tag, css_class)[0].text.encode("utf8")

        name = first_utf8('h1', 'page-title fn org')
        print("looking at %s" % name)

        # Address details
        street_addr = first_utf8('span', 'street-address')
        suburb = first_utf8('span', 'locality')
        postcode = first_utf8('a', 'quiet-link postal-code')

        # Vote count: the first space-separated token of the votes div.
        # A token that is not an integer counts as zero votes.
        try:
            votes_text = soup.findAll('div', id='num-votes')[0].text
            num_voted = int(votes_text.split(" ")[0])
        except ValueError:
            num_voted = 0

        # Rating: an unrated restaurant has no span of that name, and a
        # present span may still hold something that is not a number.
        try:
            rating_text = soup.findAll(
                'span', 'percent-text rating average')[0].text
            rating_as_num = int(rating_text[:-1])  # drop the trailing % sign
        except (ValueError, IndexError):
            rating_as_num = 0

        return {
            "name": name,
            "street": street_addr,
            "suburb": suburb,
            "postcode": postcode,
            "number_voted": num_voted,
            "rating": rating_as_num,
        }
    except IndexError:
        return {}
# A "lat, lon" pair of signed decimals as it appears in the map script.
# The dots are escaped so they match only a literal decimal point; an
# unescaped "." would also accept text such as "1234, 5678".
_LAT_LON_RE = re.compile(r"-?[0-9]{1,2}\.[0-9]+, -?[0-9]{1,3}\.[0-9]+")


def ReadRestaurants(url):
    """Gets the details for the restaurants on an UrbanSpoon listing page.

    Every restaurant linked from the listing table is scraped with
    ParseRestaurant, then the lat/lon pairs found in the page's embedded
    google map script are attached to those dicts by position.

    Args:
        url: address of the listing page, handed straight to fetch().

    Returns:
        A list with one dict per restaurant link, in page order. Dicts that
        have a coordinate pair at the same position gain float "lat" and
        "lon" entries. Pairing stops at the shorter of the two lists.

    Raises:
        IndexError: if the page has no table with id "r-t", a restaurant
            entry has no link, or the page has fewer than four script tags.
    """
    page = fetch(url)
    soup = BeautifulSoup(page[1])

    ## Get the details for the restaurants on this table by going into their page
    rest_table = soup.findAll("table", id="r-t")[0]  # r-t is the id of the restaurants
    rests = rest_table.findAll("div", "t")  # class "t" holds the restaurant entries
    # valid entries have a link in the href
    links_to_rests = [r.findAll("a", href=True)[0] for r in rests]

    ## go get the restaurants
    rest_dicts = [ParseRestaurant(x['href']) for x in links_to_rests]

    ## Get the lat/lons out of the map on the link page
    scripts = soup.findAll("script")
    map_script = scripts[-4]  ## No idea if it will always be the 4th-last one

    # The pairs are assumed to be listed in the same order as the
    # restaurant entries, so they can be matched up by position.
    rest_lat_lons = [pair.split(", ")
                     for pair in _LAT_LON_RE.findall(map_script.text)]
    for restaurant, location in zip(rest_dicts, rest_lat_lons):
        restaurant["lat"] = float(location[0])
        restaurant["lon"] = float(location[1])
    return rest_dicts
def FillSubsequentPages(base):
    """Collects the restaurants from a listing page and all pages after it.

    Starting at `base`, reads the restaurants on the page, then follows the
    first link whose text contains "next page" until a page has no such
    link. Returns the restaurant dicts of every visited page as one list,
    in the order the pages were visited.
    """
    gathered = []
    current_url = base
    while True:
        ## Restaurants on the current page
        gathered = gathered + ReadRestaurants(current_url)

        ## Look for a link to a following page
        listing = BeautifulSoup(fetch(current_url)[1])
        next_link = None
        for anchor in listing.findAll('a'):
            if "next page" in anchor.text:
                next_link = anchor
                break

        if not next_link:
            return gathered

        ## The href is site-relative, so prefix the host to get a full URL
        current_url = "http://www.urbanspoon.com" + next_link['href']
def WriteToCSV(dictlist,filename):
    """Writes the restaurant dict list to a CSV file with a header row.

    Note the hardcoded field order: columns are always written in the
    order given by `fields` below.

    Args:
        dictlist: iterable of dicts keyed by column name. A missing key is
            written as an empty cell; a key that is not one of the fields
            makes csv.DictWriter raise ValueError.
        filename: path of the file to create (an existing file is
            overwritten).

    Returns:
        None.
    """
    import sys

    # fields in desired order
    fields = ["name","rating","number_voted","street","suburb","postcode","lat","lon"]

    # The csv module expects a binary file on Python 2, but a text file
    # opened with newline="" on Python 3.
    if sys.version_info[0] < 3:
        outf = open(filename, "wb")
    else:
        outf = open(filename, "w", newline="")

    # The with block closes the file even if a row fails to write.
    with outf:
        dw = csv.DictWriter(outf, fields)
        dw.writeheader()
        dw.writerows(dictlist)
    return None
# Script body: scrape the listing that starts at this URL, following its
# "next page" links, and dump every restaurant found to a CSV file.
restaurant_list = (
    "http://www.urbanspoon.com/n/71/47146/Melbourne/Northcote-restaurants"
)
testrests = FillSubsequentPages(restaurant_list)
WriteToCSV(testrests, "NorthernSuburbs.csv")