/
nosh_parse.py
165 lines (127 loc) · 3.92 KB
/
nosh_parse.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
#this file is a command line script.
#it goes to a url in nosh and pulls that restaurant's menu list
# 1. The script takes a restaurant url on the command line
# 2. fetches the html from the page
# 3. parses just the menu item information out
# 4. writes menu item information to a .csv file.
# Put the menu item name as the first value in the csv row,
# and the description (if it exists), as the second value.
# find and replace function for symbols
def findAndReplace(string):
    """Return *string* with symbol substitutions applied.

    Replaces '&' with 'W' and the apostrophe "'" with 'Q'.

    Bug fixes vs. the original:
      * str.replace returns a NEW string; the original discarded both
        results, making the function a no-op.
      * The apostrophe literal was written as ''' (a syntax error); it
        must be quoted as "'".
      * The bare `return` returned None; the transformed string is now
        returned so callers can actually use it.
    """
    string = string.replace('&', 'W')
    string = string.replace("'", 'Q')
    return string
# 1. Get the url from the command line.
# Use OptionParser from optparse
from optparse import OptionParser
parser = OptionParser()
# There are no options for this program.
# There is one positional argument: the url of the restaurant in nosh.
# example of url is http://www.nosh.com/restaurant/2630123
(options, args) = parser.parse_args()
######
# Error handling: exactly one argument (the restaurant url) is required.
# The original tested `< 1 or > 1` (i.e. != 1) but reported only
# "To many arguments", which was both misspelled and wrong for the
# zero-argument case.
if len(args) != 1:
    import sys
    sys.exit("Exactly one argument (a restaurant url) is required. \n\
An example of the command line should look like: \n\
python nosh_parse.py http://www.nosh.com/restaurant/2630123")
######
# get url
url = args[0]
# 2. Fetch the raw HTML for the restaurant page.
# urllib.urlopen performs the HTTP GET (Python 2 stdlib).
import urllib
page = urllib.urlopen(url)
# Read the entire response body into memory, then release the connection.
html_contents = page.read()
page.close()
# 3. Parse the html!
# BeautifulSoup 3 import style (module name differs from bs4).
from BeautifulSoup import BeautifulSoup
# regular expressions are used to match the id/href patterns below
import re
soup = BeautifulSoup(''.join(html_contents))
# Each menu item lives in a <div> whose id starts with "item-".
items = soup.findChildren('div', id=re.compile("^item-"))
# Parallel lists: names[i] pairs with reviews[i] when writing the CSV.
names = []
reviews = []
for my_table in items:
    # The item name sits inside a div of class "ow-check-in-mi",
    # in an <a> linking to /menuitem/...
    name_row = my_table.findNext('div', attrs={"class": 'ow-check-in-mi'})
    if not name_row:
        # No name div: skip the item entirely (matches the original
        # control flow, which appended to neither list in this case).
        continue
    anchor = name_row.findNext('a', href=re.compile("^/menuitem/"))
    names.append(anchor.string)  # anchor has a single text child
    # The review (description) is an optional sibling div.
    review_div = name_row.findNextSibling('div',
                                          attrs={"class": 'ow-check-in-review'})
    if review_div:
        span = my_table.findNext('span')
        if span:
            reviews.append(span.string)
        else:
            # BUG FIX: the original did `review.append('')`, which tried
            # to append to the (None/falsy) soup node instead of adding
            # an empty entry to the reviews list — breaking the
            # names/reviews pairing. Append to the list instead.
            reviews.append('')
    else:
        # No review for this menu item: keep the lists aligned.
        reviews.append('')
# len(names) and len(reviews) should be the same (built in lockstep above).
# 4. Write to a csv file.
import csv
# Name the file after the restaurant number at the end of the url,
# e.g. http://www.nosh.com/restaurant/2630123 -> 2630123.csv
# (The original sliced url[31:], a hard-coded offset that breaks for
# https, trailing slashes, or any other prefix; taking the final path
# segment yields the same value for the documented URL form.)
filename = url.rstrip('/').rsplit('/', 1)[-1] + '.csv'
# 'wb' is the correct csv mode on Python 2 (avoids extra blank rows
# on Windows).
with open(filename, 'wb') as csvfile:
    writer = csv.writer(csvfile, delimiter=',', quotechar='"')
    # One row per menu item: name first, then its review/description.
    for name, review in zip(names, reviews):
        writer.writerow([name, review])
# done!