/
beerkeg.py
220 lines (169 loc) · 7.07 KB
/
beerkeg.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
import re
import webbrowser
from urlparse import urlparse
from utils import get_text, get_html, is_num, unique
class BeerKeg(object):
    ''' A single BevMo keg listing.

    Scrapes the listing page for name, price, volume, availability and
    description, looks up the beer's alcohol by volume (ABV) through a Bing
    search, and derives an alcohol-per-dollar ratio from those values.
    '''

    # Ceiling (in percent) used to validate a scraped ABV; we assume BevMo
    # does not offer kegs with an ABV this high.
    _MAX_ABV = 20.0

    # Matches either "ABV <non-digits> <number>%" or
    # "% <non-digits> <number> <non-digits> ABV" (lookarounds keep the words
    # themselves out of the match).
    # NOTE(review): the common "5.5% ABV" form (number before the % sign) is
    # not matched by either alternative -- consider adding it.
    _ABV_RE = re.compile(
        r'(?<=[Aa][Bb][Vv])[^\d]*(\d+[.]?\d*)(?=%)'
        r'|(?<=%)[^\d]*(\d+[.]?\d*)[^\d]*(?=[Aa][Bb][Vv])')

    # A number with or without a decimal point.
    _NUMBER_RE = re.compile(r'(\d+[.]?\d*)')

    def __init__(self, url, num_attempts, verbose=False):
        ''' url must be a string containing the url for a single BevMo keg

        num_attempts is the maximum number of search results visited when
        looking for the ABV; verbose turns progress printing on or off.
        '''
        self.url = url
        # Turn printing on or off
        self.verbose = verbose
        # Prevent parsing more than once
        self.parsed = False
        # The ratio computed by get_ratio(), None until then
        self.ratio = None
        # Number of attempts to find ABV
        self.num_attempts = num_attempts

    def open(self):
        ''' Opens the keg's url with the webbrowser module '''
        webbrowser.open(self.url)

    def parse(self):
        ''' Retrieves the page and parses the contents into the following fields

        self.name       (May include brewery/brand and/or beer)
        self.price      (USD)
        self.volume     (Gallons)
        self.num_avail  (Kegs)
        self.desc       (Keg description)

        Each field falls back to an empty/zero value when it cannot be
        extracted. Does nothing if the page was already parsed.
        '''
        if self.parsed:
            return

        html = get_html(self.url)

        # The page object is opaque here, so every extraction is best-effort:
        # any failure falls back to a neutral default instead of raising.

        # Attempt to get name and volume
        try:
            title = html.xpath('//h1/text()')[0].strip()
            self.name, self.volume = self._split_title(title)
        except Exception:
            self.name = ''
            self.volume = 0.0

        # Attempt to get price (thousands separators are dropped first)
        try:
            price_text = html.xpath(
                '//span[@class="ProductDetailItemPrice"]/text()')[0]
            self.price = float(price_text.strip().strip('$').replace(',', ''))
        except Exception:
            self.price = 0.0

        # Attempt to get number of available kegs
        try:
            self.num_avail = int(
                html.xpath('//em/text()')[0].strip().split()[0])
        except Exception:
            self.num_avail = 0

        # Attempt to get description
        try:
            self.desc = html.xpath(
                '//td[@class="ProductDetailCell"]/p/text()')[0].strip()
        except Exception:
            self.desc = ''

        # Only mark as parsed once the fields exist, so a failed fetch above
        # does not leave the object flagged as parsed with nothing set.
        self.parsed = True

    @staticmethod
    def _split_title(title):
        ''' Splits a page title into (name, volume).

        The volume is read from the first parenthesised part of the title by
        keeping only its digits and dots; it is 0.0 when there are no
        parentheses or the remaining text is not a number.
        '''
        if '(' not in title or ')' not in title:
            return title, 0.0
        pieces = title.split('(')
        size_text = pieces[1].strip(')').strip()
        # Built with join so the result is a string on Python 2 and 3 alike.
        digits = ''.join(ch for ch in size_text if ch == '.' or is_num(ch))
        volume = float(digits) if is_num(digits) else 0.0
        return pieces[0].strip(), volume

    def get_abv(self):
        ''' Attempts to find percentage of alcohol by volume using Bing

        Visits up to self.num_attempts search results, at most one per
        domain, and scans each page's text for an ABV figure.

        Returns the ABV as a float, or None when none was found:
          * a value below half the ceiling is returned immediately;
          * a higher value (still below the ceiling) is held until another
            page is read, and the lower of the two is returned;
          * a held value is returned when a later page has no usable ABV or
            when the results run out;
          * values at or above the ceiling are ignored.
        '''
        if not self.parsed:
            self.parse()

        # Without a name the query would be just "alcohol content" and any
        # figure found would be unrelated to this keg.
        if not self.name:
            if self.verbose:
                print('ABV not found for {}'.format(self.name))
            return None

        search_url = 'https://www.bing.com/search?q={0}+alcohol+content'.format(
            '+'.join(self.name.split()))
        results = self._result_links(get_html(search_url).xpath('//a/@href'))

        # Max number of links to search for alcohol by volume (ABV)
        num_attempts = self.num_attempts

        # Keep only the first link per domain to improve chances of matching
        searched_domains = set()
        top_results = []
        for link in results:
            if len(top_results) >= num_attempts:
                break
            domain = self._domain_key(urlparse(link).netloc)
            if domain in searched_domains:
                continue
            searched_domains.add(domain)
            top_results.append(link)

        max_abv = self._MAX_ABV
        found_abv = None
        for link in top_results:
            if self.verbose:
                print('Searching {}'.format(link))
            try:
                search_text = ''.join(get_text(get_html(link)))
            except Exception:
                # Unreachable or unparsable page: move onto the next link
                continue

            abv = self._extract_abv(search_text)

            # No figure (or a zero one) on this page: return the previously
            # found ABV if any, otherwise move onto the next link
            if abv is None or abv == 0.0:
                if found_abv is not None:
                    return self._report_abv(found_abv)
                continue

            # Implausibly high value, ignore this page
            if abv >= max_abv:
                continue

            if abv < max_abv / 2:
                return self._report_abv(abv)

            # Second high-ish value: keep whichever is lower
            if found_abv is not None:
                return self._report_abv(min(abv, found_abv))

            # First high-ish value: hold it until another page is read
            found_abv = abv

        # Results ran out while a value was still being held
        if found_abv is not None:
            return self._report_abv(found_abv)

        # No ABV was found by this point
        if self.verbose:
            print('ABV not found for {}'.format(self.name))
        return None

    @staticmethod
    def _result_links(search_links):
        ''' Returns the hrefs that follow the 'javascript:' link, without
        '#' anchors and links containing 'site:'.

        When the 'javascript:' marker is absent the whole list is filtered
        instead of raising ValueError.
        '''
        try:
            start = search_links.index('javascript:') + 1
        except ValueError:
            start = 0
        return [x for x in search_links[start:]
                if x != '#' and 'site:' not in x]

    @staticmethod
    def _domain_key(netloc):
        ''' Reduces a netloc to the label used to detect repeated domains.

        'www.example.com' -> 'example', 'example.com' -> 'example'; a netloc
        without dots is returned unchanged.
        '''
        if '.' not in netloc:
            return netloc
        labels = netloc.split('.')
        return labels[1] if netloc.count('.') > 1 else labels[0]

    @staticmethod
    def _extract_abv(text):
        ''' Returns the first ABV figure found in text as a float, or None '''
        match = BeerKeg._ABV_RE.search(text)
        if not match:
            return None
        # Filters the matched span for a number with or without a decimal pt
        return float(BeerKeg._NUMBER_RE.search(match.group()).group())

    def _report_abv(self, abv):
        ''' Prints abv when verbose is on and returns it unchanged '''
        if self.verbose:
            print('ABV for {} is {}'.format(self.name, abv))
        return abv

    def get_ratio(self):
        ''' Returns (ABV percent * 0.1 * volume in gallons) / price in USD

        The result is also stored in self.ratio. Returns None when the ABV is
        unknown or the ratio cannot be computed (e.g. the price is 0).
        '''
        alcohol_pct = self.get_abv()
        if alcohol_pct is None:
            return None

        # NOTE(review): a percentage becomes a fraction with 0.01, so this is
        # ten times the gallons of alcohol per dollar. The factor is left
        # unchanged so existing ratios stay comparable -- confirm intent.
        try:
            ratio = (alcohol_pct * .1 * self.volume) / self.price
        except (AttributeError, TypeError, ZeroDivisionError):
            return None

        if self.verbose:
            print('\tRatio: {}'.format(str(ratio)))
        self.ratio = ratio
        return ratio