def _populate(self):
    """Fetch the brewery page at ``self.url`` and populate this object.

    Sets ``name``, ``type``, ``web`` (only when the page carries a
    ``Web: `` label), ``telephone``, ``street``, ``city``, ``state``,
    ``country`` and ``postal_code``, then flags the object as fetched.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.PageNotFound: If the page lacks the expected
            ``div#container > table > tr > td`` structure.
    """
    soup = soup_helper._get_soup(self.url)
    try:
        first_row = soup.find(
            'div', id='container').find('table').find_all('tr')[0]
        # The address spans are looked up inside the first cell.
        address_cell = first_row.find_all('td')[0]
    except (AttributeError, IndexError):
        # AttributeError: a find() returned None (container/table missing).
        # IndexError: the table has no rows, or the first row has no cells.
        raise rb_exceptions.PageNotFound(self.url)

    self.name = soup.h1.text
    self.type = re.findall(
        r'Type: (.*?)<br\/>', soup.decode_contents())[0].strip()

    # Search once and reuse the result instead of scanning the tree twice.
    web_labels = soup.find_all(string='Web: ')
    if web_labels:
        self.web = web_labels[0].find_next()['href']

    self.telephone = Brewery._find_span(address_cell, 'telephone')
    self.street = Brewery._find_span(address_cell, 'streetAddress')
    self.city = Brewery._find_span(address_cell, 'addressLocality')
    self.state = Brewery._find_span(address_cell, 'addressRegion')
    self.country = Brewery._find_span(address_cell, 'addressCountry')
    self.postal_code = Brewery._find_span(address_cell, 'postalCode')

    self._has_fetched = True
    return self
def _populate(self):
    """Fetch the brewery page at ``self.url`` and populate this object.

    Sets ``name``, ``type``, ``web`` (only when the page has a link inside
    a ``media-links`` div), ``telephone``, ``street``, ``city``, ``state``,
    ``country`` and ``postal_code``, then flags the object as fetched.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.PageNotFound: If the page contains no
            schema.org ``LocalBusiness`` div.
    """
    soup = soup_helper._get_soup(self.url)
    s_contents = soup.find_all(
        'div', {'itemtype': 'http://schema.org/LocalBusiness'})
    if not s_contents:
        raise rb_exceptions.PageNotFound(self.url)
    business = s_contents[0]

    self.name = soup.h1.text
    self.type = business.find_all('div')[1].text.strip()

    # find() yields None when nothing matches, so a brewery without a
    # media-links section or without any link is skipped cleanly. The
    # previous find_all(...)[0] chain raised IndexError before its
    # ``if website`` guard could ever run.
    media_links = business.find('div', {'class': 'media-links'})
    website = media_links.find('a') if media_links is not None else None
    if website is not None:
        self.web = website['href']

    self.telephone = Brewery._find_span(business, 'telephone')
    self.street = Brewery._find_span(business, 'streetAddress')
    self.city = Brewery._find_span(business, 'addressLocality')
    self.state = Brewery._find_span(business, 'addressRegion')
    self.country = Brewery._find_span(business, 'addressCountry')
    self.postal_code = Brewery._find_span(business, 'postalCode')

    self._has_fetched = True
    return self
def _get_soup(url):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        url: Either a path relative to ``_BASE_URL`` or a URL that already
            contains ``_BASE_URL``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    Raises:
        rb_exceptions.PageNotFound: If the response text contains
            "ratebeer robot oops" (case-insensitive).
    """
    path = url
    if _BASE_URL in path:
        # str.replace returns a new string; the result must be kept,
        # otherwise the base URL ends up prepended twice below.
        path = path.replace(_BASE_URL, "")
    req = requests.get(_BASE_URL + path, allow_redirects=True)
    if "ratebeer robot oops" in req.text.lower():
        # Report the URL exactly as the caller supplied it.
        raise rb_exceptions.PageNotFound(url)
    return BeautifulSoup(req.text, "lxml")
def _get_soup(url):
    """Download a page and return it parsed as a BeautifulSoup tree.

    Args:
        url: Either a path relative to ``_BASE_URL`` or a URL that already
            contains ``_BASE_URL``.

    Returns:
        A ``BeautifulSoup`` object built from the response text with the
        ``lxml`` parser.

    Raises:
        rb_exceptions.PageNotFound: If the response text contains
            "ratebeer robot oops" (case-insensitive).
    """
    path = url
    if _BASE_URL in path:
        # str.replace returns a new string; the result must be kept,
        # otherwise the base URL ends up prepended twice below.
        path = path.replace(_BASE_URL, '')
    req = requests.get(_BASE_URL + path, allow_redirects=True)

    # When the page declares UTF-8 in its meta tag, set that encoding on
    # the response before the text is read again below.
    utf8_meta = ('<meta http-equiv="Content-Type" content="text/html;" '
                 'charset="utf-8">')
    if utf8_meta in req.text:
        req.encoding = 'utf-8'

    if "ratebeer robot oops" in req.text.lower():
        # Report the URL exactly as the caller supplied it.
        raise rb_exceptions.PageNotFound(url)
    return BeautifulSoup(req.text, "lxml")
def _populate(self):
    """Scrape the beer page at ``self.url`` and populate this object.

    Sets ``name``, ``brewery``, ``brewed_at``, ``overall_rating``,
    ``style_rating``, ``style``, ``style_url``, ``img_url``,
    ``num_ratings``, ``mean_rating``, ``weighted_avg``, ``seasonal``,
    ``ibu``, ``calories``, ``abv``, ``retired``, ``tags`` and, when the
    page has a commercial description, ``description``. Numeric fields
    that are missing from the page or hold a non-numeric placeholder are
    set to ``None``.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.PageNotFound: If the page lacks the expected
            ``div#container > table`` structure.
    """

    def _number(cast, extract):
        # Apply *cast* to the value produced by *extract*. Returns None
        # when the element is absent (AttributeError from chaining on
        # None) or its text is not numeric (ValueError), so every numeric
        # field degrades the same way.
        try:
            return cast(extract())
        except (ValueError, AttributeError):
            return None

    soup = soup_helper._get_soup(self.url)

    # check for 404s: a real beer page always has this table
    try:
        soup.find('div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)
    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    # if "beer reference" in soup_rows[0].find_all('td')[1].h1.contents:
    #     raise rb_exceptions.PageNotFound(self.url)
    # if "Also known as " in soup_rows[1].find_all('td')[1].div.div.contents:
    #     raise rb_exceptions.AliasedBeer(self.url, soup_rows[1].find_all('td')[1].div.div.a['href'])

    # General information from the top of the page
    self.name = soup.find(itemprop='name').text.strip()
    breweries = soup.find_all('a', href=re.compile('brewers'))
    self.brewery = Brewery(breweries[1].get('href'))
    self.brewery.name = breweries[1].text
    if len(breweries) == 3:
        self.brewed_at = Brewery(breweries[2].get('href'))
        self.brewed_at.name = breweries[2].text
    else:
        self.brewed_at = None

    self.overall_rating = _number(
        int,
        lambda: soup.find(
            'span', text='overall').next_sibling.next_sibling.text)
    self.style_rating = _number(
        int,
        lambda: soup.find(
            'span', text='style').previous_sibling.previous_sibling)

    style_link = soup.find(text='Style: ').next_sibling
    self.style = style_link.text
    self.style_url = style_link.get('href')
    self.img_url = soup.find(id="beerImg").get('src')

    # Data from the info bar
    self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
    self.mean_rating = _number(
        float,
        lambda: soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
    self.weighted_avg = _number(
        float,
        lambda: soup.find(attrs={"name": "real average"}).find(
            'span', itemprop="ratingValue").text)
    try:
        self.seasonal = soup.find(
            text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
    except AttributeError:
        self.seasonal = None
    self.ibu = _number(
        int,
        lambda: soup.find(
            title="International Bittering Units - Normally from hops"
        ).next_sibling.next_sibling.text)
    self.calories = _number(
        int,
        lambda: soup.find(
            title="Estimated calories for a 12 fluid ounce serving"
        ).next_sibling.next_sibling.text)
    # The trailing character of the ABV text is dropped before parsing.
    self.abv = _number(
        float,
        lambda: soup.find(
            title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
    self.retired = soup.find(
        title="Currently out of production") is not None

    # Description
    description = soup.find('div', 'commercial-description-container')
    if 'no commercial description' not in description.text.lower():
        # strip ads
        for ad in description('small'):
            ad.extract()
        self.description = ' '.join(description.strings).strip()

    self.tags = [
        t.text[1:] for t in soup.find_all('span', class_="tagLink")
    ]

    self._has_fetched = True
    return self
def _populate(self):
    """Scrape the table-based beer page at ``self.url`` into this object.

    Reads the info bar (ratings count, mean, weighted average, seasonal,
    calories, ABV, IBU), the brewery links, the ratings, the style, the
    brewery country, the commercial description and the beer name, and
    stores each one as an attribute on ``self``.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.PageNotFound: If the expected table is missing or
            the page is the "beer reference" placeholder.
        rb_exceptions.AliasedBeer: If the page says the beer is
            "Also known as" another beer.
    """
    soup = soup_helper._get_soup(self.url)

    # check for 404s
    try:
        soup_rows = soup.find(
            'div', id='container').find('table').find_all('tr')
    except AttributeError:
        raise rb_exceptions.PageNotFound(self.url)

    # ratebeer pages don't actually 404, they just send you to this weird
    # "beer reference" page but the url doesn't actually change, it just
    # seems like it's all getting done server side -- so we have to look
    # for the contents h1 to see if we're looking at the beer reference or
    # not
    header_cell = soup_rows[0].find_all('td')[1]
    if "beer reference" in header_cell.h1.contents:
        raise rb_exceptions.PageNotFound(self.url)
    body_cell = soup_rows[1].find_all('td')[1]
    if "Also known as " in body_cell.div.div.contents:
        raise rb_exceptions.AliasedBeer(
            self.url, body_cell.div.div.a['href'])

    # get beer meta information: the info bar is a run of "KEY: value"
    # segments separated by two non-breaking spaces
    brew_info_html = body_cell.div.small
    keyword_lookup = {
        "RATINGS": "num_ratings",
        "MEAN": "mean_rating",
        "WEIGHTED AVG": "weighted_avg",
        "SEASONAL": "seasonal",
        "CALORIES": "calories",
        "EST. CALORIES": "calories",
        "ABV": "abv",
        "IBU": "ibu",
    }
    for segment in brew_info_html.text.split(u'\xa0\xa0'):
        # partition() tolerates segments with no ': ' (skipped) and values
        # that themselves contain ': ' (kept whole).
        meta_name, separator, meta_data = segment.partition(': ')
        if not separator:
            continue
        match = keyword_lookup.get(meta_name.strip())
        if not match:
            continue
        if match == "mean_rating":
            # Keep only the part before the "/" (the text looks like
            # "3.5/5"); split() leaves the value intact if no "/" exists.
            meta_data = meta_data.split("/")[0]
        elif match == "abv":
            # Drop the trailing character of the ABV text.
            meta_data = meta_data[:-1]
        # convert to a number if possible, otherwise keep the raw text
        try:
            if match == "num_ratings":
                meta_data = int(meta_data)
            else:
                meta_data = float(meta_data)
        except ValueError:
            pass
        setattr(self, match, meta_data)

    info = soup_rows[1].tr.find_all('td')

    # get basic brewery information
    brewery_info = info[1].find('div').contents
    brewery_urls = brewery_info[0].find_all('a')
    brewery = brewery_urls[0]
    brewed_at = brewery_urls[1] if len(brewery_urls) == 2 else None
    if brewery:
        self.brewery = brewery.text.strip()
        self.brewery_url = brewery.get('href')
    if brewed_at:
        self.brewed_at = brewed_at.text.strip()
        self.brewed_at_url = brewed_at.get('href')

    # get ratings
    ratings = info[0].find_all('div')
    if len(ratings) > 3:
        self.overall_rating = ratings[1].contents[2]
        self.style_rating = ratings[3].contents[0]

    # get the beer style
    if brewery_info[3]:
        self.style = brewery_info[3].text.strip()

    # get the beer country
    if ',' in brewery_info[5]:
        # Non-USA addresses
        self.brewery_country = brewery_info[5].split(',')[1].strip()
    else:
        # USA addresses
        self.brewery_country = brewery_info[8].strip()

    # get the beer description
    description = body_cell.find(
        'div',
        style=('border: 1px solid #e0e0e0; background: #fff; '
               'padding: 14px; color: #777;'))
    if 'no commercial description' not in description.text.lower():
        # strip ads
        for ad in description('small'):
            ad.extract()
        self.description = ' '.join(description.strings).strip()

    # get name
    self.name = header_cell.h1.text.strip()

    self._has_fetched = True
    return self
def _populate(self):
    """Query the RateBeer GraphQL endpoint and populate this object.

    Posts three operations in one batch (``beer``, ``beerByAlias`` and
    ``tagDisplay``) and copies the results onto ``self``. When the alias
    lookup returns a beer, this object is re-pointed at that beer (name,
    id and url) and populated again from it.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.JSONParseException: If the response body is not
            valid JSON.
        rb_exceptions.PageNotFound: If the ``beer`` operation returns no
            data for ``self.id``.
    """
    if not self.id:
        # URLs look like ".../<name>/<id>/", so the id is second to last.
        self.id = self.url.split('/')[-2]

    beer_query = "query beer($beerId: ID!) { \n info: beer(id: $beerId) { \n id \n name \n description \n style { \n id \n name \n glasses { \n id \n name \n __typename \n } \n __typename \n } \n styleScore \n overallScore \n averageRating \n abv \n ibu \n calories \n brewer { \n id \n name \n __typename \n } \n ratingCount \n isRetired \n isUnrateable \n seasonal \n labels \n availability { \n bottle \n tap \n distribution \n __typename \n } \n __typename \n } \n} \n"
    alias_query = "query beerByAlias($aliasId: ID!) {\n beerByAlias(aliasId: $aliasId) {\n id\n name \n overallScore \n __typename \n } \n } \n"
    tag_query = "query tagDisplay($beerId: ID!, $first: Int) { \n tagDisplayArr: beerTags(beerId: $beerId, first: $first) { \n items { \n id \n urlName: plain \n __typename \n } \n __typename \n } \n} \n"

    # The order of this list fixes the indices used on ``results`` below.
    # (A "beerReviews" operation used to be sent as well; it is disabled.)
    data = [
        {
            "operationName": "beer",
            "variables": {"beerId": self.id},
            "query": beer_query,
        },
        {
            "operationName": "beerByAlias",
            "variables": {"aliasId": self.id},
            "query": alias_query,
        },
        {
            "operationName": "tagDisplay",
            "variables": {"beerId": self.id},
            "query": tag_query,
        },
    ]

    request = requests.post(
        "https://beta.ratebeer.com/v1/api/graphql/",
        data=json.dumps(data),
        headers={"content-type": "application/json"},
    )
    try:
        results = json.loads(request.text)
    except ValueError:
        # json's decode errors are ValueErrors; a bare ``except`` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        raise rb_exceptions.JSONParseException(self.id)

    beer_data = results[0]['data']['info']
    if beer_data is None:
        raise rb_exceptions.PageNotFound(self.id)

    alias_data = results[1]['data']['beerByAlias']
    if alias_data is not None:
        # Resolve the alias: we have all the information. We decide to just
        # return the alias and not the original beer from the query.
        self.name = alias_data['name']
        self.id = alias_data['id']
        self.url = '/beer/{0}/{1}/'.format(
            self.name.replace(' ', '-').lower(), self.id)
        return self._populate()

    tag_data = results[2]['data']['tagDisplayArr']['items']
    brewer = beer_data['brewer']

    self.name = beer_data['name']
    self.brewery = Brewery('/brewers/{0}/{1}/'.format(
        re.sub('[/ ]', '-', brewer['name'].lower()), brewer['id']))
    self.brewery.name = brewer['name']
    self.brewed_at = None  # no longer supported
    self.overall_rating = self._format(beer_data['overallScore'])
    self.style_rating = self._format(beer_data['styleScore'])
    self.style = beer_data['style']['name']
    self.style_url = "/beerstyles/{0}/{1}/".format(
        re.sub('/', '-', self.style.lower()), beer_data['style']['id'])
    self.img_url = "https://res.cloudinary.com/ratebeer/image/upload/w_152,h_309,c_pad,d_beer_img_default.png,f_auto/beer_{0}".format(self.id)
    self.num_ratings = self._format(beer_data['ratingCount'])
    self.mean_rating = self._format(beer_data['averageRating'])
    self.weighted_avg = None  # does not appear to exist anymore
    if beer_data['seasonal'] != 'UNKNOWN':
        self.seasonal = beer_data['seasonal']
    else:
        self.seasonal = None
    self.ibu = self._format(beer_data['ibu'])
    self.calories = self._format(beer_data['calories'])
    self.abv = self._format(beer_data['abv'])
    self.retired = beer_data['isRetired']
    if beer_data['description']:
        # Replace the \x92 character with a plain apostrophe.
        self.description = re.sub(r'\x92', '\'', beer_data['description'])
    else:
        self.description = None
    if tag_data:
        self.tags = [t['urlName'] for t in tag_data]
    else:
        self.tags = None

    self._has_fetched = True
    return self
def _populate(self):
    """Query the RateBeer GraphQL endpoint and populate this object.

    Posts three operations in one batch (``beer``, ``beerByAlias`` and
    ``tagDisplay``) and copies the results onto ``self``, including the
    brewer's name and country.

    Returns:
        This object (``self``), with its attributes filled in.

    Raises:
        rb_exceptions.JSONParseException: If the response body is not
            valid JSON.
        rb_exceptions.PageNotFound: If the ``beer`` operation returns no
            data for ``self.id``.
        rb_exceptions.AliasedBeer: If the alias lookup returns a beer;
            raised with ``self.id`` and the id of that beer.
    """
    if not self.id:
        # URLs look like ".../<name>/<id>/", so the id is second to last.
        self.id = self.url.split("/")[-2]

    beer_query = "query beer($beerId: ID!) { \n info: beer(id: $beerId) { \n id \n name \n description \n style { \n id \n name \n glasses { \n id \n name \n __typename \n } \n __typename \n } \n styleScore \n overallScore \n averageRating \n abv \n ibu \n calories \n brewer { \n id \n name \n country \n { \n code \n name \n __typename \n } \n __typename \n } \n ratingCount \n isRetired \n isUnrateable \n seasonal \n labels \n availability { \n bottle \n tap \n distribution \n __typename \n } \n __typename \n } \n} \n"
    alias_query = "query beerByAlias($aliasId: ID!) {\n beerByAlias(aliasId: $aliasId) {\n id\n name \n overallScore \n __typename \n } \n } \n"
    tag_query = "query tagDisplay($beerId: ID!, $first: Int) { \n tagDisplayArr: beerTags(beerId: $beerId, first: $first) { \n items { \n id \n urlName: plain \n __typename \n } \n __typename \n } \n} \n"

    # The order of this list fixes the indices used on ``results`` below.
    # (A "beerReviews" operation used to be sent as well; it is disabled.)
    data = [
        {
            "operationName": "beer",
            "variables": {"beerId": self.id},
            "query": beer_query,
        },
        {
            "operationName": "beerByAlias",
            "variables": {"aliasId": self.id},
            "query": alias_query,
        },
        {
            "operationName": "tagDisplay",
            "variables": {"beerId": self.id},
            "query": tag_query,
        },
    ]

    request = requests.post(
        "https://beta.ratebeer.com/v1/api/graphql/",
        data=json.dumps(data),
        headers={"content-type": "application/json"},
    )
    try:
        results = json.loads(request.text)
    except ValueError:
        # json's decode errors are ValueErrors; a bare ``except`` would
        # also have swallowed KeyboardInterrupt and SystemExit.
        raise rb_exceptions.JSONParseException(self.id)

    beer_data = results[0]["data"]["info"]
    if beer_data is None:
        raise rb_exceptions.PageNotFound(self.id)

    alias_data = results[1]["data"]["beerByAlias"]
    if alias_data is not None:
        raise rb_exceptions.AliasedBeer(self.id, alias_data["id"])

    tag_data = results[2]["data"]["tagDisplayArr"]["items"]
    brewer = beer_data["brewer"]

    self.name = beer_data["name"]

    # No Brewery object is built here; only the brewer's name and country
    # are stored. "United States" is stored under the short form "USA".
    brewery_country = brewer["country"]["name"]
    if brewery_country == "United States":
        brewery_country = "USA"
    self.brewery_country = brewery_country
    self.brewery_name = brewer["name"]
    self.brewed_at = None  # no longer supported

    self.overall_rating = self._format(beer_data["overallScore"])
    self.style_rating = self._format(beer_data["styleScore"])
    self.style = beer_data["style"]["name"]
    self.style_url = "/beerstyles/{0}/{1}/".format(
        re.sub("/", "-", self.style.lower()), beer_data["style"]["id"])
    self.img_url = "https://res.cloudinary.com/ratebeer/image/upload/w_152,h_309,c_pad,d_beer_img_default.png,f_auto/beer_{0}".format(
        self.id)
    self.num_ratings = self._format(beer_data["ratingCount"])
    self.mean_rating = self._format(beer_data["averageRating"])
    self.weighted_avg = None  # does not appear to exist anymore
    if beer_data["seasonal"] != "UNKNOWN":
        self.seasonal = beer_data["seasonal"]
    else:
        self.seasonal = None
    self.ibu = self._format(beer_data["ibu"])
    self.calories = self._format(beer_data["calories"])
    self.abv = self._format(beer_data["abv"])
    self.retired = beer_data["isRetired"]

    # A beer without a description would make re.sub raise TypeError, so
    # store None in that case instead.
    if beer_data["description"]:
        # Replace the \x92 character with a plain apostrophe.
        self.description = re.sub(r"\x92", "'", beer_data["description"])
    else:
        self.description = None

    if tag_data:
        self.tags = [t["urlName"] for t in tag_data]
    else:
        self.tags = None

    self._has_fetched = True
    return self