コード例 #1
0
    def _populate(self):
        """Fetch this brewery's page and copy its details onto the instance.

        The page at ``self.url`` is downloaded and used to set ``name``,
        ``type``, ``web`` (only when the page has a "Web: " entry),
        ``telephone``, ``street``, ``city``, ``state``, ``country`` and
        ``postal_code``. ``_has_fetched`` is set to True at the end.

        Returns:
            This same object.

        Raises:
            rb_exceptions.PageNotFound: if the ``container`` div, its table,
                or the first table row cannot be navigated.
        """
        page = soup_helper._get_soup(self.url)
        try:
            container = page.find('div', id='container')
            first_row = container.find('table').find_all('tr')[0]
            cells = first_row.find_all('td')
        except AttributeError:
            # find() returned None somewhere along the chain: not a brewery page.
            raise rb_exceptions.PageNotFound(self.url)

        self.name = page.h1.text
        type_matches = re.findall(r'Type: (.*?)<br\/>', page.decode_contents())
        self.type = type_matches[0].strip()

        # The website is optional; the link is the element after the label.
        web_labels = page.find_all(string='Web: ')
        if web_labels:
            self.web = web_labels[0].find_next()['href']

        # All contact/address fields live in the first cell of the first row.
        address_cell = cells[0]
        span_fields = (
            ('telephone', 'telephone'),
            ('street', 'streetAddress'),
            ('city', 'addressLocality'),
            ('state', 'addressRegion'),
            ('country', 'addressCountry'),
            ('postal_code', 'postalCode'),
        )
        for attribute, span_name in span_fields:
            setattr(self, attribute,
                    Brewery._find_span(address_cell, span_name))

        self._has_fetched = True
        return self
コード例 #2
0
    def _populate(self):
        """Fetch this brewery's page and copy its details onto the instance.

        The page at ``self.url`` is downloaded and the first
        ``schema.org/LocalBusiness`` div is used to set ``type``, ``web``
        (only when a media link with an ``href`` exists), ``telephone``,
        ``street``, ``city``, ``state``, ``country`` and ``postal_code``.
        ``name`` comes from the page's ``h1``. ``_has_fetched`` is set to
        True at the end.

        Returns:
            This same object.

        Raises:
            rb_exceptions.PageNotFound: if the page has no
                ``schema.org/LocalBusiness`` div.
        """
        soup = soup_helper._get_soup(self.url)
        businesses = soup.find_all(
            'div', {'itemtype': 'http://schema.org/LocalBusiness'})
        if not businesses:
            raise rb_exceptions.PageNotFound(self.url)
        business = businesses[0]

        self.name = soup.h1.text
        self.type = business.find_all('div')[1].text.strip()

        # The website is optional. find() yields None when an element is
        # absent, whereas indexing find_all()[0] raised IndexError before the
        # optional check could ever run.
        media_links = business.find('div', {'class': 'media-links'})
        website = media_links.find('a') if media_links is not None else None
        if website is not None and website.has_attr('href'):
            self.web = website['href']

        self.telephone = Brewery._find_span(business, 'telephone')
        self.street = Brewery._find_span(business, 'streetAddress')
        self.city = Brewery._find_span(business, 'addressLocality')
        self.state = Brewery._find_span(business, 'addressRegion')
        self.country = Brewery._find_span(business, 'addressCountry')
        self.postal_code = Brewery._find_span(business, 'postalCode')
        self._has_fetched = True

        return self
コード例 #3
0
def _get_soup(url):
    """Download a page and return it parsed with the lxml parser.

    Args:
        url: Either a path relative to ``_BASE_URL`` or an absolute URL that
            contains ``_BASE_URL``.

    Returns:
        A ``BeautifulSoup`` object built from the response text.

    Raises:
        rb_exceptions.PageNotFound: if the response body contains the
            "ratebeer robot oops" marker (case-insensitive).
    """
    # str.replace returns a new string. The result used to be discarded, so an
    # absolute URL ended up being requested with the base URL prepended twice.
    path = url.replace(_BASE_URL, "")
    req = requests.get(_BASE_URL + path, allow_redirects=True)
    if "ratebeer robot oops" in req.text.lower():
        # Report the URL exactly as the caller passed it.
        raise rb_exceptions.PageNotFound(url)
    return BeautifulSoup(req.text, "lxml")
コード例 #4
0
ファイル: soup.py プロジェクト: wboccard/ratebeer
def _get_soup(url):
    """Download a page and return it parsed with the lxml parser.

    Args:
        url: Either a path relative to ``_BASE_URL`` or an absolute URL that
            contains ``_BASE_URL``.

    Returns:
        A ``BeautifulSoup`` object built from the response text.

    Raises:
        rb_exceptions.PageNotFound: if the response body contains the
            "ratebeer robot oops" marker (case-insensitive).
    """
    # str.replace returns a new string. The result used to be discarded, so an
    # absolute URL ended up being requested with the base URL prepended twice.
    path = url.replace(_BASE_URL, '')
    req = requests.get(_BASE_URL + path, allow_redirects=True)
    # When the page carries this exact meta tag, switch the response encoding
    # to UTF-8 before the text is read again below.
    if '<meta http-equiv="Content-Type" content="text/html;" charset="utf-8">' in req.text:
        req.encoding = 'utf-8'
    if "ratebeer robot oops" in req.text.lower():
        # Report the URL exactly as the caller passed it.
        raise rb_exceptions.PageNotFound(url)
    return BeautifulSoup(req.text, "lxml")
コード例 #5
0
    def _populate(self):
        """Scrape the beer page at ``self.url`` and store its details on self.

        Sets ``name``, ``brewery``, ``brewed_at``, ``style``, ``style_url``,
        ``img_url``, ``num_ratings``, ``retired`` and ``tags``, plus
        ``description`` when the page has a commercial description.
        ``overall_rating``, ``style_rating``, ``mean_rating``,
        ``weighted_avg``, ``seasonal``, ``ibu``, ``calories`` and ``abv`` are
        optional: each is stored as ``None`` when its element is missing or
        (for the numeric ones) its text cannot be converted.
        ``_has_fetched`` is set to True at the end.

        Returns:
            This same object.

        Raises:
            rb_exceptions.PageNotFound: if the page lacks the ``container``
                div or the table inside it.
        """

        def _parsed(convert, locate):
            """Return convert(locate()), or None if the value is unavailable."""
            try:
                return convert(locate())
            except (ValueError, AttributeError):
                # AttributeError: an element in the lookup chain was None.
                # ValueError: placeholder text such as 'n/a', '-' or ''.
                return None

        soup = soup_helper._get_soup(self.url)
        # Check for 404s. ratebeer pages don't actually 404: the server sends
        # a "beer reference" page under the same URL, so the only signal used
        # here is whether the expected container/table structure exists.
        try:
            soup.find('div', id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)

        # General information from the top of the page
        self.name = soup.find(itemprop='name').text.strip()
        breweries = soup.find_all('a', href=re.compile('brewers'))
        self.brewery = Brewery(breweries[1].get('href'))
        self.brewery.name = breweries[1].text
        if len(breweries) == 3:
            # A third brewer link means the beer is brewed at another site.
            self.brewed_at = Brewery(breweries[2].get('href'))
            self.brewed_at.name = breweries[2].text
        else:
            self.brewed_at = None
        self.overall_rating = _parsed(
            int,
            lambda: soup.find(
                'span', text='overall').next_sibling.next_sibling.text)
        self.style_rating = _parsed(
            int,
            lambda: soup.find(
                'span', text='style').previous_sibling.previous_sibling)
        self.style = soup.find(text='Style: ').next_sibling.text
        self.style_url = soup.find(text='Style: ').next_sibling.get('href')
        self.img_url = soup.find(id="beerImg").get('src')

        # Data from the info bar
        self.num_ratings = int(soup.find('span', itemprop="ratingCount").text)
        self.mean_rating = _parsed(
            float,
            lambda: soup.find(text='MEAN: ').next_sibling.text.split('/')[0])
        self.weighted_avg = _parsed(
            float,
            lambda: soup.find(attrs={
                "name": "real average"
            }).find('span', itemprop="ratingValue").text)
        try:
            self.seasonal = soup.find(
                text=u'\xa0\xa0 SEASONAL: ').next_sibling.text
        except AttributeError:
            self.seasonal = None
        self.ibu = _parsed(
            int,
            lambda: soup.find(
                title="International Bittering Units - Normally from hops"
            ).next_sibling.next_sibling.text)
        self.calories = _parsed(
            int,
            lambda: soup.find(
                title="Estimated calories for a 12 fluid ounce serving"
            ).next_sibling.next_sibling.text)
        # The last character of the ABV text is dropped before conversion.
        self.abv = _parsed(
            float,
            lambda: soup.find(
                title="Alcohol By Volume").next_sibling.next_sibling.text[:-1])
        self.retired = soup.find(
            title="Currently out of production") is not None

        # Description
        description = soup.find('div', 'commercial-description-container')
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for ad in description('small'):
                ad.extract()
            self.description = ' '.join(description.strings).strip()
        self.tags = [
            t.text[1:] for t in soup.find_all('span', class_="tagLink")
        ]

        self._has_fetched = True

        return self
コード例 #6
0
    def _populate(self):
        """Scrape the beer page at ``self.url`` and store its details on self.

        Sets ``name``, ``brewery``, ``brewery_url``, ``style``,
        ``brewery_country`` and whichever info-bar values are present
        (``num_ratings``, ``mean_rating``, ``weighted_avg``, ``seasonal``,
        ``calories``, ``abv``, ``ibu``). ``brewed_at``/``brewed_at_url``,
        the two rating fields and ``description`` are set only when the page
        provides them. ``_has_fetched`` is set to True at the end.

        Returns:
            This same object.

        Raises:
            rb_exceptions.PageNotFound: if the container table is missing or
                the page is the "beer reference" placeholder.
            rb_exceptions.AliasedBeer: if the page says the beer is
                "Also known as " another beer.
        """
        soup = soup_helper._get_soup(self.url)
        # check for 404s
        try:
            soup_rows = soup.find('div',
                                  id='container').find('table').find_all('tr')
        except AttributeError:
            raise rb_exceptions.PageNotFound(self.url)
        # ratebeer pages don't actually 404, they just send you to this weird
        # "beer reference" page but the url doesn't actually change, it just
        # seems like it's all getting done server side -- so we have to look
        # for the contents h1 to see if we're looking at the beer reference or
        # not
        header_cell = soup_rows[0].find_all('td')[1]
        if "beer reference" in header_cell.h1.contents:
            raise rb_exceptions.PageNotFound(self.url)

        body_cell = soup_rows[1].find_all('td')[1]
        if "Also known as " in body_cell.div.div.contents:
            raise rb_exceptions.AliasedBeer(self.url,
                                            body_cell.div.div.a['href'])

        # get beer meta information
        # the info bar is a run of "KEYWORD: value" chunks separated by two
        # non-breaking spaces
        brew_info_html = body_cell.div.small
        keyword_lookup = {
            "RATINGS": "num_ratings",
            "MEAN": "mean_rating",
            "WEIGHTED AVG": "weighted_avg",
            "SEASONAL": "seasonal",
            "CALORIES": "calories",
            "EST. CALORIES": "calories",
            "ABV": "abv",
            "IBU": "ibu",
        }
        for chunk in brew_info_html.text.split(u'\xa0\xa0'):
            meta_name, separator, meta_data = chunk.partition(': ')
            if not separator:
                # Not a "KEYWORD: value" chunk; nothing to record.
                continue
            match = keyword_lookup.get(meta_name.strip())
            if not match:
                continue
            if match == "mean_rating":
                # Keep only the part before the "/" (e.g. "3.5/5" -> "3.5").
                # This used to compare against "mean", which is never a
                # lookup value, so the suffix was never removed.
                meta_data = meta_data.partition("/")[0]
            elif match == "abv":
                # Drop the trailing character before conversion.
                meta_data = meta_data[:-1]
            # convert to a number if possible, otherwise keep the raw text
            try:
                if match == "num_ratings":
                    meta_data = int(meta_data)
                else:
                    meta_data = float(meta_data)
            except ValueError:
                pass
            setattr(self, match, meta_data)

        info = soup_rows[1].tr.find_all('td')

        # get basic brewery information
        brewery_info = info[1].find('div').contents
        brewery_urls = brewery_info[0].find_all('a')
        brewery = brewery_urls[0]
        brewed_at = None
        if len(brewery_urls) == 2:
            brewed_at = brewery_urls[1]
        if brewery:
            self.brewery = brewery.text.strip()
            self.brewery_url = brewery.get('href')
        if brewed_at:
            self.brewed_at = brewed_at.text.strip()
            self.brewed_at_url = brewed_at.get('href')

        # get ratings
        ratings = info[0].find_all('div')
        if len(ratings) > 3:
            self.overall_rating = ratings[1].contents[2]
            self.style_rating = ratings[3].contents[0]

        # get the beer style
        if brewery_info[3]:
            self.style = brewery_info[3].text.strip()

        # get the beer country
        if ',' in brewery_info[5]:
            # Non-USA addresses
            self.brewery_country = brewery_info[5].split(',')[1].strip()
        else:
            # USA addresses
            self.brewery_country = brewery_info[8].strip()

        # get the beer description
        description = body_cell.find(
            'div',
            style=('border: 1px solid #e0e0e0; background: #fff; '
                   'padding: 14px; color: #777;'))
        if 'no commercial description' not in description.text.lower():
            # strip ads
            for ad in description('small'):
                ad.extract()
            self.description = ' '.join(description.strings).strip()

        # get name
        self.name = header_cell.h1.text.strip()
        self._has_fetched = True

        return self
コード例 #7
0
ファイル: models.py プロジェクト: fpierfed/ratebeer
    def _populate(self):
        """Query the ratebeer GraphQL endpoint and store the beer's details.

        When ``self.id`` is not set it is taken from the second-to-last path
        segment of ``self.url``. Three operations are posted in one request
        ("beer", "beerByAlias", "tagDisplay"). If the id turns out to be an
        alias, ``name``, ``id`` and ``url`` are rewritten to the alias target
        and the lookup is repeated for it.

        Returns:
            This same object, with the beer attributes and ``_has_fetched``
            set.

        Raises:
            rb_exceptions.JSONParseException: if the response is not valid
                JSON.
            rb_exceptions.PageNotFound: if the API returns no beer for the id.
        """
        if not self.id:
            self.id = self.url.split('/')[-2]

        # The response is indexed positionally below, so the order of these
        # operations matters.
        data = [
            {
                "operationName": "beer",
                "variables": {"beerId": self.id},
                "query": "query beer($beerId: ID!) { \n info: beer(id: $beerId) { \n id \n name \n description \n style { \n id \n name \n glasses { \n id \n name \n __typename \n } \n __typename \n } \n styleScore \n overallScore \n averageRating \n abv \n ibu \n calories \n brewer { \n id \n name \n __typename \n } \n ratingCount \n isRetired \n isUnrateable \n seasonal \n labels \n availability { \n bottle \n tap \n distribution \n __typename \n } \n __typename \n } \n} \n",
            },
            {
                "operationName": "beerByAlias",
                "variables": {"aliasId": self.id},
                "query": "query beerByAlias($aliasId: ID!) {\n beerByAlias(aliasId: $aliasId) {\n id\n name \n overallScore \n __typename \n } \n } \n",
            },
            {
                "operationName": "tagDisplay",
                "variables": {"beerId": self.id},
                "query": "query tagDisplay($beerId: ID!, $first: Int) { \n tagDisplayArr: beerTags(beerId: $beerId, first: $first) { \n items { \n id \n urlName: plain \n __typename \n } \n __typename \n } \n} \n",
            },
        ]

        request = requests.post(
            "https://beta.ratebeer.com/v1/api/graphql/",
            data=json.dumps(data),
            headers={"content-type": "application/json"},
        )

        try:
            results = json.loads(request.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError; a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            raise rb_exceptions.JSONParseException(self.id)

        beer_data = results[0]['data']['info']
        if beer_data is None:
            raise rb_exceptions.PageNotFound(self.id)

        alias_data = results[1]['data']['beerByAlias']
        if alias_data is not None:
            # Resolve the alias: we have all the information. We decide to just
            # return the alias and not the original beer from the query.
            self.name = alias_data['name']
            self.id = alias_data['id']
            self.url = '/beer/{0}/{1}/'.format(
                self.name.replace(' ', '-').lower(),
                self.id
            )
            return self._populate()

        tag_data = results[2]['data']['tagDisplayArr']['items']
        brewer = beer_data['brewer']

        self.name = beer_data['name']
        self.brewery = Brewery('/brewers/{0}/{1}/'.format(
            re.sub('[/ ]', '-', brewer['name'].lower()), brewer['id']))
        self.brewery.name = brewer['name']
        self.brewed_at = None  # no longer supported
        self.overall_rating = self._format(beer_data['overallScore'])
        self.style_rating = self._format(beer_data['styleScore'])
        self.style = beer_data['style']['name']
        self.style_url = "/beerstyles/{0}/{1}/".format(
            re.sub('/', '-', self.style.lower()), beer_data['style']['id'])
        self.img_url = "https://res.cloudinary.com/ratebeer/image/upload/w_152,h_309,c_pad,d_beer_img_default.png,f_auto/beer_{0}".format(self.id)
        self.num_ratings = self._format(beer_data['ratingCount'])
        self.mean_rating = self._format(beer_data['averageRating'])
        self.weighted_avg = None  # does not appear to exist anymore
        if beer_data['seasonal'] != 'UNKNOWN':
            self.seasonal = beer_data['seasonal']
        else:
            self.seasonal = None
        self.ibu = self._format(beer_data['ibu'])
        self.calories = self._format(beer_data['calories'])
        self.abv = self._format(beer_data['abv'])
        self.retired = beer_data['isRetired']

        if beer_data['description']:
            # Replace the \x92 character with a plain apostrophe.
            self.description = re.sub(r'\x92', '\'', beer_data['description'])
        else:
            self.description = None
        if tag_data:
            self.tags = [t['urlName'] for t in tag_data]
        else:
            self.tags = None

        self._has_fetched = True

        return self
コード例 #8
0
ファイル: models.py プロジェクト: parryc/record_beer
    def _populate(self):
        """Query the ratebeer GraphQL endpoint and store the beer's details.

        When ``self.id`` is not set it is taken from the second-to-last path
        segment of ``self.url``. Three operations are posted in one request
        ("beer", "beerByAlias", "tagDisplay").

        Returns:
            This same object, with the beer attributes and ``_has_fetched``
            set.

        Raises:
            rb_exceptions.JSONParseException: if the response is not valid
                JSON.
            rb_exceptions.PageNotFound: if the API returns no beer for the id.
            rb_exceptions.AliasedBeer: if the id is an alias of another beer.
        """
        if not self.id:
            self.id = self.url.split("/")[-2]

        # The response is indexed positionally below, so the order of these
        # operations matters.
        data = [
            {
                "operationName": "beer",
                "variables": {"beerId": self.id},
                "query": "query beer($beerId: ID!) { \n info: beer(id: $beerId) { \n id \n name \n description \n style { \n id \n name \n glasses { \n id \n name \n __typename \n } \n __typename \n } \n styleScore \n overallScore \n averageRating \n abv \n ibu \n calories \n brewer { \n id \n name \n country \n { \n code \n name \n __typename \n }  \n __typename \n } \n ratingCount \n isRetired \n isUnrateable \n seasonal \n labels \n availability { \n bottle \n tap \n distribution \n __typename \n } \n __typename \n } \n} \n",
            },
            {
                "operationName": "beerByAlias",
                "variables": {"aliasId": self.id},
                "query": "query beerByAlias($aliasId: ID!) {\n beerByAlias(aliasId: $aliasId) {\n id\n name \n overallScore \n __typename \n } \n } \n",
            },
            {
                "operationName": "tagDisplay",
                "variables": {"beerId": self.id},
                "query": "query tagDisplay($beerId: ID!, $first: Int) { \n tagDisplayArr: beerTags(beerId: $beerId, first: $first) { \n items { \n id \n urlName: plain \n __typename \n } \n __typename \n } \n} \n",
            },
        ]

        request = requests.post(
            "https://beta.ratebeer.com/v1/api/graphql/",
            data=json.dumps(data),
            headers={"content-type": "application/json"},
        )

        try:
            results = json.loads(request.text)
        except ValueError:
            # json.JSONDecodeError is a ValueError; a bare except would also
            # swallow KeyboardInterrupt and SystemExit.
            raise rb_exceptions.JSONParseException(self.id)

        beer_data = results[0]["data"]["info"]
        if beer_data is None:
            raise rb_exceptions.PageNotFound(self.id)

        alias_data = results[1]["data"]["beerByAlias"]
        if alias_data is not None:
            raise rb_exceptions.AliasedBeer(self.id, alias_data["id"])

        tag_data = results[2]["data"]["tagDisplayArr"]["items"]

        self.name = beer_data["name"]
        brewery_country = beer_data["brewer"]["country"]["name"]
        if brewery_country == "United States":
            brewery_country = "USA"
        self.brewery_country = brewery_country
        self.brewery_name = beer_data["brewer"]["name"]
        self.brewed_at = None  # no longer supported
        self.overall_rating = self._format(beer_data["overallScore"])
        self.style_rating = self._format(beer_data["styleScore"])
        self.style = beer_data["style"]["name"]
        self.style_url = "/beerstyles/{0}/{1}/".format(
            re.sub("/", "-", self.style.lower()), beer_data["style"]["id"])
        self.img_url = "https://res.cloudinary.com/ratebeer/image/upload/w_152,h_309,c_pad,d_beer_img_default.png,f_auto/beer_{0}".format(
            self.id)
        self.num_ratings = self._format(beer_data["ratingCount"])
        self.mean_rating = self._format(beer_data["averageRating"])
        self.weighted_avg = None  # does not appear to exist anymore
        if beer_data["seasonal"] != "UNKNOWN":
            self.seasonal = beer_data["seasonal"]
        else:
            self.seasonal = None
        self.ibu = self._format(beer_data["ibu"])
        self.calories = self._format(beer_data["calories"])
        self.abv = self._format(beer_data["abv"])
        self.retired = beer_data["isRetired"]

        # The description can be null in the response; re.sub would raise
        # TypeError on None, so only clean it up when one is present.
        description = beer_data["description"]
        if description is not None:
            # Replace the \x92 character with a plain apostrophe.
            self.description = re.sub(r"\x92", "'", description)
        else:
            self.description = None
        if tag_data:
            self.tags = [t["urlName"] for t in tag_data]
        else:
            self.tags = None

        self._has_fetched = True

        return self