Code example #1
def collectCountryBrewers(url_base, countryurl, countryname, proxies):
    brewers = OrderedDict()
    try:
        req = requests.get(countryurl, proxies=proxies)
    except Exception as err:
        print('Error (BeerBrewers) request')
        print(' ' * 4, countryurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (BeerBrewers) soup:', countryurl.encode(), err)
        else:
            for a in soup.find_all(
                    TAG_A, href=re.compile(r'/brewers/[\W\w\d]+/[\d]+/')):
                try:
                    url_ = url_base + a[ATTRIB_HREF]
                    name = unicode2ascii(a.text)
                except Exception as err:
                    print('Error (BeerBrewers) iterator name:', err)
                    continue
                else:
                    try:
                        row = [name] + [
                            unicode2ascii(td.text) for td in a.parent.parent
                        ]
                    except Exception as err:
                        print('Error (BeerBrewers) iterator row:', err)
                        row = [
                            name,
                        ] + [str()] * 5
                    finally:
                        brewers[url_] = tuple(row)
    return countryurl, countryname, brewers
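
unicode2ascii is called throughout these examples but is not defined in this excerpt. A minimal sketch of such a helper, assuming it only folds Unicode text down to an ASCII approximation and trims whitespace, could look like this:

import unicodedata

def unicode2ascii(text):
    # Hypothetical helper (not the original implementation): NFKD-normalize,
    # drop code points that do not survive the ASCII round trip, then strip
    # surrounding whitespace.
    normalized = unicodedata.normalize('NFKD', text)
    return normalized.encode('ascii', 'ignore').decode('ascii').strip()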
Code example #2
def collectBeerData(beerurl, beername, image_basepath, proxies):
    beerData = OrderedDict()
    
    beerData[BeerGiumGrabber.STR_SHORT_DESCRIPTION] = str()
    beerData[BeerGiumGrabber.STR_MORE_INFO] = str()
    beerData[BeerGiumGrabber.STR_TABLE_INFO] = []
    beerData[BeerGiumGrabber.STR_BEER_IMAGES] = []
    
    try:
        req = requests.get(beerurl, proxies=proxies)
    except Exception as err:
        print('Error (BeerPages) request:')
        print(' '*4, beerurl.encode())
        print(' '*4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (BeerPages) soup:')
            print(' '*4, beerurl.encode())
            print(' '*4, err)
        else:
            images = []
            for a in soup.find_all(TAG_A, {'class': 'jqzoom'}):
                basename = os.path.basename(a[ATTRIB_HREF])
                image_path = os.path.join(image_basepath, basename)
                images.append((a[ATTRIB_HREF], image_path))
            for div in soup.find_all(TAG_DIV, {'id': 'short_description_content'}):
                beerData[BeerGiumGrabber.STR_SHORT_DESCRIPTION] = unicode2ascii(div.text)
                break
            for div in soup.find_all(TAG_DIV, {'id': 'more_info_sheets'}):
                beerData[BeerGiumGrabber.STR_MORE_INFO] = unicode2ascii(div.text)
                break
            for table in soup.find_all(TAG_TABLE, {'class': 'table-data-sheet table-bordered'}):
                for tr in table:
                    r = [unicode2ascii(td.text) for td in tr]
                    beerData[BeerGiumGrabber.STR_TABLE_INFO].append(r)
                break
            for image_url, image_path in images:
                if not os.path.exists(image_path):
                    try:
                        req = requests.get(image_url, proxies=proxies)
                    except Exception as err:
                        print('Error (BeerPages) request:')
                        print(' '*4, image_url.encode())
                        print(' '*4, err)
                    else:
                        if len(req.content) > 0:
                            try:
                                # Use a context manager so the file handle is closed promptly.
                                with open(image_path, ReadWriteFile.WRITE_BINARY_NEW) as out:
                                    out.write(req.content)
                            except Exception as err:
                                print('Error (BeerPages) BeerImage save:')
                                print(' '*4, err)
                            else:
                                beerData[BeerGiumGrabber.STR_BEER_IMAGES].append((image_url, image_path))
    return (beerurl, beername, beerData)
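
These functions also rely on module-level constants (LXML, TAG_*, ATTRIB_*) and a ReadWriteFile helper that are not shown here. Plausible definitions, stated as assumptions rather than the original code, would be:

# Assumed constants; the original module may define them differently.
LXML = 'lxml'                                   # parser name passed to BeautifulSoup
TAG_A, TAG_B, TAG_DIV = 'a', 'b', 'div'
TAG_TABLE, TAG_TR, TAG_P = 'table', 'tr', 'p'
TAG_SPAN, TAG_SMALL, TAG_IMG = 'span', 'small', 'img'
ATTRIB_HREF, ATTRIB_SRC = 'href', 'src'


class ReadWriteFile:
    # Hypothetical file-mode constants used with open().
    READ_BINARY = 'rb'
    WRITE_BINARY_NEW = 'wb'    # possibly 'xb' in the original ('new' suggests exclusive create)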
Code example #3
    def beerRows(self, print_=False):

        FIELDS = ['name', 'type']
        ROWS = []

        INFO = [('Accessory:', str()), ('Alcohol:', str()), ('Awards:', str()),
                ('Brewery:', str()), ('Color:', str()), ('Country:', str()),
                ('Hop:', str()), ('IBU:', str()), ('Malt:', str()),
                ('Plato:', str()), ('Rating:', str()), ('Recipes:', str()),
                ('Served:', str()), ('Type:', str())]

        FIELDS.extend([v for v, _ in INFO])
        FIELDS.append('Description')
        if print_:
            print(chr(9).join(FIELDS))
        FIELDS.append(self.STR_BEER_IMAGES)
        for _, (beer_type, beers) in self.beerTypes.items():
            for beer_url, beer_name in beers.items():
                row = [beer_name, beer_type]
                beerData = self.beerData[(beer_url, beer_name)]
                info = beerData[self.STR_TABLE_INFO]
                for i, r in enumerate(info):
                    if i == 0:
                        _, *r = r
                        d = OrderedDict(INFO)
                        ir = len(r)
                        for iv, v in enumerate(r, 1):
                            if v.endswith(':'):
                                if iv < ir:
                                    if (not r[iv].endswith(':')):
                                        d[v] = unicode2ascii(r[iv])
                        row.extend(d.values())
                    elif i == 1:
                        row.append(unicode2ascii(chr(10).join(r)))
                    else:
                        print('pass row:', r)
                if len(info) == 0:
                    d = OrderedDict(INFO)
                    row.extend(d.values())
                    row.append(str())
                elif len(info) == 1:
                    row.append(str())
                if print_:
                    print(chr(9).join(map(str, row)))
                images = [
                    image_path
                    for _, image_path in beerData[self.STR_BEER_IMAGES]
                ]
                if not images:
                    print(beer_name, beer_type, beer_url)

                row.append(images)
                ROWS.append(row)

        return FIELDS, ROWS
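
One possible way to consume the (FIELDS, ROWS) pair returned above is to dump it to a tab-separated file. This is a sketch rather than code from the original project; writeBeerRows and the grabber argument are hypothetical names:

import csv

def writeBeerRows(grabber, path):
    # Hypothetical helper: write the header and rows returned by beerRows()
    # as a tab-separated file; the image list in the last column is joined
    # with ';' so each record stays on one line.
    fields, rows = grabber.beerRows()
    with open(path, 'w', newline='', encoding='utf-8') as fh:
        writer = csv.writer(fh, delimiter='\t')
        writer.writerow(fields)
        for row in rows:
            *values, images = row
            writer.writerow(values + [';'.join(images)])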
Code example #4
    def runBeers(self):
        if (not self.update_all) and os.path.exists(self.beerPath):
            with open(self.beerPath, ReadWriteFile.READ_BINARY) as fh:
                self.beerTypes.update(pickle.load(fh))
                self.beerPages.update(pickle.load(fh))
            print('Beers:', len(self.beerPages))
            return
        if (not self.update_all):
            return
        try:
            req = requests.get(self._url_beer_types, proxies=self.proxies)
        except Exception as err:
            print('Error (Beers) request')
            print(' ' * 4, self._url_beer_types.encode())
            print(' ' * 4, err)
        else:
            try:
                soup = BeautifulSoup(req.content, LXML)
            except Exception as err:
                print('Error (Beers) soup:', err)
            else:
                for a in soup.find_all(TAG_A, {'class': 'footer'}):
                    name = unicode2ascii(a.text)
                    url = chr(47).join([self._url_base, a[ATTRIB_HREF]])
                    self.beerTypes[url] = [name, OrderedDict()]

        for type_url, (name, beers) in self.beerTypes.items():
            try:
                req = requests.get(type_url, proxies=self.proxies)
            except Exception as err:
                print('Error (Beers) request')
                print(' ' * 4, type_url.encode())
                print(' ' * 4, err)
            else:
                try:
                    soup = BeautifulSoup(req.content, LXML)
                except Exception as err:
                    print('Error (Beers) soup:', err)
                else:
                    for a in soup.find_all(TAG_A, {'class': 'beer'}):
                        beer_name = unicode2ascii(a.text)
                        beer_url = chr(47).join(
                            [self._url_base, a[ATTRIB_HREF]])
                        if 'UBID=' in beer_url:
                            beers[beer_url] = beer_name
                            self.beerPages[beer_url] = beer_name

        print('Beers:', len(self.beerPages))
        with open(self.beerPath, ReadWriteFile.WRITE_BINARY_NEW) as fh:
            pickle.dump(self.beerTypes, fh)
            pickle.dump(self.beerPages, fh)
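
All of these methods hand the proxies attribute straight to requests.get, so it is expected to be the standard requests proxy mapping (or None to disable proxying). The addresses below are placeholders, not values from the original project:

proxies = {
    'http': 'http://127.0.0.1:3128',     # placeholder proxy address
    'https': 'http://127.0.0.1:3128',
}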
Code example #5
    def runCountries(self):

        if (not self.update_all) and os.path.exists(self.countryPath):
            with open(self.countryPath, ReadWriteFile.READ_BINARY) as fh:
                self.countryPages.extend(pickle.load(fh))
            print('Countries:', len(self.countryPages))
            return
        if (not self.update_all):
            return
        try:
            req = requests.get(self._url_country, proxies=self.proxies)
        except Exception as err:
            print('Error (Countries) request')
            print(' ' * 4, self._url_country.encode())
            print(' ' * 4, err)
        else:
            try:
                soup = BeautifulSoup(req.content, LXML)
            except Exception as err:
                print('Error (Countries) soup:', err)
            else:
                for a in soup.find_all(
                        TAG_A,
                        href=re.compile(r'/place/directory/0/[\W\w\d]+/')):
                    name = unicode2ascii(a.text)
                    url = self._url_base + a[ATTRIB_HREF]
                    num, = re.findall(r'(\d+)', name)
                    name = name.replace('(%s)' % num, str())
                    name = name.strip()
                    self.countryPages.append((url, name, int(num)))

        print('Countries:', len(self.countryPages))
        with open(self.countryPath, ReadWriteFile.WRITE_BINARY_NEW) as fh:
            pickle.dump(self.countryPages, fh)
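
runCountries assumes link text of the form 'Belgium (245)' and peels the count off with re.findall. A small standalone illustration of that parsing step (the sample string is made up):

import re

name = 'Belgium (245)'                     # made-up sample of the link text
num, = re.findall(r'(\d+)', name)          # -> '245'
name = name.replace('(%s)' % num, str()).strip()
print(name, int(num))                      # -> Belgium 245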
Code example #6
def collectBeers(url_base, breweryurl, breweryname, proxies):
    beerUrls = OrderedDict()
    brewerData = OrderedDict()
    url = breweryurl + '?view=beers&show=all'
    try:
        req = requests.get(url, proxies=proxies)
    except Exception as err:
        print('Error (Beers) request:')
        print(' ' * 4, breweryurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (Beers) soup:')
            print(' ' * 4, breweryurl.encode())
            print(' ' * 4, err)
        else:
            for a in soup.find_all(
                    TAG_A, href=re.compile(r'/beer/profile/[\d]+/[\d]+')):
                try:
                    beer_url = url_base + a[ATTRIB_HREF]
                    row = [unicode2ascii(td.text) for td in a.parent.parent]
                    beer_name = row[0]
                except Exception as err:
                    print('Error (Beers) profile:')
                    print(' ' * 4, err)
                else:
                    beerUrls[(beer_url, beer_name)] = row

            # Capture the text block that follows each of the three labelled sections.
            for b in soup.find_all(TAG_B):
                text = unicode2ascii(b.text)
                if text in ('BEER AVG', 'PLACE INFO', 'BEER STATS'):
                    brewerData[text] = b.parent.text.lstrip(text).encode()
    return (breweryurl, breweryname, beerUrls, brewerData)
Code example #7
def collectBeers(url_base, breweryurl, breweryname, proxies):
    beerUrls = OrderedDict()
    soup = None
    req = None
    try:
        req = requests.get(breweryurl, proxies=proxies)
    except Exception as err:
        print('Error (BeerBrewers) request')
        print(' ' * 4, breweryurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (BeerBrewers) soup:', breweryurl.encode(), err)
        else:
            for tr in soup.find_all(TAG_TR, {'valign': 'middle'}):
                try:
                    beer_name = unicode2ascii(tr.td.a.text)
                    beer_url = url_base + tr.td.a[ATTRIB_HREF]
                except Exception as err:
                    print('Error (BeerBrewers) iterator beer name:', err)
                    continue
                else:
                    try:
                        row = [
                            beer_name,
                        ] + [
                            unicode2ascii(td)
                            if isinstance(td, str) else unicode2ascii(td.text)
                            for td in tr
                        ][1:]
                    except Exception as err:
                        print('Error (BeerBrewers) iterator row:', err)
                        row = [
                            beer_name,
                        ] + [str()] * 5
                    finally:
                        beerUrls[beer_url] = tuple(row)
    return (breweryurl, breweryname, beerUrls)
Code example #8
    def runBelgiumBeers(self):

        if (not self.update_all) and os.path.exists(self.beerPath):
            with open(self.beerPath, ReadWriteFile.READ_BINARY) as fh:
                self.beerPages.update(pickle.load(fh))
            print('Beers:', len(self.beerPages))
            return
        if (not self.update_all):
            return
        numPages = 0
        try:
            req = requests.get(self._url_beer_belgium, proxies=self.proxies)
        except Exception as err:
            print('Error (Beers) request')
            print(' '*4, self._url_beer_belgium.encode())
            print(' '*4, err)
        else:
            try:
                soup = BeautifulSoup(req.content, LXML)
            except Exception as err:
                print('Error (Beers) soup:', err)
            else:
                pages = []
                for a in soup.find_all(TAG_A, href=re.compile(r'/en/360-belgium\?p=[\d]+')):
                    # chr(61) is '='; keep the page number that follows '?p='.
                    pages.append(int(a[ATTRIB_HREF].split(chr(61), 1)[1]))
                if pages:
                    numPages = max(pages)
                for a in soup.find_all(TAG_A, {'class': 'product-name'}):
                    name = unicode2ascii(a.text)
                    self.beerPages[a[ATTRIB_HREF]] = [name, str(), str()]
                    for p in a.parent.parent.find_all(TAG_P, {'class': 'pro_list_manufacturer'}):
                        self.beerPages[a[ATTRIB_HREF]][1] = unicode2ascii(p.text)
                    for span in a.parent.parent.find_all(TAG_SPAN, {'itemprop': 'price'}):
                        self.beerPages[a[ATTRIB_HREF]][2] = unicode2ascii(span.text)

        print(numPages, len(self.beerPages))

        for pageId in range(2, numPages + 1):
            url = '{}/en/360-belgium?p={}'.format(self._url_base, pageId)
            try:
                req = requests.get(url, proxies=self.proxies)
            except Exception as err:
                print('Error (Beers) request')
                print(' '*4, url.encode())
                print(' '*4, err)
            else:
                try:
                    soup = BeautifulSoup(req.content, LXML)
                except Exception as err:
                    print('Error (Beers) soup:', err)
                else:
                    for a in soup.find_all(TAG_A, {'class': 'product-name'}):
                        name = unicode2ascii(a.text)
                        self.beerPages[a[ATTRIB_HREF]] = [name, str(), str()]
                        for p in a.parent.parent.find_all(TAG_P, {'class': 'pro_list_manufacturer'}):
                            self.beerPages[a[ATTRIB_HREF]][1] = unicode2ascii(p.text)
                        for span in a.parent.parent.find_all(TAG_SPAN, {'itemprop': 'price'}):
                            self.beerPages[a[ATTRIB_HREF]][2] = unicode2ascii(span.text)

        print('Beers:', len(self.beerPages))
        with open(self.beerPath, ReadWriteFile.WRITE_BINARY_NEW) as fh:
            pickle.dump(self.beerPages, fh)
Code example #9
    def runCountryBrewers(self):
        global TASK_FINISHED
        global TASK_TOTAL
        TASK_FINISHED = 0
        TASK_TOTAL = 0
        if os.path.exists(self.countryBreweryPath):
            with open(self.countryBreweryPath,
                      ReadWriteFile.READ_BINARY) as fh:
                self.countryBreweryPages.update(pickle.load(fh))
            print(
                'Countries-Breweries:',
                sum(
                    len(self.countryBreweryPages[c])
                    for c in self.countryBreweryPages))
            return
        if not self.update_all:
            return
        pages = OrderedDict()

        try:
            req = requests.get(self._url_country_brewers, proxies=self.proxies)
        except Exception as err:
            print('Error (BeerBrewers) request')
            print(' ' * 4, self._url_country_brewers.encode())
            print(' ' * 4, err)
        else:
            try:
                soup = BeautifulSoup(req.content, LXML)
            except Exception as err:
                print('Error (BeerBrewers) soup:', err)
            else:
                for a in soup.find_all(
                        TAG_A, href=re.compile(r'/breweries/[\W\w\d]+/[\d]+/')):
                    url_ = self._url_base + a[ATTRIB_HREF]
                    name = unicode2ascii(a.text)
                    pages[url_] = name

        print('Countries:', len(pages))
        progress = len(pages)
        pool = Pool(processes=self.THREAD_NUMBER)
        for step, country_url in enumerate(pages, 1):
            country_name = pages[country_url]
            print('run page', step, '-', progress, ':', country_name,
                  country_url.encode())
            pool.apply_async(collectCountryBrewers,
                             args=(self._url_base, country_url, country_name,
                                   self.proxies),
                             callback=resultCountryBrewers)
            TASK_TOTAL += 1
        pool.close()

        print('Total tasks:', TASK_TOTAL)
        while TASK_TOTAL != TASK_FINISHED:
            print('executed:', TASK_FINISHED, 'from:', TASK_TOTAL, 'progress:',
                  TASK_FINISHED / TASK_TOTAL)
            sleep(self.PROGRESS_UPDATE_TIME)
        pool.join()

        print(
            'Countries-Breweries:',
            sum(
                len(self.countryBreweryPages[c])
                for c in self.countryBreweryPages))
        with open(self.countryBreweryPath,
                  ReadWriteFile.WRITE_BINARY_NEW) as fh:
            pickle.dump(self.countryBreweryPages, fh)
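
The resultCountryBrewers callback passed to apply_async is not part of this excerpt. For the progress loop above to terminate it has to increment the global TASK_FINISHED counter; a minimal sketch under that assumption (the module-level countryBreweryPages store is also an assumption; the original presumably fills the grabber's own dict):

from collections import OrderedDict

TASK_FINISHED = 0                      # matches the global used above
countryBreweryPages = OrderedDict()    # assumed module-level store

def resultCountryBrewers(result):
    # Hypothetical apply_async callback: unpack the tuple returned by
    # collectCountryBrewers, keep the brewers found for that country, and
    # count the task as finished so the progress loop can exit.
    global TASK_FINISHED
    countryurl, countryname, brewers = result
    countryBreweryPages[(countryurl, countryname)] = brewers
    TASK_FINISHED += 1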
Code example #10
def collectBeerPages(beerurl, beername, image_basepath, proxies):
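    # row layout: [0] beer name, [1] beer URL, [2] image directory,
    # [3] image URL, [4] local image path, [5] info/ratings text,
    # [6] fallback image URL, [7] image size in bytes (0 if unavailable),
    # [8] beer style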

    row = [beername, beerurl, image_basepath, None, None, None, None, 0, None]

    try:
        req = requests.get(beerurl, proxies=proxies)
    except Exception as err:
        print('Error (BeerImage) request:')
        print(' ' * 4, beerurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (BeerImage) soup:')
            print(' ' * 4, beerurl.encode())
            print(' ' * 4, err)
        else:
            image_url = None
            image_path = None
            image_basename = None
            info = None

            for a in soup.find_all(
                    TAG_A, href=re.compile(r'/beerstyles/[\w\W\d]+/[\d]+')):
                row[8] = unicode2ascii(a.text)
                if row[8] != 'Top By Style':
                    break

            for img in soup.find_all(TAG_IMG, {'itemprop': 'image'}):
                image_url = img[ATTRIB_SRC]
                image_basename = os.path.basename(image_url)
                image_path = os.path.join(image_basepath, image_basename)

            for small in soup.find_all(TAG_SMALL):
                data = unicode2ascii(small.text)
                if 'RATINGS:' in data:
                    info = data
                    break
                elif len(small) > 8:
                    info = data
                    break

            if info is None:
                for a in soup.find_all(
                        'a', href=re.compile(r'/beer/[\W\w\d]+/[\d]+/')):
                    if b'Proceed to the aliased beer' in a.parent.text.encode(
                    ):
                        info = unicode2ascii(a.parent.text)
                        break

            if info is None and b"we didn't find this beer" in req.content:
                info = unicode2ascii(req.content.decode())

            row[3] = image_url
            row[4] = image_path
            row[5] = info

            if image_url and (os.path.exists(image_path)):
                row[7] = os.stat(image_path).st_size
            elif image_url and (not os.path.exists(image_path)):
                p, _ = os.path.split(image_url)
                p, _ = os.path.split(p)
                img_url = chr(47).join([p, image_basename])
                row[6] = img_url
                try:
                    req = requests.get(img_url, proxies=proxies)
                except Exception as err:
                    print('Error (BeerImage) image:')
                    print(' ' * 4, beerurl.encode())
                    print(' ' * 4, err)
                else:
                    iLen = len(req.content)
                    if iLen > 0:
                        try:
                            # Use a context manager so the file handle is closed promptly.
                            with open(image_path,
                                      ReadWriteFile.WRITE_BINARY_NEW) as out:
                                out.write(req.content)
                        except Exception as err:
                            print('Error (BeerImage) image save:')
                            print(' ' * 4, beerurl.encode())
                            print(' ' * 4, err)
                        else:
                            row[7] = iLen
    return (beerurl, beername, row)
Code example #11
    def beerRows(self, print_=False):

        STATS = [
            ('Reviews:', -1),
            ('Ratings:', -1),
            ('Avg:', -1),
            ('pDev:', '-1%'),
            ('Wants:', -1),
            ('Gots:', -1),
            ('For Trade:', -1),
        ]

        INFO = [
            ('Brewed by:', str()),
            ('Style:', str()),
            ('Alcohol by volume (ABV):', str()),
            ('Availability:', str()),
            ('Notes / Commercial Description:', str()),
            ('Added by', str()),
        ]

        FIELDS = [
            'brewery',
            'beer',
            'ba-score',
            'ba-desc',
            'ba-reviews',
            'bros-score',
            'bros-desc',
        ]

        FIELDS.extend([n for n, _ in STATS])
        FIELDS.extend([n for n, _ in INFO])
        if print_:
            print(chr(9).join(map(str, FIELDS)))
        FIELDS.append(self.STR_BEER_IMAGES)

        ROWS = []
        iROWS = set()
        beers = OrderedDict()
        beer_number = 0
        for b_url, b_name in self.beerPages:
            beers[b_name] = OrderedDict()
            #print(b_name, b_url.encode())
            for beer_url, beer_name in self.beerPages[(b_url, b_name)]:
                if not beer_url.endswith(chr(47)): continue
                #print(' '*4, beer_name, beer_url.encode())
                beerData = self.beerData[(beer_url, beer_name)]

                ba_score = beerData[self.STR_BA_SCORE]
                ba_score = ba_score.decode().replace(chr(9),
                                                     str()).split(chr(10))
                ba_score = ba_score[ba_score.index(self.STR_BA_SCORE) + 2:]
                ba_score_index = -1
                if ba_score[0].isdigit():
                    ba_score_index = int(ba_score[0])
                elif ba_score[0] == '-':
                    ba_score_index = 0
                review, = re.findall(r'[\d,.]+', ba_score[-1])

                the_bros = beerData[self.STR_THE_BROS]
                the_bros = the_bros.decode().replace(chr(9),
                                                     str()).split(chr(10))
                the_bros = the_bros[the_bros.index(self.STR_THE_BROS) + 2:]
                the_bros_index = -1
                if the_bros[0].isdigit():
                    the_bros_index = int(the_bros[0])
                elif the_bros[0] == '-':
                    the_bros_index = 0

                beer_stats = beerData[self.STR_BEER_STATS]
                beer_stats = beer_stats.decode().replace(chr(9),
                                                         str()).split(chr(10))
                beer_stats = beer_stats[beer_stats.index(self.STR_BEER_STATS) +
                                        3:]
                stats = OrderedDict(STATS)
                for name in stats:
                    if name in beer_stats:
                        value = beer_stats[beer_stats.index(name) + 1]
                        if not value:
                            value = beer_stats[beer_stats.index(name) + 2]
                        if value == 'NAN%':
                            continue
                        stats[name] = str2val(value)

                beer_info = beerData[self.STR_BEER_INFO]
                beer_info = beer_info.decode().replace(chr(9),
                                                       str()).split(chr(10))
                beer_info = beer_info[beer_info.index(self.STR_BEER_INFO) + 2:]

                info = OrderedDict(INFO)
                names = list(info.keys())

                #print(names)
                #print(beer_info)

                for f, n in zip(names, names[1:] + [None]):
                    s, e = -1, -1
                    for i, v in enumerate(beer_info):
                        if f in v:
                            s = i
                        if n and n in v:
                            e = i + 1
                            break
                    if s > -1 and e > -1:
                        d = chr(32).join(beer_info[s:e]).strip()
                        d = d[d.find(f) + len(f):d.find(n)].strip()
                    elif s > -1 and e == -1:
                        d = chr(32).join(beer_info[s:]).strip()
                        d = d[d.find(f) + len(f):].strip()
                    else:
                        continue
                    if d in ('No notes at this time.', 'not listed'):
                        continue
                    info[f] = unicode2ascii(d)

                row = [
                    b_name,
                    beer_name,
                    ba_score_index,
                    str2val(ba_score[1]),
                    str2val(review),
                    the_bros_index,
                    the_bros[1],
                ]
                row.extend(stats.values())
                row.extend(info.values())
                if print_:
                    print(chr(9).join(map(str, row)))
                id_ = hash(tuple(row))
                if id_ in iROWS:
                    continue
                iROWS.add(id_)
                row.append(beerData[self.STR_BEER_IMAGES])
                ROWS.append(row)
                beer_number += 1

        print('Beers:', beer_number)

        return FIELDS, ROWS
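
str2val is used above to coerce scraped strings into numbers but is not defined in this excerpt. A minimal sketch, assuming it handles values such as '1,234', '3.87' and '97%' and falls back to the original string otherwise:

def str2val(value):
    # Hypothetical converter: strip thousands separators and a trailing '%',
    # then try int, then float, and finally return the string unchanged.
    cleaned = value.replace(',', str()).rstrip('%')
    try:
        return int(cleaned)
    except ValueError:
        try:
            return float(cleaned)
        except ValueError:
            return value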
Code example #12
def collectCountryBrewers(url_base, countryurl, countryname, proxies):
    brewers = OrderedDict()
    prefixies = []

    try:
        req = requests.get(countryurl, proxies=proxies)
    except Exception as err:
        print('Error (CountryBrewers) request:')
        print(' ' * 4, countryurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (CountryBrewers) soup:')
            print(' ' * 4, countryurl.encode())
            print(' ' * 4, err)
        else:
            urls = []
            for a in soup.find_all(
                    TAG_A, href=re.compile(r'/place/list/[\w\W\d]+brewery=Y')):
                urls.append(url_base + a[ATTRIB_HREF])

            for a in soup.find_all(
                    TAG_A, href=re.compile(r'/place/list/\?city=[\w\W\d]+')):
                urls.append(url_base + a[ATTRIB_HREF])

            for pos, url in enumerate(urls, 1):
                print(pos)
                try:
                    req = requests.get(url, proxies=proxies)
                except Exception as err:
                    print('Error (CountryBrewers) request-1-{}:'.format(pos))
                    print(' ' * 4, url.encode())
                    print(' ' * 4, err)
                else:
                    try:
                        soup = BeautifulSoup(req.text, LXML)
                    except Exception as err:
                        print('Error (CountryBrewers) soup-1-{}:'.format(pos))
                        print(' ' * 4, url.encode())
                        print(' ' * 4, err)
                    else:
                        prefix_s = str()
                        prefix_e = str()

                        indexes = []
                        for a in soup.find_all(
                                TAG_A,
                                href=re.compile(r'/place/list/\?start=[\d]+')):
                            ids = re.findall(r'[\d]+', a[ATTRIB_HREF])
                            if ids:
                                prefix_s, prefix_e = a[ATTRIB_HREF].split(
                                    chr(61), 1)
                                prefix_e = prefix_e.lstrip(ids[0])
                                indexes.append(int(ids[0]))
                        if indexes:
                            minIndex = min(indexes)
                            maxIndex = max(indexes)
                            for a in soup.find_all(
                                    TAG_A, href=re.compile('/beer/profile')):
                                brewery_name = unicode2ascii(a.b.text)
                                brewery_url = url_base + a[ATTRIB_HREF]
                                brewers[brewery_url] = brewery_name
                            prefixies.append(
                                (minIndex, maxIndex, prefix_s, prefix_e))
                        else:
                            for a in soup.find_all(
                                    TAG_A, href=re.compile('/beer/profile')):
                                brewery_name = unicode2ascii(a.b.text)
                                brewery_url = url_base + a[ATTRIB_HREF]
                                brewers[brewery_url] = brewery_name

    for minIndex, maxIndex, prefix_s, prefix_e in prefixies:
        for pos, index in enumerate(range(minIndex, maxIndex + 20, 20)):
            url = url_base + prefix_s + '=' + str(index) + prefix_e
            try:
                req = requests.get(url, proxies=proxies)
            except Exception as err:
                print('Error (CountryBrewers) request-2-{}:'.format(pos))
                print(' ' * 4, url.encode())
                print(' ' * 4, err)
            else:
                try:
                    soup = BeautifulSoup(req.text, LXML)
                except Exception as err:
                    print('Error (CountryBrewers) soup-2-{}:'.format(pos))
                    print(' ' * 4, url.encode())
                    print(' ' * 4, err)
                else:
                    for a in soup.find_all(TAG_A,
                                           href=re.compile('/beer/profile')):
                        brewery_name = unicode2ascii(a.b.text)
                        brewery_url = url_base + a[ATTRIB_HREF]
                        brewers[brewery_url] = brewery_name

    return countryurl, countryname, brewers
Code example #13
def collectBeerPages(beerurl, beername, image_basepath, proxies):
    beerData = OrderedDict()
    beerData[BeerAdvocateGrabber.STR_BEER_IMAGES] = []
    beerData[BeerAdvocateGrabber.STR_BA_SCORE] = bytes()
    beerData[BeerAdvocateGrabber.STR_THE_BROS] = bytes()
    beerData[BeerAdvocateGrabber.STR_BEER_STATS] = bytes()
    beerData[BeerAdvocateGrabber.STR_BEER_INFO] = bytes()
    try:
        req = requests.get(beerurl, proxies=proxies)
    except Exception as err:
        print('Error (BeerPages) request:')
        print(' ' * 4, beerurl.encode())
        print(' ' * 4, err)
    else:
        try:
            soup = BeautifulSoup(req.content, LXML)
        except Exception as err:
            print('Error (BeerPages) soup:')
            print(' ' * 4, beerurl.encode())
            print(' ' * 4, err)
        else:
            # Capture the text block that follows each labelled section.
            for b in soup.find_all(TAG_B):
                text = unicode2ascii(b.text)
                if text in (BeerAdvocateGrabber.STR_BA_SCORE,
                            BeerAdvocateGrabber.STR_THE_BROS,
                            BeerAdvocateGrabber.STR_BEER_STATS,
                            BeerAdvocateGrabber.STR_BEER_INFO):
                    beerData[text] = b.parent.text.lstrip(text).encode()

            for img in soup.find_all(TAG_IMG):
                if not '/beers/' in img[ATTRIB_SRC]:
                    continue
                basename = os.path.basename(img[ATTRIB_SRC])
                if basename == 'c_beer_image.gif':
                    continue
                elif basename == 'placeholder-beer.jpg':
                    continue
                image_path = os.path.join(image_basepath, basename)
                if os.path.exists(image_path):
                    beerData[BeerAdvocateGrabber.STR_BEER_IMAGES].append(
                        (img[ATTRIB_SRC], image_path))
                else:
                    try:
                        req = requests.get(img[ATTRIB_SRC], proxies=proxies)
                    except Exception as err:
                        print('Error (BeerPages) BeerImage request:')
                        print(' ' * 4, img[ATTRIB_SRC].encode())
                        print(' ' * 4, err)
                    else:
                        try:
                            # Use a context manager so the file handle is closed promptly.
                            with open(image_path,
                                      ReadWriteFile.WRITE_BINARY_NEW) as out:
                                out.write(req.content)
                        except Exception as err:
                            print('Error (BeerPages) BeerImage save:')
                            print(' ' * 4, err)
                        else:
                            beerData[
                                BeerAdvocateGrabber.STR_BEER_IMAGES].append(
                                    (img[ATTRIB_SRC], image_path))
    return (beerurl, beername, beerData)
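
collectBeerPages returns (beerurl, beername, beerData), and code example #11 reads the same data back through self.beerData[(beer_url, beer_name)], so the result is presumably stored by an apply_async callback along these lines (the BEER_DATA store and the callback name are assumptions):

BEER_DATA = {}    # assumed module-level store; the original presumably fills the grabber's beerData dict

def resultBeerPages(result):
    # Hypothetical callback: file the scraped page data under the (url, name)
    # key that beerRows() in code example #11 looks up.
    beerurl, beername, data = result
    BEER_DATA[(beerurl, beername)] = data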