def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    """Parse a single restaurant page and emit an Entity for it.

    Downloads the page at ``href``, extracts the street address and, when
    present, the cuisine, website and preview image, then puts the
    resulting Entity on ``self._output``.

    Nothing is emitted (and None is returned) when the page cannot be
    downloaded or when no address can be extracted from it. The optional
    fields are simply left unset when their markup is missing.

    Args:
        pool: Unused by this method.
        region_name: Region name; only used in the log message.
        city_name: City name; only used in the log message.
        restaurant_name: Used as the entity title.
        href: Location of the restaurant page, passed to utils.getSoup.
    """
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))

    try:
        soup = utils.getSoup(href)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt /
        # SystemExit still propagate to whoever is running the crawler.
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    # parse the address for the current restaurant; every lookup may come
    # back as None if the page layout differs, so guard each step
    addr = soup.find('div', {'class' : 'address'})
    street = None
    geo = None
    if addr is not None:
        street = addr.find('span', {'class' : 'street'})
        geo = addr.find('span', {'class' : 'geo'})

    if street is None or geo is None:
        utils.log("[%s] error parsing address on page %s" % (self, href))
        return

    address = "%s, %s" % (street.getText().strip(), geo.getText().strip())

    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title = restaurant_name
    entity.address = address
    # NOTE(review): the page is fetched from `href` as-is but recorded as
    # self.base + href -- confirm whether href is relative or absolute.
    entity.sources.zagat = {
        'zurl' : self.base + href,
    }

    # parse cuisine (optional): first <li> of the list inside the block
    cuisine = soup.find('div', {'id' : "block-zagat_restaurants-14"})
    if cuisine is not None:
        cuisine = cuisine.find('ul')
    if cuisine is not None:
        cuisine = cuisine.find('li', {'class' : 'first'})
    if cuisine is not None:
        entity.cuisine = cuisine.getText()

    # parse website (optional)
    site = soup.find('span', {'class' : 'website'})
    if site is not None:
        site = site.find('a')
    if site is not None:
        entity.site = site.get('href')

    # parse preview image (optional)
    img = soup.find('div', {'id' : 'content'})
    if img is not None:
        img = img.find('div', {'class' : 'photo'})
    if img is not None:
        img = img.find('img')
    if img is not None:
        entity.image = img.get('src')

    self._output.put(entity)
def _parseRestaurantPage(self, pool, queue, url, name, base=False):
    """Parse a single restaurant page and emit a de-duplicated Entity.

    Downloads the page at ``url`` and reads the title, description,
    address and (when present) site and hours of operation out of the
    ``content`` / ``edbtext`` sections. The Entity is put on
    ``self._output`` unless its title contains "(closed)" or its
    (title, address) pair is already in ``self.seen``.

    Nothing is emitted (and None is returned) when the page cannot be
    downloaded or when the title or address markup is missing.

    Args:
        pool: Unused by this method.
        queue: Unused by this method.
        url: Location of the restaurant page, passed to utils.getSoup.
        name: Restaurant name; only used in log messages.
        base: Unused by this method.
    """
    utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except Exception:
        # Exception rather than a bare except so that KeyboardInterrupt /
        # SystemExit still propagate to whoever is running the crawler.
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    content = soup.find('div', { 'id' : 'content'})
    if content is None:
        return

    heading = content.find('h1')
    if heading is None:
        utils.log("[%s] error parsing title on page %s (%s)" % (self, name, url))
        return

    title = heading.getText()
    # closed restaurants are never emitted, so skip the rest of the parse
    if '(closed)' in title.lower():
        return

    details = content.find('div', {'id' : 'edbtext'})
    if details is None:
        utils.log("[%s] error parsing details on page %s (%s)" % (self, name, url))
        return

    entity = Entity()
    entity.title = title
    entity.subcategory = "restaurant"
    # NOTE(review): presumably a placeholder for source-specific data;
    # confirm the expected shape against the Entity definition.
    entity.seattletimes = {}

    # description (optional): text of the first paragraph in the details
    summary = details.find('p')
    if summary is not None:
        entity.desc = summary.getText()

    # the 'list' paragraphs are read positionally: address, site, hours
    fields = details.findAll('p', {'class' : 'list'})
    if not fields:
        utils.log("[%s] error parsing address on page %s (%s)" % (self, name, url))
        return

    # flatten the address markup: drop line breaks, collapse whitespace
    address = fields[0].renderContents().strip().replace('<br />', '')
    address = re.sub('[ \n\t]+', ' ', address)
    entity.address = address

    if len(fields) > 1:
        # NOTE(review): this reads href off the <p> itself; if the link
        # lives in a nested <a>, this never sets entity.site -- confirm
        # against the live markup before changing.
        site = fields[1].get('href')
        if site is not None:
            entity.site = site

    if len(fields) > 2:
        hoursOfOperation = fields[2].getText()
        if hoursOfOperation is not None:
            entity.hoursOfOperation = hoursOfOperation

    # de-duplicate on (title, address) across pages seen by this crawler
    key = (entity.title, entity.address)
    if key in self.seen:
        return

    self.seen.add(key)
    self._output.put(entity)