コード例 #1
0
ファイル: ZagatCrawler.py プロジェクト: Stamped/Stamped
 def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
     utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))
     
     try:
         soup = utils.getSoup(href)
     except:
         utils.printException()
         utils.log("[%s] error downloading page %s" % (self, href))
         return
     
     # parse the address for the current restaurant
     addr   = soup.find('div', {'class' : 'address'})
     street = addr.find('span', {'class' : 'street'}).getText().strip()
     geo    = addr.find('span', {'class' : 'geo'}).getText().strip()
     
     address = "%s, %s" % (street, geo)
     
     # add the current restaurant to the output for this crawler
     entity = Entity()
     entity.subcategory = "restaurant"
     entity.title   = restaurant_name
     entity.address = address
     entity.sources.zagat = {
         'zurl' : self.base + href, 
     }
     
     #self._globals['soup'] = soup
     # parse cuisine
     header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
     if header is not None:
         header = header.find('ul').find('li', {'class' : 'first'})
         
         if header is not None:
             entity.cuisine = header.getText()
     
     # parse website
     site = soup.find('span', {'class' : 'website'})
     if site is not None:
         site = site.find('a')
         
         if site is not None:
             entity.site = site.get('href')
     
     # parse preview image
     img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
     if img is not None:
         img = img.find('img')
         
         if img is not None:
             entity.image = img.get('src')
     
     self._output.put(entity)
コード例 #2
0
 def _parseRestaurantPage(self, pool, queue, url, name, base=False):
     utils.log('[%s] parsing restaurant page %s (%s)' % (self, name, url))
     
     try:
         soup = utils.getSoup(url)
     except:
         #utils.printException()
         utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
         return
     
     content = soup.find('div', { 'id' : 'content'})
     
     if content is None:
         return
     
     entity = Entity()
     entity.title = content.find('h1').getText()
     entity.subcategory = "restaurant"
     entity.seattletimes = {}
     
     details = content.find('div', {'id' : 'edbtext'})
     desc    = details.find('p').getText()
     if desc is not None:
         entity.desc = desc
     
     details = details.findAll('p', {'class' : 'list'})
     address = details[0].renderContents().strip().replace('<br />', '')
     address = re.sub('[ \n\t]+', ' ', address)
     entity.address = address
     
     if len(details) > 1:
         site = details[1].get('href')
         if site is not None:
             entity.site = site
     
     if len(details) > 2:
         hoursOfOperation = details[2].getText()
         if hoursOfOperation is not None:
             entity.hoursOfOperation = hoursOfOperation
     
     key = (entity.title, entity.address)
     if key in self.seen or '(closed)' in entity.title.lower():
         return
     
     self.seen.add(key)
     self._output.put(entity)