def _parseRestaurantPage(self, pool, region_name, city_name, restaurant_name, href):
    """Download one Zagat restaurant page and emit a restaurant Entity.

    Parameters:
        pool            -- crawler worker pool (unused here; kept for signature
                           parity with sibling page handlers)
        region_name     -- region the restaurant belongs to (used for logging)
        city_name       -- city the restaurant belongs to (used for logging)
        restaurant_name -- display name of the restaurant; becomes entity.title
        href            -- page URL to fetch (appended to self.base for zurl)

    On any download or layout failure the error is logged and the method
    returns without emitting anything; otherwise the built Entity is pushed
    onto self._output.
    """
    utils.log("[%s] parsing restaurant '%s.%s.%s' (%s)" % (self, region_name, city_name, restaurant_name, href))

    try:
        soup = utils.getSoup(href)
    except Exception:
        # narrow from a bare except so KeyboardInterrupt/SystemExit propagate
        utils.printException()
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    # parse the address for the current restaurant
    addr = soup.find('div', {'class' : 'address'})
    if addr is None:
        # page layout changed or page is malformed; previously this raised
        # AttributeError and killed the worker -- skip the page instead
        utils.log("[%s] error downloading page %s" % (self, href))
        return

    street  = addr.find('span', {'class' : 'street'}).getText().strip()
    geo     = addr.find('span', {'class' : 'geo'}).getText().strip()
    address = "%s, %s" % (street, geo)

    # add the current restaurant to the output for this crawler
    entity = Entity()
    entity.subcategory = "restaurant"
    entity.title   = restaurant_name
    entity.address = address
    entity.sources.zagat = {
        'zurl' : self.base + href,
    }

    # parse cuisine (first item of the sidebar block, when present)
    header = soup.find('div', {'id' : "block-zagat_restaurants-14"})
    if header is not None:
        header = header.find('ul').find('li', {'class' : 'first'})
        if header is not None:
            entity.cuisine = header.getText()

    # parse website
    site = soup.find('span', {'class' : 'website'})
    if site is not None:
        site = site.find('a')
        if site is not None:
            entity.site = site.get('href')

    # parse preview image
    img = soup.find('div', {'id' : 'content'}).find('div', {'class' : 'photo'})
    if img is not None:
        img = img.find('img')
        if img is not None:
            entity.image = img.get('src')

    self._output.put(entity)
def _parse_dump(self, filepath):
    """Stream-parse a gzipped XML product dump, emitting one book Entity
    per <product> element.

    Parameters:
        filepath -- path to a gzip-compressed XML dump file

    Honors Globals.options.offset (skip the first N products) and
    Globals.options.limit (stop after N emitted entities). Products with a
    missing/empty ISBN or a non-English description are skipped. Entities
    are pushed onto self._output; per-product parse errors are logged and
    the loop continues.
    """
    f = gzip.open(filepath, 'rb')
    try:
        context = iter(etree.iterparse(f, events=("start", "end")))
        event, root = context.next()

        offset = 0
        count  = 0

        # loop through XML and parse each product element as a book Entity
        for event, elem in context:
            if event == "end" and elem.tag == "product" and elem.get('product_id') is not None:
                root.clear()

                if offset < Globals.options.offset:
                    offset += 1
                    continue

                if Globals.options.limit and count >= Globals.options.limit:
                    break

                try:
                    entity = Entity()
                    entity.subcategory  = "book"
                    entity.title        = elem.get('name')
                    entity.bid          = int(elem.get('product_id'))
                    entity.sku_number   = elem.get('sku_number')
                    entity.image        = elem.find('.//productImage').text
                    entity.author       = elem.find('.//Author').text
                    entity.publisher    = elem.find('.//Publisher').text
                    entity.publish_date = elem.find('.//Publish_Date').text

                    isbn = elem.find('.//ISBN').text
                    if isbn is None or len(isbn) <= 0:
                        continue
                    entity.isbn = isbn

                    # crude language filter: 'nglish' matches English/english
                    # anywhere in the serialized description
                    desc = elem.find('description')
                    is_english = 'nglish' in etree.tostring(desc)
                    if not is_english:
                        continue

                    self._output.put(entity)
                    count += 1

                    # give the downstream consumer threads an occasional chance to work
                    if 0 == (count % 512):
                        time.sleep(0.1)

                    # prune already-processed siblings so the in-memory tree
                    # stays small while streaming (lxml idiom)
                    # NOTE(review): the 'continue' branches above skip this
                    # pruning, so skipped products accumulate until the next
                    # emitted one -- confirm whether that is acceptable
                    parent = elem.getparent()
                    while True:
                        prev = elem.getprevious()
                        if prev is None:
                            break
                        parent.remove(prev)
                    elem.clear()
                except Exception:
                    utils.printException()
    finally:
        # the original leaked the gzip handle; always close it
        f.close()
def _parse_series_page(self, name, url):
    """Scrape one TheTVDB series page and emit a TV-show Entity.

    Parameters:
        name -- series display name; placeholder/duplicate names are skipped
        url  -- series page URL (the thetvdb id is extracted from it)

    Parses title, description, banner image, airing details, cast, and
    season/air-date ranges; cast and season parsing are best-effort. The
    result is enriched from self._thetvdb.lookup() and pushed onto
    self._output.
    """
    # skip duplicate / placeholder series entries
    if '**' in name or 'DUPLICATE' in name or name.startswith('.hack'):
        return

    utils.log('[%s] parsing page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except Exception:
        # narrow from a bare except so KeyboardInterrupt/SystemExit propagate
        utils.printException()
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    contents = soup.findAll('div', {'id' : 'content'})
    header   = contents[0]
    h1       = header.find('h1')
    title    = h1.getText()
    h1.extract()

    entity = Entity()

    # parse basic show info
    entity.title       = title
    entity.subcategory = 'tv'

    desc = header.getText().replace('\r\n', '\n')
    if len(desc) > 5:
        entity.desc = desc

    entity.sources.thetvdb_id = self._id_re.match(url).groups()[0]

    # parse images; prefer posters over fanart over generic banners
    images = map(lambda img: img.get('src'), soup.findAll('img', {'class' : 'banner'}))
    types  = [ 'posters', 'fanart', 'graphical', ]

    for image_type in types:
        filtered_images = filter(lambda img: image_type in img, images)
        if len(filtered_images) > 0:
            entity.image = "%s%s" % (self.base, filtered_images[0])
            break

    info = contents[1].find('table').find('table')
    rows = info.findAll('tr')

    # parse detailed show info: row index in the info table -> entity field
    info_map = {
        0 : 'original_release_date',
        3 : 'air_time',
        4 : 'network_name',
        5 : 'genre',
    }

    for k, k2 in info_map.iteritems():
        try:
            value = rows[k].findAll('td')[1].getText()
            if len(value) > 0:
                entity[k2] = value
        except Exception:
            # missing rows/cells are expected for some shows; log and move on
            utils.printException()

    # parse cast (best-effort; any failure just leaves entity.cast unset)
    try:
        actors      = "%s%s" % (self.base, contents[-1].findAll('a')[-1].get('href'))
        actors_soup = utils.getSoup(actors)
        infotables  = actors_soup.findAll('table', {'class' : 'infotable'})
        cast        = []

        for infotable in infotables:
            text  = infotable.find('td').getText(separator='___')
            match = self._actor_re.match(text)
            if match is not None:
                groups = match.groups()
                cast.append('%s as %s' % (groups[0].strip(), groups[1].strip()))
                # TODO: record actor images

        if len(cast) > 0:
            entity.cast = ', '.join(cast)
    except Exception:
        pass

    # parse seasons
    try:
        seasons      = "%s%s" % (self.base, contents[2].findAll('a')[-1].get('href'))
        seasons_soup = utils.getSoup(seasons)
        rows = seasons_soup.find('table', {'id' : 'listtable'}).findAll('tr')[1:]

        highest_season = -1
        earliest       = None
        latest         = None

        # each row is an episode; loop through each episode, recording the
        # earliest and latest air date for the show overall and the number
        # of seasons the show ran for.
        for row in rows:
            tds     = row.findAll('td')
            episode = tds[0].getText()
            match   = self._season_re.match(episode)

            if match is not None:
                groups  = match.groups()
                season  = int(groups[0])
                episode = int(groups[1])

                if season > highest_season:
                    highest_season = season

            date  = tds[2].getText()
            match = self._date_re.match(date)

            if match is not None:
                year, month, day = match.groups()
                date = datetime(year=int(year), month=int(month), day=int(day))

                if earliest is None or date < earliest:
                    earliest = date
                if latest is None or date > latest:
                    latest = date

        if highest_season > 0:
            entity.num_seasons = highest_season
        if earliest is not None:
            entity.earliest_air_date = earliest
        if latest is not None:
            entity.latest_air_date = latest
    except Exception:
        utils.printException()

    # enrich with ratings / imdb id from the thetvdb API lookup
    entity2 = self._thetvdb.lookup(entity.sources.thetvdb_id)

    if entity2 is not None:
        if entity2.mpaa_rating is not None:
            entity.mpaa_rating = entity2.mpaa_rating
        if entity2.imdb_id is not None:
            entity.imdb_id = entity2.imdb_id

    self._output.put(entity)