def _parseListPage(self, pool, queue, url, name, base=False):
    """Scrape one list page and emit a book Entity for each new result.

    The page at ``url`` is fetched with ``utils.getSoup``. Every
    ``<td class="summary">`` cell is turned into an ``Entity`` whose title
    comes from the cell's ``<span class="bookName">`` and whose author,
    publisher and description come from the first three groups of
    ``self.details_re``. Entities are de-duplicated on ``(title, author)``
    through ``self.seen`` and pushed to ``self._output``.

    Args:
        pool: Not used by this method.
        queue: Not used by this method.
        url: Address of the list page to download.
        name: Label for the page; only used in log messages.
        base: Not used by this method.

    Returns:
        None. If the download fails the error is logged and nothing is
        emitted.
    """
    utils.log('[%s] parsing list page %s (%s)' % (self, name, url))

    try:
        soup = utils.getSoup(url)
    except Exception:
        # Best-effort crawl: a page that cannot be fetched is logged and
        # skipped. Only Exception is caught so that KeyboardInterrupt and
        # SystemExit still propagate.
        utils.log("[%s] error downloading page %s (%s)" % (self, name, url))
        return

    for result in soup.findAll('td', {'class' : 'summary'}):
        title_span = result.find('span', {'class' : 'bookName'})
        if title_span is None:
            # Without a title there is nothing to key the entity on; skip
            # this cell instead of aborting the rest of the page.
            utils.log("[%s] skipping result without a book name on page %s (%s)" % (self, name, url))
            continue

        title = title_span.getText().strip().title()
        if title.endswith(','):
            title = title[:-1]

        entity = Entity()
        entity.subcategory = "book"
        entity.nytimes = {}
        entity.title = title

        # The cell text is flattened with '___' between text nodes before
        # being matched; presumably details_re is written against that
        # separator -- confirm where details_re is defined.
        details = result.getText(separator='___')
        details_match = self.details_re.match(details)

        if details_match:
            author, publisher, desc = details_match.groups()[:3]
            entity.author = author
            entity.publisher = publisher
            entity.desc = desc

        # NOTE(review): when the details did not match, entity.author is
        # whatever Entity yields for an attribute that was never set.
        key = (entity.title, entity.author)
        if key in self.seen:
            continue

        self.seen.add(key)
        self._output.put(entity)
def _parse_dump(self, filepath):
    """Stream a gzipped XML product dump and emit book Entities.

    The file is parsed incrementally with ``etree.iterparse``. Each
    completed ``<product>`` element that has a ``product_id`` attribute is
    converted into an ``Entity`` and pushed to ``self._output``. Products
    with no ISBN text, or whose serialized ``<description>`` does not
    contain ``'nglish'``, are skipped. An exception raised while converting
    a product is printed with ``utils.printException`` and parsing moves on
    to the next product.

    ``Globals.options.offset`` products are skipped at the start. If
    ``Globals.options.limit`` is truthy, parsing stops once that many
    entities have been emitted.

    Args:
        filepath: Path of the gzip-compressed XML dump.

    Returns:
        None. Results are delivered through ``self._output``.
    """
    f = gzip.open(filepath, 'rb')
    try:
        context = iter(etree.iterparse(f, events=("start", "end")))

        # The element of the first event is kept as the root so that its
        # already-processed children can be dropped as parsing proceeds.
        event, root = next(context)
        offset = 0
        count = 0

        # loop through XML and parse each product element as a book Entity
        for event, elem in context:
            if event != "end" or elem.tag != "product" or elem.get('product_id') is None:
                continue

            root.clear()

            if offset < Globals.options.offset:
                offset += 1
                continue

            if Globals.options.limit and count >= Globals.options.limit:
                break

            try:
                entity = Entity()
                entity.subcategory = "book"

                entity.title = elem.get('name')
                entity.bid = int(elem.get('product_id'))
                entity.sku_number = elem.get('sku_number')
                entity.image = elem.find('.//productImage').text
                entity.author = elem.find('.//Author').text
                entity.publisher = elem.find('.//Publisher').text
                entity.publish_date = elem.find('.//Publish_Date').text

                isbn = elem.find('.//ISBN').text
                if not isbn:
                    continue
                entity.isbn = isbn

                # Substring test against the serialized element, so both
                # "English" and "english" match. Presumably the description
                # carries a language marker -- confirm against the feed.
                desc = elem.find('description')
                if 'nglish' not in etree.tostring(desc):
                    continue

                self._output.put(entity)
                count += 1

                # give the downstream consumer threads an occasional chance to work
                if 0 == (count % 512):
                    time.sleep(0.1)
            except Exception:
                # Best-effort import: a malformed product (e.g. a missing
                # child element) is reported and skipped.
                utils.printException()
            finally:
                # Release the processed element and any earlier siblings so
                # the partially-built tree does not keep growing. This runs
                # for skipped and failed products as well as emitted ones.
                parent = elem.getparent()
                prev = elem.getprevious()
                while parent is not None and prev is not None:
                    parent.remove(prev)
                    prev = elem.getprevious()
                elem.clear()
    finally:
        # Close the dump however parsing ends: exhausted input, the limit
        # break, or an error raised by the parser itself.
        f.close()