def task_initial(self, grab, task):
    """Schedule one ``parse_page`` task per pagination page of the start URL.

    Generator task handler. If ``_check_body_errors`` flags the response,
    the failure is logged and nothing is scheduled. Any exception raised
    while reading the pagination or creating the tasks is handed to
    ``_process_error``.

    :param grab: response object; its ``doc`` is queried with XPath.
    :param task: task being handled; ``name``, ``url`` and
        ``task_try_count`` are read from it.
    :return: yields ``Task('parse_page', ...)`` objects with priority 90,
        one for every page number from 0 up to and including the value
        returned by ``get_max_page``.
    """
    self.logger.debug('[{}] Initial url: {}'.format(task.name, task.url))

    # A broken start page is not retried here: log it and stop.
    if self._check_body_errors(grab, task):
        self.logger.fatal('[{}] Err task with url {}, attempt {}'.format(
            task.name, task.url, task.task_try_count))
        return

    try:
        page_param = Config.get('SITE_PAGE_PARAM')

        # Pagination links are the anchors inside the "pagination" block
        # whose href mentions the configured page parameter.
        link_xpath = (
            '//div[contains(@class, "pagination")]//a[contains(@href, "{}")]'
            .format(page_param))
        page_links = grab.doc.select(link_xpath)
        last_page = get_max_page(page_links, 0, -1)
        self.logger.info('[{}] Task: {}, max_page: {}'.format(
            task.name, task.url, last_page))

        pager = UrlGenerator(task.url, page_param)
        for page_no in range(last_page + 1):
            yield Task('parse_page', url=pager.get_page(page_no), priority=90)
    except Exception as e:
        self._process_error(grab, task, e)

    self.logger.info('[{}] Tasks added...'.format(task.name))
def task_parse_items(self, grab, task):
    """Parse one listing page and collect its items into ``self.result``.

    Generator task handler that works in three stages:

    1. If ``_check_body_errors`` flags the response, the same URL is
       re-queued while ``task.task_try_count`` is below ``self.err_limit``
       (otherwise the skip is only logged), and the handler stops.
    2. Unless the task carries a truthy ``d_skip_page_check``, the
       pagination links are read and a ``parse_items`` task is yielded
       for every page from 2 up to the value given by ``get_max_page``.
    3. Every row of the ``cart_table`` is parsed. Rows whose count cell
       reads one of two fixed texts produce a single record with a
       constant count; all other rows produce one record per stock
       sub-row whose summed count is positive.

    Each record is a dict with the keys ``name``, ``count``, ``unit``,
    ``price`` and ``place``.

    :param grab: response object; its ``doc`` is queried with XPath.
    :param task: task being handled; ``name``, ``url``,
        ``task_try_count`` and the optional ``d_skip_page_check`` are read.
    :return: yields follow-up ``Task('parse_items', ...)`` objects.
    """
    self.logger.info('[{}] Start: {}'.format(task.name, task.url))

    if self._check_body_errors(grab, task):
        attempt = task.task_try_count
        if attempt < self.err_limit:
            self.logger.error(
                '[{}] Restart task with url {}, attempt {}'.format(
                    task.name, task.url, attempt))
            yield Task('parse_items', url=task.url, priority=105,
                       task_try_count=attempt + 1, raw=True)
            return
        self.logger.error(
            '[{}] Skip task with url {}, attempt {}'.format(
                task.name, task.url, attempt))
        return

    try:
        # Pagination is inspected only by the task that did not come from
        # this handler's own fan-out (those carry d_skip_page_check=True).
        if not task.get('d_skip_page_check'):
            page_param = Config.get('SITE_PAGE_PARAM')
            page_links = grab.doc.select(
                '//a[contains(@href, "{}")]'.format(page_param))
            last_page = get_max_page(page_links, 1)
            self.logger.info('[{}] Find max page: {}'.format(
                task.name, last_page))

            # The current response is handled below, so the fan-out
            # starts at page 2.
            pager = UrlGenerator(task.url, page_param)
            for page_no in range(2, last_page + 1):
                yield Task('parse_items', url=pager.get_page(page_no),
                           priority=100, d_skip_page_check=True, raw=True)

        rows = grab.doc.select(
            '//div[@class="cart_table"]/div/div/table/tbody/tr')
        for row_no, row in enumerate(rows):
            # NOTE(review): only IndexError is handled per row; any other
            # exception reaches the outer handler and ends the row loop.
            try:
                # Name
                name_node = row.select(
                    './td[1]//div[@class="description"]/div/a')
                item_name = name_node.text().strip()

                # Unit, with a default when the cell is empty
                unit = row.select('./td[2]').text().strip() or 'ед.'

                # Price: rows whose price text does not match are skipped
                price_raw = row.select(
                    './td[6]//meta[@itemprop="lowprice"]').attr('content')
                price_match = Ree.float.match(price_raw)
                if not price_match:
                    self.logger.warning(
                        '[{}] Skip item, because price is {} (line: {})'.
                        format(task.name, price_raw, row_no))
                    continue
                price = price_match.groupdict()['price'].replace(',', '.')

                # Count
                count_cell = row.select('./td[5]')
                count_text = count_cell.text().strip()

                # Fixed-text cells: one record with a constant count and
                # the default place.
                if count_text in ('распродано', 'под заказ'):
                    if count_text == 'распродано':
                        item_count = self.const_price_on_request
                    else:
                        item_count = self.const_stock_zero
                    item_place = self.const_default_place

                    self.logger.debug(
                        '[{}] Item added, index {} at url {}'.format(
                            task.name, row_no, task.url))
                    self.result.append({
                        'name': item_name,
                        'count': item_count,
                        'unit': unit,
                        'price': price,
                        'place': item_place
                    })
                    continue

                # Otherwise: one candidate record per stock sub-row.
                stock_rows = count_cell.select(
                    './/div[@class="layer_info"]/table/tbody/tr')
                for stock_row in stock_rows:
                    item_place = stock_row.select('./td[1]').text().strip()

                    # Sum the "stock" (td[1]) and "expo" (td[2]) cells,
                    # counting only texts accepted by Ree.float.
                    # NOTE(review): the stock value is read from the same
                    # td[1] cell as the place name - confirm the column
                    # layout of the source table.
                    item_count = 0
                    for cell_xpath in ('./td[1]', './td[2]'):
                        cell_text = stock_row.select(
                            cell_xpath).text().strip()
                        if Ree.float.match(cell_text):
                            item_count += float(cell_text)

                    if item_count > 0:
                        self.logger.debug(
                            '[{}] Item added, index {} at url {}'.
                            format(task.name, row_no, task.url))
                        self.result.append({
                            'name': item_name,
                            # 3.140 -> 3.14; 3.0 -> 3
                            'count': '{0:g}'.format(item_count),
                            'unit': unit,
                            'price': price,
                            'place': item_place
                        })
            except IndexError as e:
                # Presumably raised by the selector API when an expected
                # node is missing - TODO confirm. The row is skipped.
                self.logger.warning('[{}] Skip item: {}, {}'.format(
                    task.name, type(e).__name__, task.url))
    except Exception as e:
        self._process_error(grab, task, e)
    finally:
        self.logger.info('[{}] Finish: {}'.format(task.name, task.url))