def task_initial(self, grab, task):
        """Handle the initial page and fan out one 'parse_page' task per page.

        Logs the start URL and stops (with a fatal log entry) when
        ``_check_body_errors`` reports a problem.  Otherwise the pagination
        links are handed to ``get_max_page`` and a ``Task`` named
        'parse_page' (priority 90) is yielded for every page index from 0 up
        to the returned maximum, inclusive.  Any exception raised along the
        way is passed to ``_process_error``; the closing "Tasks added..."
        message is logged after the try block either way.
        """
        log = self.logger
        log.debug('[{}] Initial url: {}'.format(task.name, task.url))

        body_has_errors = self._check_body_errors(grab, task)
        if body_has_errors:
            failure_msg = '[{}] Err task with url {}, attempt {}'.format(
                task.name, task.url, task.task_try_count)
            log.fatal(failure_msg)
            return

        try:
            page_param = Config.get('SITE_PAGE_PARAM')

            # Pagination anchors whose href contains the page parameter.
            link_xpath = (
                '//div[contains(@class, "pagination")]//a[contains(@href, "{}")]'
                .format(page_param))
            page_links = grab.doc.select(link_xpath)
            last_page = get_max_page(page_links, 0, -1)

            log.info('[{}] Task: {}, max_page: {}'.format(
                task.name, task.url, last_page))

            pager = UrlGenerator(task.url, page_param)

            # Page indices start at 0 here; the upper bound is inclusive.
            for page_no in range(0, last_page + 1):
                yield Task('parse_page',
                           url=pager.get_page(page_no),
                           priority=90)

        except Exception as err:
            self._process_error(grab, task, err)

        log.info('[{}] Tasks added...'.format(task.name))
Exemple #2
0
    def task_parse_items(self, grab, task):
        """Parse one listing page into ``self.result`` and queue further pages.

        When ``_check_body_errors`` reports a problem the task is re-queued
        (priority 105, ``task_try_count`` + 1) while the attempt counter is
        below ``self.err_limit``; otherwise it is skipped.  In both cases
        nothing is parsed.

        On a good page, unless the task carries ``d_skip_page_check``, the
        pagination links are inspected and a 'parse_items' task is yielded for
        every page from 2 up to the detected maximum.  Then each table row is
        turned into zero or more dicts with the keys ``name``, ``count``,
        ``unit``, ``price`` and ``place`` appended to ``self.result``.

        A row raising ``IndexError`` is logged and skipped; any other
        exception is passed to ``_process_error``.  "Finish" is always logged
        once the try block has been entered.
        """
        log = self.logger
        log.info('[{}] Start: {}'.format(task.name, task.url))

        if self._check_body_errors(grab, task):
            can_retry = task.task_try_count < self.err_limit
            if not can_retry:
                log.error(
                    '[{}] Skip task with url {}, attempt {}'.format(
                        task.name, task.url, task.task_try_count))
                return

            log.error(
                '[{}] Restart task with url {}, attempt {}'.format(
                    task.name, task.url, task.task_try_count))
            yield Task('parse_items',
                       url=task.url,
                       priority=105,
                       task_try_count=task.task_try_count + 1,
                       raw=True)
            return

        def store(row_index, name, count, unit, price, place):
            # Single place that logs and records one parsed offer.
            log.debug(
                '[{}] Item added, index {} at url {}'.format(
                    task.name, row_index, task.url))
            self.result.append({
                'name': name,
                'count': count,
                'unit': unit,
                'price': price,
                'place': place
            })

        try:
            # Pagination is inspected only by tasks that were not spawned
            # from another page's pagination scan.
            if not task.get('d_skip_page_check'):
                page_param = Config.get('SITE_PAGE_PARAM')
                page_links = grab.doc.select(
                    '//a[contains(@href, "{}")]'.format(page_param))
                last_page = get_max_page(page_links, 1)
                log.info('[{}] Find max page: {}'.format(
                    task.name, last_page))

                pager = UrlGenerator(task.url, page_param)

                # The current response is handled below, so start at page 2.
                for page_no in range(2, last_page + 1):
                    yield Task('parse_items',
                               url=pager.get_page(page_no),
                               priority=100,
                               d_skip_page_check=True,
                               raw=True)

            table_rows = grab.doc.select(
                '//div[@class="cart_table"]/div/div/table/tbody/tr')

            for index, row in enumerate(table_rows):
                try:
                    # Name
                    name_link = row.select(
                        './td[1]//div[@class="description"]/div/a')
                    item_name = name_link.text().strip()

                    # Unit, with a fallback label when the cell is empty
                    unit = row.select('./td[2]').text().strip() or 'ед.'

                    # Price
                    price_raw = row.select(
                        './td[6]//meta[@itemprop="lowprice"]').attr('content')
                    price_match = Ree.float.match(price_raw)
                    if not price_match:
                        log.warning(
                            '[{}] Skip item, because price is {} (line: {})'.
                            format(task.name, price_raw, index))
                        continue

                    # Normalise a decimal comma to a dot.
                    price = price_match.groupdict()['price'].replace(',', '.')

                    # Availability cell
                    count_cell = row.select('./td[5]')
                    count_text = count_cell.text().strip()

                    # Cases 1 and 2: the cell holds a fixed text marker.
                    # NOTE(review): 'распродано' (sold out) maps to
                    # const_price_on_request and 'под заказ' (to order) maps
                    # to const_stock_zero - confirm this pairing is intended.
                    is_sold_out = count_text == 'распродано'
                    is_to_order = count_text == 'под заказ'
                    if is_sold_out or is_to_order:
                        if is_sold_out:
                            item_count = self.const_price_on_request
                        else:
                            item_count = self.const_stock_zero
                        item_place = self.const_default_place
                        store(index, item_name, item_count, unit, price,
                              item_place)
                        continue

                    # Case 3: a nested table with one row per place.
                    place_rows = count_cell.select(
                        './/div[@class="layer_info"]/table/tbody/tr')

                    for place_row in place_rows:
                        item_place = place_row.select(
                            './td[1]').text().strip()
                        item_count = 0

                        # Sum the "stock" and "expo" quantity cells.
                        # NOTE(review): the stock value is read from the same
                        # './td[1]' cell as the place name - confirm the
                        # column index.
                        # NOTE(review): float() gets the raw cell text, not
                        # the regex group used for the price - verify that
                        # every text accepted by Ree.float is a valid float.
                        for qty_xpath in ('./td[1]', './td[2]'):
                            qty_text = place_row.select(
                                qty_xpath).text().strip()
                            if Ree.float.match(qty_text):
                                item_count += float(qty_text)

                        if item_count > 0:
                            # 3.140 -> 3.14; 3.0 -> 3
                            store(index, item_name,
                                  '{0:g}'.format(item_count), unit, price,
                                  item_place)

                except IndexError as err:
                    log.warning('[{}] Skip item: {}, {}'.format(
                        task.name,
                        type(err).__name__, task.url))

        except Exception as err:
            self._process_error(grab, task, err)

        finally:
            log.info('[{}] Finish: {}'.format(task.name, task.url))