예제 #1
0
    def parse_list_page_common(self, response):
        """
        Generic list-page parser.

        Configuration is read from ``response.meta``:

        * ``crawl_key``  - key of the crawl task; passed to
          ``crawl_helper.should_continue_page_parse`` and used in log lines.
        * ``page_index`` - index of this list page; used in log lines only.
        * ``param``      - dict that must contain ``xpath_of_list``,
          ``xpath_of_detail_url`` and ``item_parse_class``, and may contain
          ``requests_type``.

        When ``param['requests_type'] == "dict"`` the response text is parsed
        as JSON: ``xpath_of_list`` is a sequence of keys walked one by one to
        reach the collection of entries, and ``xpath_of_detail_url`` is the
        key of each entry that holds the detail path. For any other value
        (or when ``requests_type`` is absent) both settings are used as XPath
        expressions on the response / on each list node.

        NOTE: in this variant the status-table update
        (``store_crawl_info_2_db``), the per-item duplicate check
        (``should_continue_item_parse``), the item counter
        (``increase_total_item_num``) and ``yield item`` are all disabled;
        every item is built and logged only.

        :param response: list-page response carrying the meta described above
        :return: None
        """

        assert 'crawl_key' in response.meta
        assert 'page_index' in response.meta
        assert 'param' in response.meta
        assert 'xpath_of_list' in response.meta['param']
        assert 'xpath_of_detail_url' in response.meta['param']
        assert 'item_parse_class' in response.meta['param']

        list_page_content_md5 = hashlib.md5(response.body).hexdigest()
        logging.info(
            'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
            .format(response.meta['page_index'], response.url, response.status,
                    list_page_content_md5))

        logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))

        crawl_key = response.meta['crawl_key']

        # Let the helper decide (from the response and the body digest)
        # whether this page should be parsed at all.
        if not self.crawl_helper.should_continue_page_parse(
                response, crawl_key, list_page_content_md5):
            return

        param = response.meta['param']
        detail_url_key = param['xpath_of_detail_url']

        # 'requests_type' is optional: it is not part of the asserted
        # contract above, so a missing key selects the XPath branch instead
        # of raising KeyError.
        is_json = param.get('requests_type') == "dict"
        if is_json:
            entries = json.loads(response.text)
            # Walk the configured key path down to the collection of entries.
            for key in param['xpath_of_list']:
                entries = entries[key]
        else:
            entries = response.xpath(param['xpath_of_list'])

        for _item_idx, selector in enumerate(entries, start=1):
            _detail_url = ''
            try:
                if is_json:
                    # The entry holds a site-relative path such as
                    # /jyxx/002004/002004003/20200220/<uuid>.html
                    # Strip its leading slash so the result is
                    # http://ggzyjy.sc.gov.cn/jyxx/... and not
                    # http://ggzyjy.sc.gov.cn//jyxx/...
                    _detail_url = 'http://ggzyjy.sc.gov.cn/' + str(
                        selector[detail_url_key]).lstrip('/')
                else:
                    # The extracted attribute embeds the record id between
                    # single quotes; take the first quoted segment and drop
                    # any literal "\r\n" escape text left inside it.
                    _url_id = selector.xpath(detail_url_key).extract_first()
                    _url_id = _url_id.split("'")[1].replace('\\r\\n', '')
                    _detail_url = 'http://ec.ccccltd.cn/PMS/biddetail.shtml?id=' + str(
                        _url_id)

                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Build the crawled item from this entry.
                item_parser = param['item_parse_class'](selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=param)

                # Random blocking pause of 50-100 ms between items.
                time.sleep(random.randint(50, 100) / 1000.0)

                logging.info('item is: {}'.format(item))

            except Exception:
                # Best effort: one malformed entry must not abort the page.
                logging.exception('Handle [{}] failed'.format(_detail_url))
예제 #2
0
    def parse_list_page_common(self, response):
        """
        Generic list-page parser (generator).

        Expects ``response.meta`` to carry ``crawl_key``, ``page_index`` and
        a ``param`` dict holding ``xpath_of_list``, ``xpath_of_detail_url``
        and ``item_parse_class``.

        The crawl is first marked ``'active'`` through the crawl helper, then
        the helper is asked whether the page should be parsed. For every node
        matched by ``xpath_of_list`` the detail URL is extracted (spaces
        removed), a unique id is derived from it, and the helper decides
        whether to stop the whole page, skip the node, or go on. Surviving
        nodes are turned into items by ``item_parse_class`` and yielded.

        :param response: list-page response carrying the meta described above
        :return: generator of crawled items
        """

        assert 'crawl_key' in response.meta
        assert 'page_index' in response.meta
        assert 'param' in response.meta
        assert 'xpath_of_list' in response.meta['param']
        assert 'xpath_of_detail_url' in response.meta['param']
        assert 'item_parse_class' in response.meta['param']

        body_digest = hashlib.md5(response.body).hexdigest()
        page_message = (
            'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
            .format(response.meta['page_index'], response.url,
                    response.status, body_digest))
        logging.info(page_message)

        logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))

        crawl_key = response.meta['crawl_key']

        # Record in the status table that this crawl is running.
        self.crawl_helper.store_crawl_info_2_db(crawl_key, 'active')

        proceed = self.crawl_helper.should_continue_page_parse(
            response, crawl_key, body_digest)
        if not proceed:
            return

        settings = response.meta['param']
        rows = response.xpath(settings['xpath_of_list'])
        for position, row in enumerate(rows, start=1):
            detail_url = ''
            try:
                raw_url = row.xpath(
                    settings['xpath_of_detail_url']).extract_first()
                detail_url = raw_url.replace(" ", "")
                unique_id = JyScrapyUtil.get_unique_id(detail_url)

                progress = 'Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, position, response.meta['page_index'])
                logging.info(progress)

                # Ask the helper whether this record is already stored:
                # the first flag ends the whole page, the second skips
                # only the current node.
                stop_page, skip_row = \
                    self.crawl_helper.should_continue_item_parse(
                        crawl_key, unique_id)
                if stop_page:
                    return
                if skip_row:
                    continue

                # Build the crawled item from this node.
                parser = settings['item_parse_class'](row)
                item = parser.get_common_raw_item(
                    _id=unique_id,
                    detail_url=detail_url,
                    site=self.site,
                    ext_param=settings)

                # Random blocking pause of 50-100 ms between items.
                pause_ms = random.randint(50, 100)
                time.sleep(pause_ms / 1000.0)

                # Bump the stored count of crawled items for this crawl.
                self.crawl_helper.increase_total_item_num(crawl_key)

                logging.info('item is: {}'.format(item))
                yield item

            except Exception:
                logging.exception('Handle [{}] failed'.format(detail_url))
예제 #3
0
    def parse_list_page_common(self, response):
        """
        Generic list-page parser.

        Configuration is read from ``response.meta``:

        * ``crawl_key``  - key of the crawl task; passed to
          ``crawl_helper.should_continue_page_parse`` and used in log lines.
        * ``page_index`` - index of this list page; used in log lines only.
        * ``param``      - dict that must contain ``xpath_of_list``,
          ``xpath_of_detail_url`` and ``item_parse_class``, and may contain
          ``requests_type``.

        When ``param['requests_type'] == "dict"`` the response text is parsed
        as JSON: ``xpath_of_list`` is a sequence of keys walked one by one to
        reach the collection of entries, and ``xpath_of_detail_url`` is the
        key of each entry that holds the detail link. For any other value
        (or when ``requests_type`` is absent) both settings are used as XPath
        expressions. In both modes the link is resolved with
        ``response.urljoin``.

        NOTE: in this variant the status-table update
        (``store_crawl_info_2_db``), the per-item duplicate check
        (``should_continue_item_parse``), the item counter
        (``increase_total_item_num``) and ``yield item`` are all disabled;
        every item is built and logged only.

        :param response: list-page response carrying the meta described above
        :return: None
        """

        assert 'crawl_key' in response.meta
        assert 'page_index' in response.meta
        assert 'param' in response.meta
        assert 'xpath_of_list' in response.meta['param']
        assert 'xpath_of_detail_url' in response.meta['param']
        assert 'item_parse_class' in response.meta['param']

        list_page_content_md5 = hashlib.md5(response.body).hexdigest()
        logging.info(
            'Get page list url, page:[{}], url:[{}], status:[{}], body md5:[{}]'
            .format(response.meta['page_index'], response.url, response.status,
                    list_page_content_md5))

        logging.info('Crawl info: {}'.format(self.crawl_helper.crawl_info))

        crawl_key = response.meta['crawl_key']

        # Let the helper decide (from the response and the body digest)
        # whether this page should be parsed at all.
        if not self.crawl_helper.should_continue_page_parse(
                response, crawl_key, list_page_content_md5):
            return

        param = response.meta['param']
        detail_url_key = param['xpath_of_detail_url']

        # 'requests_type' is optional: it is not part of the asserted
        # contract above, so a missing key selects the XPath branch instead
        # of raising KeyError.
        is_json = param.get('requests_type') == "dict"
        if is_json:
            entries = json.loads(response.text)
            # Walk the configured key path down to the collection of entries.
            for key in param['xpath_of_list']:
                entries = entries[key]
        else:
            entries = response.xpath(param['xpath_of_list'])

        for _item_idx, selector in enumerate(entries, start=1):
            _detail_url = ''
            try:
                if is_json:
                    _href = selector[detail_url_key]
                else:
                    _href = selector.xpath(detail_url_key).extract_first()
                    if _href is None:
                        # Joining None would silently resolve to the list
                        # page's own URL and produce a bogus item; reject
                        # the entry so it is logged and skipped instead.
                        raise ValueError(
                            'no detail url matched by [{}]'.format(
                                detail_url_key))
                _detail_url = response.urljoin(_href)
                _unq_id = JyScrapyUtil.get_unique_id(_detail_url)

                logging.info('Parse item, [{}]-[{}/{}]'.format(
                    crawl_key, _item_idx, response.meta['page_index']))

                # Build the crawled item from this entry.
                item_parser = param['item_parse_class'](selector)
                item = item_parser.get_common_raw_item(
                    _id=_unq_id,
                    detail_url=_detail_url,
                    site=self.site,
                    ext_param=param)

                # Random blocking pause of 50-100 ms between items.
                time.sleep(random.randint(50, 100) / 1000.0)

                logging.info('item is: {}'.format(item))

            except Exception:
                # Best effort: one malformed entry must not abort the page.
                logging.exception('Handle [{}] failed'.format(_detail_url))