Beispiel #1
0
    def _parse_detail_page(self, item):
        url = item.get('url')
        if not url:
            if 'sc' not in item:
                item['sc'] = ''
            item['issue_time'] = ''
            item['title'] = ''
            return

        detail_pattern = item['_patterns']['detail_pattern']
        logger.info('parse detail: %s', url)

        try:
            res = downloader.get(url, headers=headers)
            if res.status_code == 404:
                logger.debug('skip 404 url: %s', item['url'])
                del item['url']
                return
            page = res.content
            encoding = chardet.detect(page)['encoding']
            if encoding == 'GB2312':
                encoding = 'gb18030'
            elif encoding is None:
                encoding = 'utf-8'
            page = page.decode(encoding, 'ignore')

            for key, pattern in detail_pattern.items():
                _parser = build_parser(pattern)
                _parser.parse(page)

                item[key] = _parser.result
        except Exception as e:
            logger.error("parse detail page error: %s", url, exc_info=True)
            raise
Beispiel #2
0
 def test_multi_xpath(self):
     # The class="sister" anchor XPath is expected to produce three results.
     spec = dict(
         pattern='//a[@class="sister"]',
         type='xpath',
         target='html',
     )
     matcher = parser.build_parser(spec)
     matcher.parse(html_doc)
     match_count = len(matcher.result)
     self.assertEqual(match_count, 3)
Beispiel #3
0
 def test_xpath_text(self):
     # With target 'text', the title paragraph XPath should give its text.
     expected = 'The Dormouse\'s story'
     spec = dict(
         pattern='//p[@class="title"]',
         type='xpath',
         target='text',
     )
     matcher = parser.build_parser(spec)
     matcher.parse(html_doc)
     self.assertEqual(matcher.result, expected)
Beispiel #4
0
 def test_xpath_html(self):
     # With target 'html', the title paragraph XPath should give its markup.
     expected = '<p class="title"><b>The Dormouse\'s story</b></p>'
     spec = dict(
         pattern='//p[@class="title"]',
         type='xpath',
         target='html',
     )
     matcher = parser.build_parser(spec)
     matcher.parse(html_doc)
     self.assertEqual(matcher.result, expected)
Beispiel #5
0
    def process_page(self, item_, base_url, page, index_pattern):
        """Process one list page and every entry found on it.

        The ``'_list'`` pattern of ``index_pattern`` is parsed against
        ``page``. A hash of the parsed result is compared with the hash of
        the previous call; when both are equal the page is treated as a
        repeat (probably the last page), ``self._click_next`` is set to
        ``False`` and nothing else happens.

        Otherwise, for each element in the list parser's ``source``:

        * a deep copy of ``item_`` is made and given the entry's ``'url'``;
        * every other pattern in ``index_pattern`` (except ``'_list'`` and
          ``'_next_page'``) is parsed and stored on the copy; patterns
          starting with ``'/html'`` are run against the whole page, all
          others against the list element;
        * the module's ``process_list_item`` hook runs, then the detail page
          is parsed;
        * if the item still has a truthy ``'url'``, the detail hooks run and
          the item is saved;
        * ``self._click_next`` is updated from the item's ``'_click_next'``
          (defaulting to ``True``).

        Args:
            item_: Template item; it is deep-copied per entry, never mutated.
            base_url: Passed to ``self._get_item_link`` to build entry URLs.
            page: Content of the list page.
            index_pattern: Mapping of field name to parser pattern.
        """
        list_parser = build_parser(index_pattern['_list'])
        list_parser.parse(page)

        # hashlib only accepts bytes on Python 3, while json.dumps returns
        # text there. With the default ensure_ascii=True the output is pure
        # ASCII, so encoding it does not change the digest on Python 2.
        serialized = json.dumps(list_parser.result)
        if not isinstance(serialized, bytes):
            serialized = serialized.encode('utf-8')
        cur_li_hash = hashlib.md5(serialized).hexdigest()

        if cur_li_hash == self._last_li_hash:
            # Two consecutive requests returned identical list entries;
            # we are probably on the last page, so stop paginating.
            logger.debug('break click loop')
            self._click_next = False
            return
        self._last_li_hash = cur_li_hash

        # Handle the detail link of every entry on the list page.
        for element in list_parser.source:
            # Start each entry from a fresh copy of the template item.
            item = copy.deepcopy(item_)
            item['url'] = self._get_item_link(base_url, element)

            for key, pattern in index_pattern.items():
                if key in ['_list', '_next_page']:
                    continue
                # Patterns starting with '/html' are run against the whole
                # page; all others are run against this list element.
                if pattern['pattern'].startswith('/html'):
                    target = page
                else:
                    target = element
                field_parser = build_parser(pattern)
                field_parser.parse(target)
                item[key] = field_parser.result

            # Custom hook: post-process the list element and its item.
            self._module.process_list_item(element, item)
            # Parse the detail page.
            self._parse_detail_page(item)
            if item.get('url'):
                # Custom hook: post-process the item after detail parsing.
                self._module.process_detail_item(item)
                # Project-wide default processing.
                self._proj_module.process_item(item)
                # Persist the item.
                self._save_item(item)

            self._click_next = item.get('_click_next', True)
Beispiel #6
0
 def test_multi_css(self):
     # The 'a.sister' CSS selector is expected to produce three results.
     spec = {
         'pattern': 'a.sister',
         'type': 'css',
         'target': 'html',
     }
     matcher = parser.build_parser(spec)
     matcher.parse(html_doc)
     match_count = len(matcher.result)
     self.assertEqual(match_count, 3)
Beispiel #7
0
 def test_css_text(self):
     # With target 'text', the 'p.title' selector should give its text.
     expected = 'The Dormouse\'s story'
     spec = {
         'pattern': 'p.title',
         'type': 'css',
         'target': 'text',
     }
     matcher = parser.build_parser(spec)
     matcher.parse(html_doc)
     self.assertEqual(matcher.result, expected)
Beispiel #8
0
 def test_build_parser_css(self):
     # A pattern of type 'css' should be built into a CSSParser.
     spec = {
         'pattern': 'a',
         'type': 'css',
         'target': 'html',
     }
     built = parser.build_parser(spec)
     self.assertIsInstance(built, parser.CSSParser)
Beispiel #9
0
 def test_build_parser_xpath(self):
     # A pattern of type 'xpath' should be built into an XPathParser.
     spec = {
         'pattern': '//a',
         'type': 'xpath',
         'target': 'html',
     }
     built = parser.build_parser(spec)
     self.assertIsInstance(built, parser.XPathParser)