def parse_item(self, response):
    """Turn one article page into a ``HyNewsItem``.

    The title, plain text and publish time are taken from
    ``GeneralNewsExtractor``; the raw HTML body is taken from the first
    CSS selector that matches anything on the page. The item is yielded
    only when an HTML body was found; otherwise a warning is logged and
    nothing is produced.

    Args:
        response: The fetched page; must expose ``text``, ``url`` and
            ``css()``.

    Yields:
        The populated ``HyNewsItem``, at most once.
    """
    news = HyNewsItem()

    page_source = response.text
    extracted = GeneralNewsExtractor().extract(page_source, with_body_html=False)
    headline, body_text, published = (
        extracted[key] for key in ('title', 'content', 'publish_time')
    )

    # Candidate selectors for the article body, tried in order; the first
    # one that yields non-empty HTML wins.
    selectors = ('.zstexts',)
    fragments = (''.join(response.css(selector).extract()) for selector in selectors)
    body_html = next((fragment for fragment in fragments if fragment), '')

    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    category, industry_codes, area = get_category(body_text)

    news['title'] = headline
    news['txt'] = body_text
    news['p_time'] = get_times(published)
    news['content'] = body_html
    news['spider_name'] = 'HY_SYB'
    news['module_name'] = '行业新闻'
    news['cate'] = category
    news['region'] = area
    news['code'] = industry_codes
    news['link'] = response.url
    news['website'] = '生意宝'

    # Pages without an extractable HTML body are dropped.
    if body_html:
        yield news
def parse_item(self, response):
    """Turn one article page into a ``HyNewsItem``.

    The plain text and publish time are taken from
    ``GeneralNewsExtractor``, while the title is read from the ``h2``
    inside ``#zxwk_left_1``. The raw HTML body is taken from the first
    CSS selector that matches anything on the page. Category, region and
    code are fixed values for this source. The item is yielded only when
    an HTML body was found; otherwise a warning is logged and nothing is
    produced.

    Args:
        response: The fetched page; must expose ``text``, ``url`` and
            ``css()``.

    Yields:
        The populated ``HyNewsItem``, at most once.
    """
    news = HyNewsItem()

    page_source = response.text
    extracted = GeneralNewsExtractor().extract(page_source, with_body_html=False)

    # The title comes from the page markup rather than the extractor result.
    headline = response.css('#zxwk_left_1 h2::text').extract_first()
    body_text = extracted['content']
    published = extracted['publish_time']

    # Candidate selectors for the article body, tried in order; the first
    # one that yields non-empty HTML wins.
    selectors = ('#zxwk_left_1',)
    fragments = (''.join(response.css(selector).extract()) for selector in selectors)
    body_html = next((fragment for fragment in fragments if fragment), '')

    if not body_html:
        logging.warning(f'{response.url}当前url无 css 适配未提取 centent')

    news['title'] = headline
    news['txt'] = body_text
    news['p_time'] = get_times(published)
    news['content'] = body_html
    news['spider_name'] = 'HY_SYZX'
    news['module_name'] = '石油在线'
    news['cate'] = '石油'
    news['region'] = ''
    news['code'] = ''
    news['link'] = response.url
    news['website'] = '石油在线'

    # Pages without an extractable HTML body are dropped.
    if body_html:
        yield news