Exemple #1
0
 def parse_item(self, response):
     """Build a ``HyNewsItem`` from a 生意宝 article page.

     The article body markup is taken from the first CSS selector in
     ``content_css`` that produces non-empty output.  When no selector
     matches, a warning is logged and nothing is yielded.

     Args:
         response: the page response; must provide ``.css()``, ``.text``
             and ``.url`` (presumably a Scrapy response -- confirm
             against the spider class).

     Yields:
         A populated ``HyNewsItem``, at most one per call.
     """
     content_css = [
         '.zstexts',
     ]

     # Keep the selector and the extracted markup in separate names, and
     # start from '' so an empty selector list cannot leave it unbound.
     content = ''
     for selector in content_css:
         content = ''.join(response.css(selector).extract())
         if content:
             break
     else:
         # Reached only when every selector came back empty: warn once and
         # skip the extraction work for a page that would be dropped anyway.
         logging.warning(f'{response.url}当前url无 css 适配未提取 centent')
         return

     extractor = GeneralNewsExtractor()
     result = extractor.extract(response.text, with_body_html=False)
     txt = result['content']
     classify, codes, region = get_category(txt)

     item = HyNewsItem()
     item['title'] = result['title']
     item['txt'] = txt
     item['p_time'] = get_times(result['publish_time'])
     item['content'] = content
     item['spider_name'] = 'HY_SYB'
     item['module_name'] = '行业新闻'
     item['cate'] = classify
     item['region'] = region
     item['code'] = codes
     item['link'] = response.url
     item['website'] = '生意宝'
     yield item
Exemple #2
0
 def parse_item(self, response):
     """Build a ``HyNewsItem`` from a 石油在线 article page.

     The article body markup is taken from the first CSS selector in
     ``content_css`` that produces non-empty output.  When no selector
     matches, a warning is logged and nothing is yielded.  The title is
     read from the page's ``#zxwk_left_1 h2`` text node; body text and
     publish time come from ``GeneralNewsExtractor``.

     Args:
         response: the page response; must provide ``.css()``, ``.text``
             and ``.url`` (presumably a Scrapy response -- confirm
             against the spider class).

     Yields:
         A populated ``HyNewsItem``, at most one per call.
     """
     content_css = [
         '#zxwk_left_1',
     ]

     # Keep the selector and the extracted markup in separate names, and
     # start from '' so an empty selector list cannot leave it unbound.
     content = ''
     for selector in content_css:
         content = ''.join(response.css(selector).extract())
         if content:
             break
     else:
         # Reached only when every selector came back empty: warn once and
         # skip the extraction work for a page that would be dropped anyway.
         logging.warning(f'{response.url}当前url无 css 适配未提取 centent')
         return

     extractor = GeneralNewsExtractor()
     result = extractor.extract(response.text, with_body_html=False)

     item = HyNewsItem()
     item['title'] = response.css('#zxwk_left_1 h2::text').extract_first()
     item['txt'] = result['content']
     item['p_time'] = get_times(result['publish_time'])
     item['content'] = content
     item['spider_name'] = 'HY_SYZX'
     item['module_name'] = '石油在线'
     # Category is fixed for this site; region and code are left blank.
     item['cate'] = '石油'
     item['region'] = ''
     item['code'] = ''
     item['link'] = response.url
     item['website'] = '石油在线'
     yield item