Beispiel #1
0
 def parse(self, response):
     """Build a ``YanbItem`` from a report detail page.

     Title, plain text and publish time are taken from the result of
     ``GeneralNewsExtractor.extract``; the raw HTML body is taken from the
     first CSS selector in ``content_css`` that matches anything.  The item
     is yielded only when such HTML was found; otherwise a warning is logged
     and nothing is yielded.

     Args:
         response: The downloaded page; its ``text``, ``url`` and ``css()``
             are used.  NOTE(review): presumably a Scrapy response -- confirm.

     Yields:
         At most one populated ``YanbItem``.
     """
     item = YanbItem()
     extractor = GeneralNewsExtractor()
     result = extractor.extract(response.text, with_body_html=False)
     title = result['title']
     txt = result['content']
     p_time = result['publish_time']

     # Candidate selectors for the article body, tried in order.
     content_css = ['.body']
     # Pre-initialised so the name is always bound, even with no selectors.
     content = ''
     for selector in content_css:
         content = ''.join(response.css(selector).extract())
         if content:
             break
     else:
         # Reached only when no selector produced any HTML, so the warning
         # is emitted once per page rather than once per failed selector.
         logging.warning(f'{response.url}' + '当前url无 css 适配未提取 centent')

     appendix, appendix_name = get_attachments(response)
     tags, _, _ = get_category(txt + title)

     item['title'] = title
     item['p_time'] = get_times(str(p_time))
     item['industry'] = ''
     item['appendix'] = appendix
     item['appendix_name'] = appendix_name
     # ``content`` is already a single string (joined in the loop above).
     item['content'] = content
     item['pub'] = '链塔'
     item['ctype'] = 3
     item['website'] = '链塔'
     item['txt'] = ''.join(txt).strip()
     item['link'] = response.url
     item['spider_name'] = 'YB_LT'
     item['module_name'] = '研报'
     item['tags'] = tags

     # Pages without an extractable HTML body are dropped.
     if content:
         yield item
Beispiel #2
0
 def parse_item(self, response):
     """Assemble a ``YanbItem`` for a detail page and yield it if it has HTML.

     ``title``, ``appendix`` and ``p_time`` are read from ``response.meta``;
     the HTML and the text are taken from whatever ``.details-content``
     matches.  Nothing is yielded when that selector matches no element.

     Args:
         response: Page response whose ``meta`` carries the three keys
             above.  NOTE(review): presumably filled in by the callback
             that scheduled this request -- confirm.

     Yields:
         At most one populated ``YanbItem``.
     """
     meta = response.meta
     title, appendix, p_time = meta['title'], meta['appendix'], meta['p_time']

     html_parts = response.css(".details-content ").extract()
     text_parts = response.css(".details-content ::text").extract()

     # Delete tabs, line breaks, NBSP, ideographic space, U+E004 and plain
     # spaces from the joined text in a single pass.
     noise = str.maketrans('', '', '\t\r\n\xa0\u3000\ue004 ')
     txt = ''.join(text_parts).translate(noise)

     # Only the second element of get_attachments() is kept here; the
     # ``appendix`` value itself comes from response.meta.
     _, appendix_name = get_attachments(response)
     tags, _, _ = get_category(txt + title)

     item = YanbItem()
     # Key order below is the order in which the fields are set on the item.
     fields = {
         'title': title,
         'p_time': get_times(str(p_time)),
         'industry': '',
         'pub': '京东大数据研究院',
         'ctype': 3,
         'website': '京东大数据研究院',
         'link': response.url,
         'spider_name': 'jd',
         'module_name': '研报',
         'tags': tags,
         'appendix': appendix,
         'appendix_name': appendix_name,
         'content': ''.join(html_parts),
         'txt': txt,
     }
     for key, value in fields.items():
         item[key] = value

     # Pages where the content selector matched nothing are dropped.
     if not html_parts:
         return
     yield item