def parse(self, response):
    item = YanbItem()
    resp = response.text
    # Use GeneralNewsExtractor (GNE) to pull title, body text and publish time
    # out of the raw HTML.
    extractor = GeneralNewsExtractor()
    result = extractor.extract(resp, with_body_html=False)
    title = result['title']
    txt = result['content']
    p_time = result['publish_time']

    # Fall back to explicit CSS selectors for the body HTML; keep the first
    # selector that matches anything.
    content_css = ['.body']
    content = ''
    for selector in content_css:
        content = ''.join(response.css(selector).extract())
        if content:
            break
    if not content:
        logging.warning(f'{response.url} no CSS selector matched, content not extracted')

    appendix, appendix_name = get_attachments(response)
    tags, _, _ = get_category(txt + title)
    industry = ''

    item['title'] = title
    item['p_time'] = get_times(str(p_time))
    item['industry'] = industry
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['content'] = content
    item['pub'] = '链塔'
    item['ctype'] = 3
    item['website'] = '链塔'
    item['txt'] = txt.strip()
    item['link'] = response.url
    item['spider_name'] = 'YB_LT'
    item['module_name'] = '研报'
    item['tags'] = tags
    if content:
        yield item
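
# A minimal sketch of the selector-fallback logic used in parse() above, factored
# into a reusable helper, assuming a standard scrapy Response object. The helper
# name `first_matching_css` is hypothetical and not part of the original project.
def first_matching_css(response, selectors):
    """Return the joined HTML for the first CSS selector that matches anything,
    or an empty string if none of the selectors match."""
    for selector in selectors:
        html = ''.join(response.css(selector).extract())
        if html:
            return html
    return ''
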
def parse_item(self, response):
    # Metadata passed along from the listing-page callback via Request.meta.
    title = response.meta['title']
    appendix = response.meta['appendix']
    p_time = response.meta['p_time']

    content = response.css(".details-content").extract()
    txt = ''.join(response.css(".details-content ::text").extract())
    # Strip tabs, newlines and invisible space characters from the plain text.
    for ch in ('\t', '\r', '\n', '\xa0', '\u3000', '\ue004', ' '):
        txt = txt.replace(ch, '')

    _, appendix_name = get_attachments(response)
    tags, _, _ = get_category(txt + title)
    industry = ''

    item = YanbItem()
    item['title'] = title
    item['p_time'] = get_times(str(p_time))
    item['industry'] = industry
    item['pub'] = '京东大数据研究院'
    item['ctype'] = 3
    item['website'] = '京东大数据研究院'
    item['link'] = response.url
    item['spider_name'] = 'jd'
    item['module_name'] = '研报'
    item['tags'] = tags
    item['appendix'] = appendix
    item['appendix_name'] = appendix_name
    item['content'] = ''.join(content)
    item['txt'] = txt
    if content:
        yield item
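
# parse_item() expects title/appendix/p_time to be carried in Request.meta by a
# listing-page callback. A minimal sketch of such a callback, assuming `import scrapy`
# at module level; the selectors (.report-list li, a::attr(href), .date::text) and the
# method name parse_list are hypothetical, the real ones depend on the listing page.
def parse_list(self, response):
    for row in response.css('.report-list li'):
        detail_url = response.urljoin(row.css('a::attr(href)').get(default=''))
        meta = {
            'title': row.css('a::text').get(default='').strip(),
            'appendix': row.css('a::attr(href)').get(default=''),
            'p_time': row.css('.date::text').get(default='').strip(),
        }
        yield scrapy.Request(detail_url, callback=self.parse_item, meta=meta)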