def parse_article_detail_html(self, response):
    """
    Parse an article detail page (HTML variant).

    The title, publish time and article body are read from the page with
    XPath; every other field of the yielded item is copied from
    ``response.meta``.

    :param response: detail-page response whose ``meta`` carries task_id,
        platform_id, channel_id, article_id, follow_id, follow_name and
        article_abstract
    :return: generator yielding a single FetchResultItem
    """
    meta = response.meta

    def first_match(query):
        # First XPath hit, or '' when the query matches nothing.
        return response.xpath(query).extract_first(default='')

    title = first_match('//div[@class="title"]/text()')
    pub_time = first_match('//span[@class="time"]/text()')
    content = first_match('//div[@class="WB_editor_iframe"]')

    item = FetchResultItem()
    item['task_id'] = meta['task_id']
    item['platform_id'] = meta['platform_id']
    item['platform_name'] = platform_name_map.get(meta['platform_id'], '')
    item['channel_id'] = meta['channel_id']
    item['channel_name'] = channel_name_map.get(meta['channel_id'], '')
    item['article_id'] = meta['article_id']
    item['article_title'] = title
    item['article_author_id'] = meta['follow_id']
    item['article_author_name'] = meta['follow_name']
    item['article_pub_time'] = pub_time
    item['article_url'] = response.url
    item['article_tags'] = ''
    item['article_abstract'] = meta['article_abstract']
    item['article_content'] = content
    yield item
def parse_article_detail_js(self, response):
    """
    Parse an article detail page (JS-embedded variant).

    The article markup is carried as an escaped string in the ``"html"``
    field of ``<script>FM.view({...})</script>`` blocks. All captured
    fragments are joined, unescaped and parsed as one HTML document, from
    which the title, publish time and body are read. Nothing is yielded
    when no such script block is found.

    :param response: detail-page response whose ``meta`` carries task_id,
        platform_id, channel_id, article_id, follow_id, follow_name and
        article_abstract
    :return: generator yielding at most one FetchResultItem
    """
    page_text = response.body_as_unicode()
    pattern = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
    fragments = re.findall(pattern, page_text, re.S)
    if not fragments:
        return

    html_text = ''.join(fragments)
    # Drop escaped whitespace, then unescape quotes and slashes.
    unescape_steps = (
        ('\\r', ''),
        ('\\t', ''),
        ('\\n', ''),
        ('\\"', '"'),
        ('\\/', '/'),
    )
    for escaped, plain in unescape_steps:
        html_text = html_text.replace(escaped, plain)
    doc = fromstring(html_text)

    title_nodes = doc.xpath('//h1[@class="title"]/text()')
    if title_nodes:
        title = title_nodes[0].strip()
    else:
        title = ''

    time_nodes = doc.xpath('//span[@class="time"]/text()')
    if time_nodes:
        pub_time = self.trans_time(time_nodes[0].strip())
    else:
        # No time on the page: fall back to the current local time.
        pub_time = time.strftime('%Y-%m-%d %H:%M:%S')

    content_nodes = doc.xpath('//div[@class="WBA_content"]')
    if content_nodes:
        content = tostring(content_nodes[0], encoding='unicode').strip()
    else:
        content = ''

    meta = response.meta
    item = FetchResultItem()
    item['task_id'] = meta['task_id']
    item['platform_id'] = meta['platform_id']
    item['platform_name'] = platform_name_map.get(meta['platform_id'], '')
    item['channel_id'] = meta['channel_id']
    item['channel_name'] = channel_name_map.get(meta['channel_id'], '')
    item['article_id'] = meta['article_id']
    item['article_title'] = title
    item['article_author_id'] = meta['follow_id']
    item['article_author_name'] = meta['follow_name']
    item['article_pub_time'] = time_local_to_utc(pub_time).strftime(
        '%Y-%m-%d %H:%M:%S')
    item['article_url'] = response.url
    item['article_tags'] = ''
    item['article_abstract'] = meta['article_abstract']
    item['article_content'] = content
    yield item
def parse_article_detail_js(self, response):
    """
    Parse an article detail page (JS-embedded variant).

    The article markup is carried as an escaped string in the ``"html"``
    field of ``<script>FM.view({...})</script>`` blocks. All captured
    fragments are joined, unescaped and parsed as one HTML document, from
    which the title, publish time and the first text node of the content
    div are read. Nothing is yielded when no such script block is found.

    :param response: detail-page response whose ``meta`` carries task_id,
        platform_id, channel_id, article_id, follow_id, follow_name and
        article_abstract
    :return: generator yielding at most one FetchResultItem
    """
    # Local import: only needed for the publish-time fallback below.
    import time

    article_detail_body = response.body_as_unicode()
    # Plain raw string instead of the Python 2-only ``ur`` prefix, so the
    # module also parses on Python 3; the pattern text is unchanged.
    article_detail_rule = r'<script>FM.view\({"ns":.*?"html":"(.*?)"}\)</script>'
    article_detail_re_parse = re.compile(
        article_detail_rule, re.S).findall(article_detail_body)
    if not article_detail_re_parse:
        return
    article_detail_html = u''.join(article_detail_re_parse)

    # Remove escaped whitespace sequences first. Stripping the backslash
    # alone would leave stray 'r'/'t'/'n' letters in the parsed text.
    for whitespace_escape in (u'\\r', u'\\t', u'\\n'):
        article_detail_html = article_detail_html.replace(
            whitespace_escape, u'')
    # Remaining escapes (\" and \/ among them) are undone by dropping the
    # backslashes.
    article_detail_doc = fromstring(article_detail_html.replace(u'\\', u''))

    article_title_parse = article_detail_doc.xpath(
        '//h1[@class="title"]/text()')
    article_title = (
        article_title_parse[0].strip() if article_title_parse else u'')

    article_pub_time_parse = article_detail_doc.xpath(
        '//span[@class="time"]/text()')
    if article_pub_time_parse:
        article_pub_time = self.trans_time(article_pub_time_parse[0].strip())
    else:
        # A page without a time span used to raise IndexError here; fall
        # back to the current local time instead of dropping the article.
        # NOTE(review): assumes trans_time returns a string in this same
        # format; confirm against its definition.
        article_pub_time = time.strftime('%Y-%m-%d %H:%M:%S')

    article_content_parse = article_detail_doc.xpath(
        '//div[@class="WBA_content"]/text()')
    article_content = (
        article_content_parse[0].strip() if article_content_parse else u'')

    fetch_result_item = FetchResultItem()
    fetch_result_item['task_id'] = response.meta['task_id']
    fetch_result_item['platform_id'] = response.meta['platform_id']
    fetch_result_item['platform_name'] = platform_name_map.get(
        response.meta['platform_id'], u'')
    fetch_result_item['channel_id'] = response.meta['channel_id']
    fetch_result_item['channel_name'] = channel_name_map.get(
        response.meta['channel_id'], u'')
    fetch_result_item['article_id'] = response.meta['article_id']
    fetch_result_item['article_title'] = article_title
    fetch_result_item['article_author_id'] = response.meta['follow_id']
    fetch_result_item['article_author_name'] = response.meta['follow_name']
    fetch_result_item['article_pub_time'] = article_pub_time
    fetch_result_item['article_url'] = response.url
    fetch_result_item['article_tags'] = u''
    fetch_result_item['article_abstract'] = response.meta['article_abstract']
    fetch_result_item['article_content'] = article_content
    yield fetch_result_item
def parse_detail(self, response):
    """
    Parse an article detail page and yield its FetchResultItem.

    The article body is the concatenation of the stripped child elements
    of ``div#js_content``. When that is empty, the ``js_share_source``
    link is followed with the same ``meta`` and this callback, so the
    item is built from the linked page instead. When the body is empty
    and no such link exists, nothing is yielded.

    :param response: detail-page response whose ``meta`` carries task_id,
        platform_id, channel_id, article_id, article_title, follow_id,
        follow_name, article_pub_time, article_url and article_abstract
    :return: generator yielding one FetchResultItem, one follow-up
        scrapy.Request, or nothing
    """
    # NOTE(review): debug output kept as-is; consider routing it through
    # the spider's logger instead of stdout.
    print(response.meta)

    article_content = ''.join(
        fragment.strip()
        for fragment in response.xpath('//div[@id="js_content"]/*').extract()
    )

    # Empty body: follow the share-source link to the page that carries
    # the content.
    if not article_content:
        share_source_url = response.xpath(
            '//a[@id="js_share_source"]/@href').extract_first()
        # extract_first() returns None when the link is absent, and
        # scrapy.Request rejects a None url, so only follow a real link.
        if share_source_url:
            yield scrapy.Request(url=share_source_url,
                                 callback=self.parse_detail,
                                 meta=response.meta)
        return

    fetch_result_item = FetchResultItem()
    fetch_result_item['task_id'] = response.meta['task_id']
    fetch_result_item['platform_id'] = response.meta['platform_id']
    fetch_result_item['platform_name'] = platform_name_map.get(
        response.meta['platform_id'], u'')
    fetch_result_item['channel_id'] = response.meta['channel_id']
    fetch_result_item['channel_name'] = channel_name_map.get(
        response.meta['channel_id'], u'')
    fetch_result_item['article_id'] = response.meta['article_id']
    fetch_result_item['article_title'] = response.meta['article_title']
    fetch_result_item['article_author_id'] = response.meta['follow_id']
    fetch_result_item['article_author_name'] = response.meta['follow_name']
    fetch_result_item['article_pub_time'] = response.meta['article_pub_time']
    fetch_result_item['article_url'] = response.meta['article_url']
    fetch_result_item['article_tags'] = u''
    fetch_result_item['article_abstract'] = response.meta['article_abstract']
    fetch_result_item['article_content'] = article_content
    yield fetch_result_item
def parse_article_detail(self, response):
    """
    Parse an article detail page whose data is embedded in page JS.

    The JS payload is extracted with ``parse_toutiao_js_body`` and its
    fields are read through a ``ParseJsTt`` instance. Nothing is yielded
    when no payload is found. The publish time is passed through
    ``time_local_to_utc`` and formatted as ``%Y-%m-%d %H:%M:%S``.

    :param response: detail-page response whose ``meta`` carries
        detail_url, task_id, platform_id, channel_id, follow_id and
        follow_name
    :return: generator yielding at most one FetchResultItem
    """
    page_text = response.body_as_unicode()
    meta = response.meta
    js_body = parse_toutiao_js_body(page_text, meta['detail_url'])
    if not js_body:
        return

    parser = ParseJsTt(js_body=js_body)
    # Parser calls are kept in their original sequence.
    item_id = parser.parse_js_item_id()
    title = parser.parse_js_title()
    abstract = parser.parse_js_abstract()
    content = parser.parse_js_content()
    pub_time = parser.parse_js_pub_time()
    tags = parser.parse_js_tags()

    item = FetchResultItem()
    item['task_id'] = meta['task_id']
    item['platform_id'] = meta['platform_id']
    item['platform_name'] = platform_name_map.get(meta['platform_id'], '')
    item['channel_id'] = meta['channel_id']
    item['channel_name'] = channel_name_map.get(meta['channel_id'], '')
    item['article_id'] = item_id
    item['article_title'] = title
    item['article_author_id'] = meta['follow_id']
    item['article_author_name'] = meta['follow_name']
    item['article_pub_time'] = time_local_to_utc(pub_time).strftime(
        '%Y-%m-%d %H:%M:%S')
    # Use the response URL, or the requested detail URL if it is empty.
    item['article_url'] = response.url or meta['detail_url']
    item['article_tags'] = tags
    item['article_abstract'] = abstract
    item['article_content'] = content
    yield item