def parse_news(self, response): news = response.meta["news"] if "window.location.replace" in response.body: news["crawl_url"] = news["original_url"] news["key"] = g_cache_key(news["crawl_url"]) yield self.g_news_request(news) else: redirects = response.request.meta.get("redirect_urls") if redirects: news["crawl_url"] = response.url news["key"] = g_cache_key(news["crawl_url"]) body = response.body_as_unicode().encode("utf-8") if news["crawl_url"].startswith("http://www.yidianzixun.com/"): extractor = YiDianZiXunExtractor(body, response.url) title, post_date, post_user, summary, content = extractor() else: try: title, post_date, post_user, summary, tags, content = extract( news["crawl_url"], document=body) except Exception as e: self.logger.warning(e.message + " outer link: %s" % news["crawl_url"]) return if content: news["content"] = content news["content_html"] = response.body yield news else: self.logger.warning("content empty: %s" % news["crawl_url"])
def parse_news(self, response): news = response.meta["news"] redirects = response.request.meta.get("redirect_urls") if redirects: news["crawl_url"] = response.url body = response.body_as_unicode().encode("utf-8") title, post_date, post_user, summary, tags, content = extract(news["crawl_url"], document=body) # extractor = WechatExtractor(body, response.url) # title, post_date, post_user, summary, content = extractor() news["title"] = title news["publish_time"] = post_date news["original_source"] = post_user news["original_url"] = news["crawl_url"] news["content"] = content news["content_html"] = response.body yield news
def parse_news(self, response): news = response.meta["news"] redirects = response.request.meta.get("redirect_urls") if redirects: news["crawl_url"] = response.url body = response.body_as_unicode().encode("utf-8") if news["crawl_url"].startswith(DOMAIN): extractor = TouTiaoExtractor(body, news["crawl_url"]) title, post_date, post_user, summary, content = extractor() else: try: title, post_date, post_user, summary, tags, content = extract( news["crawl_url"], document=body) except Exception as e: self.logger.warning(e.message + " outer link: %s" % news["crawl_url"]) return if content: news["content"] = content news["content_html"] = response.body yield news else: self.logger.warning("content empty: %s" % news["crawl_url"])
# (Fragment: body of a loop over news documents `i`; the `for`/`if`/`try`
#  that the first lines belong to is not shown in this excerpt.)
            source_id, source_name = add_spider_source(
                i['app_name'])
            source_names[i['app_name']] = source_id
        except Exception, e:
            print e
            db.news.update(i, {'$set': {'task_status': 3}})
            continue
    else:
        source_id = source_names.get(i['app_name'])
        if not source_id:
            source_id = source_names.get(i['app_name'] + 'APP')
        if not source_id:
            continue
    try:
        print '______step 1_______'
        ret = extract(i['link'])
    except:
        db.news.update(i, {'$set': {'task_status': 3}})
        continue
    # extract() returns (title, post_date, post_user, summary, tags, content):
    # require content (ret[5]), title (ret[0]) and publish time (ret[1]).
    if not ret[5] or not ret[0] or not ret[1]:
        db.news.update(i, {'$set': {'task_status': 2}})
        continue
    # Treat the title as English unless it contains a CJK character.
    is_eng = True
    for j in i['title']:
        if u'\u4e00' < j < u'\u9fa5':
            is_eng = False
    if is_eng:
        db.news.update(i, {'$set': {'task_status': 6}})
        continue
    item = dict()
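The is_eng loop above marks a title as non-English as soon as any character falls inside the CJK Unified Ideographs range. The same check as a small standalone helper; note the original uses strict comparisons, which exclude the boundary characters u'\u4e00' and u'\u9fa5' themselves:

def contains_cjk(text):
    # True if any character lies strictly inside the CJK range used above.
    return any(u'\u4e00' < ch < u'\u9fa5' for ch in text)


# Equivalent to the loop: a title with no CJK characters is treated as English.
# is_eng = not contains_cjk(i['title'])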
def parse_news(self, response): """ 解析具体的新闻内容 :param response: scrapy 返回对象 :type response: scrapy.Response """ news = response.meta["news"] body = response.body_as_unicode().encode("utf-8") title_param = self.title_param if hasattr(self, "title_param") else None post_date_param = self.post_date_param if hasattr( self, "post_date_param") else None post_source_param = self.post_source_param if hasattr( self, "post_source_param") else None summary_param = self.summary_param if hasattr( self, "summary_param") else None content_param = self.content_param if hasattr( self, "content_param") else None clean_param_list = self.clean_param_list if hasattr( self, "clean_param_list") else None clean_content_before_param = self.clean_content_before_param if hasattr( self, "clean_content_before_param") else None clean_content_after_param = self.clean_content_after_param if hasattr( self, "clean_content_after_param") else None news["content_html"] = body if content_param is None: # 使用新闻解析包来解析 try: title, post_date, post_user, summary, tags, content = extract( news["crawl_url"], document=body) except Exception as e: self.logger.warning(e.message + " newsextract error: %s" % news["crawl_url"]) else: if content: if not post_date: post_date = get_date_time_default() news["publish_time"] = post_date news["content"] = content if post_user: news["original_source"] = post_user yield news else: self.logger.warning("publish_time or content empty: %s" % news["crawl_url"]) return if hasattr(self, "extractor_cls"): extractor_cls = load_object(path=self.extractor_cls) else: extractor_cls = GeneralExtractor extractor = extractor_cls(body, response.url) title, post_date, post_user, summary, content = extractor( title_param=title_param, post_date_param=post_date_param, post_source_param=post_source_param, summary_param=summary_param, content_param=content_param, clean_param_list=clean_param_list, clean_content_before_param=clean_content_before_param, clean_content_after_param=clean_content_after_param, ) news["publish_time"] = post_date news["content"] = content # test code title_spider_names = { 'spider:news:www_people_com_cn', 'spider:news:www_zaobao_com', 'spider:news:www_chinatimes_com', } if self.name in title_spider_names: news['title'] = title # end test code yield news