Example 1
def parse_news(self, response):
    news = response.meta["news"]
    if "window.location.replace" in response.body:
        # The page is a JavaScript redirect stub: fall back to the
        # original URL, refresh the cache key and re-crawl.
        news["crawl_url"] = news["original_url"]
        news["key"] = g_cache_key(news["crawl_url"])
        yield self.g_news_request(news)
    else:
        # HTTP redirects: record the final URL and refresh the cache key.
        redirects = response.request.meta.get("redirect_urls")
        if redirects:
            news["crawl_url"] = response.url
            news["key"] = g_cache_key(news["crawl_url"])
        body = response.body_as_unicode().encode("utf-8")
        if news["crawl_url"].startswith("http://www.yidianzixun.com/"):
            # On-site pages get the dedicated Yidianzixun extractor.
            extractor = YiDianZiXunExtractor(body, response.url)
            title, post_date, post_user, summary, content = extractor()
        else:
            # Outbound links fall back to the generic extraction package.
            try:
                title, post_date, post_user, summary, tags, content = extract(
                    news["crawl_url"], document=body)
            except Exception as e:
                self.logger.warning(e.message +
                                    " outer link: %s" % news["crawl_url"])
                return
        if content:
            news["content"] = content
            news["content_html"] = response.body
            yield news
        else:
            self.logger.warning("content empty: %s" % news["crawl_url"])
Example 2
def parse_news(self, response):
    news = response.meta["news"]
    # HTTP redirects: record the final URL.
    redirects = response.request.meta.get("redirect_urls")
    if redirects:
        news["crawl_url"] = response.url
    body = response.body_as_unicode().encode("utf-8")
    title, post_date, post_user, summary, tags, content = extract(
        news["crawl_url"], document=body)
    # Alternative, site-specific path (left disabled in the source):
    # extractor = WechatExtractor(body, response.url)
    # title, post_date, post_user, summary, content = extractor()
    news["title"] = title
    news["publish_time"] = post_date
    news["original_source"] = post_user
    news["original_url"] = news["crawl_url"]
    news["content"] = content
    news["content_html"] = response.body
    yield news
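All five examples unpack the project's extract() helper into the same six fields. Its real implementation is not shown anywhere in this listing; the stub below only pins down the interface the call sites rely on, with placeholder values instead of actual extraction logic.

def extract(url, document=None):
    # Interface stub, NOT the real extraction logic: callers unpack
    # exactly six values in this order. `document` lets a caller pass an
    # already-downloaded body instead of having the helper fetch `url`.
    title = u"placeholder title"
    post_date = u"1970-01-01 00:00:00"
    post_user = u"placeholder source"
    summary = u""
    tags = []
    content = u"<p>placeholder content</p>"
    return title, post_date, post_user, summary, tags, content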
Example 3
def parse_news(self, response):
    news = response.meta["news"]
    # HTTP redirects: record the final URL.
    redirects = response.request.meta.get("redirect_urls")
    if redirects:
        news["crawl_url"] = response.url
    body = response.body_as_unicode().encode("utf-8")
    if news["crawl_url"].startswith(DOMAIN):
        # On-site pages get the dedicated Toutiao extractor.
        extractor = TouTiaoExtractor(body, news["crawl_url"])
        title, post_date, post_user, summary, content = extractor()
    else:
        # Outbound links fall back to the generic extraction package.
        try:
            title, post_date, post_user, summary, tags, content = extract(
                news["crawl_url"], document=body)
        except Exception as e:
            self.logger.warning(e.message +
                                " outer link: %s" % news["crawl_url"])
            return
    if content:
        news["content"] = content
        news["content_html"] = response.body
        yield news
    else:
        self.logger.warning("content empty: %s" % news["crawl_url"])
Example 4
                        # (fragment starts mid-loop; the enclosing for
                        # loop and the opening try: are above this cut)
                        source_id, source_name = add_spider_source(
                            i['app_name'])
                        source_names[i['app_name']] = source_id
                    except Exception as e:
                        print(e)
                        # task_status 3: creating the source failed
                        db.news.update(i, {'$set': {'task_status': 3}})
                        continue
                else:
                    source_id = source_names.get(i['app_name'])
                    if not source_id:
                        source_id = source_names.get(i['app_name'] + 'APP')
                    if not source_id:
                        continue
                try:
                    print('______step 1_______')
                    ret = extract(i['link'])
                except Exception:
                    # task_status 3: extraction raised
                    db.news.update(i, {'$set': {'task_status': 3}})
                    continue
                # ret = (title, post_date, post_user, summary, tags, content)
                if not ret[5] or not ret[0] or not ret[1]:
                    # task_status 2: title, date or content missing
                    db.news.update(i, {'$set': {'task_status': 2}})
                    continue

                # Skip items whose title contains no Chinese characters.
                is_eng = True
                for j in i['title']:
                    if u'\u4e00' < j < u'\u9fa5':
                        is_eng = False
                if is_eng:
                    # task_status 6: title looks non-Chinese
                    db.news.update(i, {'$set': {'task_status': 6}})
                    continue
                item = dict()
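The English-title filter in Example 4 scans the title with strict comparisons (u'\u4e00' < j < u'\u9fa5'), which silently excludes the two boundary code points, and U+9FA5 is also short of the full CJK Unified Ideographs block, which runs to U+9FFF. An inclusive check is usually what is intended; the helper below is my own sketch, not from the project.

def contains_cjk(text):
    # Inclusive scan over CJK Unified Ideographs (U+4E00..U+9FFF).
    return any(u'\u4e00' <= ch <= u'\u9fff' for ch in text)

With that helper, the loop above collapses to is_eng = not contains_cjk(i['title']).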
Example 5
    def parse_news(self, response):
        """
        Parse the actual news content.
        :param response: the scrapy response object
        :type response: scrapy.Response
        """
        news = response.meta["news"]
        body = response.body_as_unicode().encode("utf-8")
        # Optional per-spider extraction parameters; each falls back to
        # None when the concrete spider does not define it.
        title_param = getattr(self, "title_param", None)
        post_date_param = getattr(self, "post_date_param", None)
        post_source_param = getattr(self, "post_source_param", None)
        summary_param = getattr(self, "summary_param", None)
        content_param = getattr(self, "content_param", None)
        clean_param_list = getattr(self, "clean_param_list", None)
        clean_content_before_param = getattr(
            self, "clean_content_before_param", None)
        clean_content_after_param = getattr(
            self, "clean_content_after_param", None)
        news["content_html"] = body
        if content_param is None:  # no explicit rules: use the news-extraction package
            try:
                title, post_date, post_user, summary, tags, content = extract(
                    news["crawl_url"], document=body)
            except Exception as e:
                self.logger.warning(e.message + " newsextract error: %s" %
                                    news["crawl_url"])
            else:
                if content:
                    if not post_date:
                        post_date = get_date_time_default()
                    news["publish_time"] = post_date
                    news["content"] = content
                    if post_user:
                        news["original_source"] = post_user
                    yield news
                else:
                    self.logger.warning("publish_time or content empty: %s" %
                                        news["crawl_url"])
            return

        if hasattr(self, "extractor_cls"):
            extractor_cls = load_object(path=self.extractor_cls)
        else:
            extractor_cls = GeneralExtractor
        extractor = extractor_cls(body, response.url)
        title, post_date, post_user, summary, content = extractor(
            title_param=title_param,
            post_date_param=post_date_param,
            post_source_param=post_source_param,
            summary_param=summary_param,
            content_param=content_param,
            clean_param_list=clean_param_list,
            clean_content_before_param=clean_content_before_param,
            clean_content_after_param=clean_content_after_param,
        )
        news["publish_time"] = post_date
        news["content"] = content
        # test code
        title_spider_names = {
            'spider:news:www_people_com_cn',
            'spider:news:www_zaobao_com',
            'spider:news:www_chinatimes_com',
        }

        if self.name in title_spider_names:
            news['title'] = title
        # end test code
        yield news
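Example 5 turns parse_news into a generic entry point: it probes the spider instance for the optional *_param attributes and an extractor_cls dotted path resolved with load_object. A hedged sketch of a concrete spider wiring those knobs up follows; only the attribute names come from the code above, while the base class, the dotted path, and the XPath-style values are assumptions about the project-defined formats.

class ExampleComSpider(NewsSpiderBase):  # hypothetical base class
    name = "spider:news:example_com"

    # Dotted path resolved via load_object (as in scrapy.utils.misc):
    extractor_cls = "myproject.extractors.ExampleComExtractor"

    # Per-field extraction parameters consumed by the extractor; the
    # exact value format is project-defined, XPath rules are shown here
    # purely as an assumption.
    title_param = "//h1[@class='headline']/text()"
    content_param = "//div[@id='article-body']"
    clean_param_list = ["//div[@class='ad']"]

Because parse_news checks content_param first, a spider that defines none of these attributes takes the extract() path instead, so the attributes act as an opt-in override per site.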