Example 1
 def parse_news(self, response):
     news = response.meta["news"]
     if "window.location.replace" in response.body:
         news["crawl_url"] = news["original_url"]
         news["key"] = g_cache_key(news["crawl_url"])
         yield self.g_news_request(news)
     else:
         redirects = response.request.meta.get("redirect_urls")
         if redirects:
             news["crawl_url"] = response.url
             news["key"] = g_cache_key(news["crawl_url"])
         body = response.body_as_unicode().encode("utf-8")
         if news["crawl_url"].startswith("http://www.yidianzixun.com/"):
             extractor = YiDianZiXunExtractor(body, response.url)
             title, post_date, post_user, summary, content = extractor()
         else:
             try:
                 title, post_date, post_user, summary, tags, content = extract(
                     news["crawl_url"], document=body)
             except Exception as e:
                 # e.message is not set on every exception; log the exception itself.
                 self.logger.warning("%s outer link: %s" % (e, news["crawl_url"]))
                 return
         if content:
             news["content"] = content
             news["content_html"] = response.body
             yield news
         else:
             self.logger.warning("content empty: %s" % news["crawl_url"])
Example 2
def parse_item(self, response):
    body = response.body_as_unicode().encode("utf-8")
    extractor = GeneralExtractor(body)
    title, post_date, post_user, summary, content = extractor(
        self.title_param, self.post_date_param, self.post_user_param,
        self.summary_param, self.content_param)
    if not post_user:
        post_user = self.crawl_source
    news = get_default_news(
        crawl_url=response.url,
        key=g_cache_key(response.url),
        title=title,
        publish_time=post_date,
        original_source=post_user,
        original_url=response.url,
        content=content,
        crawl_source=self.crawl_source,
    )
    print("*" * 50)
    print("url: %s" % response.url)
    print("title: %s" % title)
    print("post date: %s" % post_date)
    print("post user: %s" % post_user)
    print("summary: %s" % summary)
    show(content)
    print("\n\n")
Example 3
    def g_news_item(self, article, start_url="", meta=None):
        news = NewsItem()
        news["docid"] = article["docID"]
        url_163 = article.get("url_163", None)
        if url_163 is None:
            return None
        news["crawl_url"] = self._g_crawl_url(url_163)
        news["key"] = g_cache_key(news["crawl_url"])
        if news_already_exists(news["key"]): return None
        news["title"] = article["title"]
        news["tags"] = list()
        news["summary"] = article["summary"]
        news["publish_time"] = str_from_timestamp(article["publish_time"] /
                                                  1000)
        news["content"] = list()
        # "position" is a comma separated "province,city,district,..." string
        # where missing parts are the literal string "null".
        position = article.get("position", "null,null,null,null").split(",")
        news["province"] = position[0] if position[0] != "null" else None
        news["city"] = position[1] if position[1] != "null" else None
        news["district"] = position[2] if position[2] != "null" else None
        news["love"] = 0
        news["up"] = 0
        news["down"] = 0

        news["original_url"] = article.get("doc_url", "")
        news["channel"] = article.get("channel", "/").split("/")[0]
        news["category"] = article.get("category", "")
        news["crawl_source"] = CRAWL_SOURCE
        news["original_source"] = article.get("source", "")
        # Skip items whose original source is 糗事百科 (Qiushibaike).
        if news['original_source'] == u'糗事百科':
            return None

        news["comment_url"] = self._g_comment_url(docid=news["docid"])
        news["comment_queue"] = COMMENT_SPIDER_NAME + ":start_urls"
        news["start_url"] = start_url
        news["start_meta_info"] = meta
        return news
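str_from_timestamp is called with a Unix timestamp in seconds (the millisecond value above is divided by 1000 first). A minimal sketch, assuming it renders the timestamp as a local-time string; the exact format used by the project is an assumption.

from datetime import datetime


def str_from_timestamp(timestamp):
    # Format a Unix timestamp (in seconds) as a local-time string.
    # The "%Y-%m-%d %H:%M:%S" format is assumed, not taken from the project.
    return datetime.fromtimestamp(timestamp).strftime("%Y-%m-%d %H:%M:%S")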
Example 4
 def g_news_item(self, article, start_url="", meta=None):
     if article.get("has_video"):
         return None
     docid = article["source_url"]
     crawl_url = self._g_crawl_url(article)
     key = g_cache_key(crawl_url)
     if news_already_exists(key):
         return None
     news = get_default_news(
         crawl_url=crawl_url,
         key=key,
         title=article["title"],
         # "".split(",") yields [""]; keep only non-empty tags.
         tags=[t for t in article.get("keywords", "").split(",") if t],
         summary=article.get("abstract", ""),
         publish_time=str_from_timestamp(article["publish_time"]),
         love=article.get("favorite_count", 0),
         up=article.get("digg_count", 0),
         down=article.get("bury_count", 0),
         original_url=article.get("url", ""),
         original_source=article.get("source", ""),
         crawl_source=CRAWL_SOURCE,
         start_url=start_url,
         start_meta_info=meta,
         comment_url=self._g_comment_url(docid),
         comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
     news["docid"] = news["comment_url"]
     return news
Example 5
 def g_news_item(self, article, start_url="", meta=None):
     if article["ctype"] not in ["news", "picture"]:
         return None  # fixme: only support news now
     docid = article["docid"]
     crawl_url = self._g_article_url(article.get("url"), docid)
     if not crawl_url:
         return None
     key = g_cache_key(crawl_url)
     if news_already_exists(key):
         return None
     news = get_default_news(crawl_url=crawl_url,
                             key=key,
                             title=article["title"],
                             summary=article.get("summary", ""),
                             publish_time=article["date"],
                             love=article.get("like", 0),
                             up=article.get("up", 0),
                             original_url=article.get("url", ""),
                             crawl_source=CRAWL_SOURCE,
                             original_source=article.get("source", ""),
                             start_url=start_url,
                             start_meta_info=meta,
                             comment_url=self._g_comment_url(docid),
                             comment_queue=COMMENT_SPIDER_NAME +
                             ":start_urls")
     news["docid"] = news["comment_url"]
     return news
Example 6
 def g_news_item(self, article, start_url="", meta=None):
     """
     生成 news item 对象
     :param article: 包含新闻 url, title 字段的字典
     :type article: dict
     :param start_url: 抓取 meta info 的起始 url
     :type start_url: str
     :param meta: 附加配置信息
     :type meta: dict
     :return: 新闻 Item
     :rtype: News.items.NewsItem | None
     """
     crawl_url = article["url"]
     key = g_cache_key(crawl_url)
     if news_already_exists(key):
         return None
     news = get_default_news(title=article["title"],
                             crawl_url=crawl_url,
                             docid=crawl_url,
                             key=key,
                             crawl_source=self.crawl_source,
                             start_url=start_url,
                             summary=article.get("summary", ""),
                             start_meta_info=meta)
     return news
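The docstring describes the contract, but the listing never shows a call site. Below is a hedged sketch of how a spider callback might use g_news_item, assuming a JSON list page with an "articles" array; the parse method, that key, and the reuse of a detail callback named parse_news (as in example 1) are assumptions. The detail request mirrors example 1, which reads the item back from response.meta["news"].

import json

import scrapy


def parse(self, response):
    # Hypothetical call site: build one item per article dict and follow up
    # with a detail-page request handled by a callback such as parse_news.
    for article in json.loads(response.body_as_unicode()).get("articles", []):
        news = self.g_news_item(article, start_url=response.url)
        if news is None:  # duplicate or otherwise unusable article
            continue
        yield scrapy.Request(news["crawl_url"],
                             callback=self.parse_news,
                             meta={"news": news})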
Example 7
 def g_news_item(self, article, start_url="", meta=None):
     docid = article['url']
     crawl_url = self._g_article_url(article['url'])
     if not crawl_url:
         return None
     key = g_cache_key(crawl_url)
     if news_already_exists(key):
         return None
     news = get_default_news(crawl_url=crawl_url,
                             key=key,
                             title=article['title'],
                             docid=docid,
                             start_meta_info=meta,
                             crawl_source=CRAWL_SOURCE)
     return news
Example 8
 def g_news_item(self, article, start_url="", meta=None):
     crawl_url = article['crawl_url']
     key = g_cache_key(crawl_url)
     news = get_default_news(
         crawl_url=crawl_url,
         key=key,
         title=article['title'],
         love=article.get("like", 0),
         up=article.get("up", 0),
         original_url=article.get("url", ""),
         crawl_source=CRAWL_SOURCE,
         start_url=start_url,
         start_meta_info=meta,
         docid=crawl_url,
     )
     return news
Example 9
 def g_news_item(self, article, start_url="", meta=None):
     crawl_url = article["crawl_url"]
     comment_url = self._g_comment_url(crawl_url)
     news = get_default_news(
         crawl_url=crawl_url,
         docid=comment_url,
         key=g_cache_key(article["title"].encode("utf-8")),
         crawl_source=CRAWL_SOURCE,
         start_url=start_url,
         summary=article["summary"],
         publish_time=article["publish_time"],
         title=article["title"],
         start_meta_info=meta,
         comment_url=comment_url,
         comment_queue=COMMENT_SPIDER_NAME + ":start_urls"
     )
     return None if news_already_exists(news["key"]) else news
Example 10
    def g_news_item(self, article, start_url="", meta=None):
        news = NewsItem()
        news["docid"] = article["docid"]
        news["crawl_url"] = self._g_crawl_url(news['docid'])
        news["key"] = g_cache_key(news["crawl_url"])
        if news_already_exists(news["key"]): return None
        news["title"] = article["title"]
        # Use the article's long title ("ltitle"), when present, as the summary.
        # The original checked the freshly created news item, which can never
        # contain "ltitle", so the assignment was dead code.
        if 'ltitle' in article and article['ltitle']:
            news['summary'] = article['ltitle']
        news["tags"] = list()
        news["publish_time"] = article["ptime"]
        news["content"] = list()
        news["love"] = 0
        news["up"] = 0
        news["down"] = 0

        news["crawl_source"] = CRAWL_SOURCE
        news['original_url'] = ''
        news["original_source"] = article.get('source', '')
        # news["comment_url"] = self._g_comment_url(docid=news["docid"])
        news["start_url"] = start_url
        news["start_meta_info"] = meta
        return news