Esempio n. 1
0
 def g_news_item(self, article, start_url="", meta=None):
     """Build a news item from a raw article dict.

     Returns None for video articles and for articles whose cache key
     has already been seen; otherwise returns the populated item.
     """
     # Video articles are not handled by this spider.
     if article.get("has_video"):
         return None
     source_docid = article["source_url"]
     url = self._g_crawl_url(article)
     cache_key = g_cache_key(url)
     # Deduplicate against previously crawled items.
     if news_already_exists(cache_key):
         return None
     item = get_default_news(
         crawl_url=url,
         key=cache_key,
         title=article["title"],
         tags=article.get("keywords", "").split(","),
         summary=article.get("abstract", ""),
         publish_time=str_from_timestamp(article["publish_time"]),
         love=article.get("favorite_count", 0),
         up=article.get("digg_count", 0),
         down=article.get("bury_count", 0),
         original_url=article.get("url", ""),
         original_source=article.get("source", ""),
         crawl_source=CRAWL_SOURCE,
         start_url=start_url,
         start_meta_info=meta,
         comment_url=self._g_comment_url(source_docid),
         comment_queue=COMMENT_SPIDER_NAME + ":start_urls",
     )
     # Downstream consumers use the comment URL as the document id.
     item["docid"] = item["comment_url"]
     return item
Esempio n. 2
0
    def g_news_item(self, article, start_url="", meta=None):
        """Build a NewsItem from a raw article dict.

        Returns None when the article has no 163 URL, has already been
        crawled, or comes from a blacklisted source.
        """
        news = NewsItem()
        news["docid"] = article["docID"]
        url_163 = article.get("url_163")
        # Articles without a 163 URL cannot be crawled.
        if url_163 is None:
            return None
        news["crawl_url"] = self._g_crawl_url(url_163)
        news["key"] = g_cache_key(news["crawl_url"])
        # Deduplicate against previously crawled items.
        if news_already_exists(news["key"]):
            return None
        news["title"] = article["title"]
        news["tags"] = []
        news["summary"] = article["summary"]
        # publish_time arrives in milliseconds; convert to seconds first.
        news["publish_time"] = str_from_timestamp(article["publish_time"] / 1000)
        news["content"] = []
        # position is "province,city,district,..." where the literal string
        # "null" marks a missing component.
        parts = article.get("position", "null,null,null,null").split(",")
        news["province"] = None if parts[0] == "null" else parts[0]
        news["city"] = None if parts[1] == "null" else parts[1]
        news["district"] = None if parts[2] == "null" else parts[2]
        news["love"] = 0
        news["up"] = 0
        news["down"] = 0

        news["original_url"] = article.get("doc_url", "")
        news["channel"] = article.get("channel", "/").split("/")[0]
        news["category"] = article.get("category", "")
        news["crawl_source"] = CRAWL_SOURCE
        news["original_source"] = article.get("source", "")
        # Items from this particular source are deliberately dropped.
        if news['original_source'] == u'糗事百科':
            return None

        news["comment_url"] = self._g_comment_url(docid=news["docid"])
        news["comment_queue"] = COMMENT_SPIDER_NAME + ":start_urls"
        news["start_url"] = start_url
        news["start_meta_info"] = meta
        return news
Esempio n. 3
0
 def g_news_item(self, article, start_url="", meta=None):
     """Build a news item for a news/picture article.

     Returns None for unsupported ctypes, articles without a resolvable
     URL, and articles that were already crawled.
     """
     # FIXME: only news-like content types are supported for now.
     if article["ctype"] not in ("news", "picture"):
         return None
     doc_id = article["docid"]
     url = self._g_article_url(article.get("url"), doc_id)
     if not url:
         return None
     cache_key = g_cache_key(url)
     # Deduplicate against previously crawled items.
     if news_already_exists(cache_key):
         return None
     item = get_default_news(
         crawl_url=url,
         key=cache_key,
         title=article["title"],
         summary=article.get("summary", ""),
         publish_time=article["date"],
         love=article.get("like", 0),
         up=article.get("up", 0),
         original_url=article.get("url", ""),
         crawl_source=CRAWL_SOURCE,
         original_source=article.get("source", ""),
         start_url=start_url,
         start_meta_info=meta,
         comment_url=self._g_comment_url(doc_id),
         comment_queue=COMMENT_SPIDER_NAME + ":start_urls",
     )
     # Downstream consumers use the comment URL as the document id.
     item["docid"] = item["comment_url"]
     return item
Esempio n. 4
0
 def g_news_item(self, article, start_url="", meta=None):
     """
     Build a news item from an article dict.

     :param article: dict containing at least the "url" and "title" keys
     :type article: dict
     :param start_url: start URL the meta info was crawled from
     :type start_url: str
     :param meta: extra configuration info attached to the item
     :type meta: dict
     :return: the news item, or None when the URL was already crawled
     :rtype: News.items.NewsItem | None
     """
     url = article["url"]
     cache_key = g_cache_key(url)
     # Deduplicate against previously crawled items.
     if news_already_exists(cache_key):
         return None
     return get_default_news(title=article["title"],
                             crawl_url=url,
                             docid=url,
                             key=cache_key,
                             crawl_source=self.crawl_source,
                             start_url=start_url,
                             summary=article.get("summary", ""),
                             start_meta_info=meta)
Esempio n. 5
0
 def g_news_item(self, article, start_url="", meta=None):
     """Build a news item from an article dict.

     :param article: dict containing at least the "url" and "title" keys
     :param start_url: start URL the article was discovered from
     :param meta: extra meta info attached to the item
     :return: the news item, or None if the URL is invalid or already crawled
     """
     docid = article['url']
     crawl_url = self._g_article_url(article['url'])
     if not crawl_url:
         return None
     key = g_cache_key(crawl_url)
     # Deduplicate against previously crawled items.
     if news_already_exists(key):
         return None
     news = get_default_news(crawl_url=crawl_url,
                             key=key,
                             title=article['title'],
                             docid=docid,
                             # Fix: start_url was accepted but silently
                             # dropped; sibling spiders all forward it.
                             start_url=start_url,
                             start_meta_info=meta,
                             crawl_source=CRAWL_SOURCE)
     return news
Esempio n. 6
0
 def g_news_item(self, article, start_url="", meta=None):
     """Build a news item from an article dict.

     :param article: dict with "crawl_url", "title", "summary" and
         "publish_time" keys
     :param start_url: start URL the article was discovered from
     :param meta: extra meta info attached to the item
     :return: the news item, or None when it was already crawled
     """
     # For this source the dedup key is derived from the title, not the URL.
     key = g_cache_key(article["title"].encode("utf-8"))
     # Fix: check for duplicates up front, like the sibling spiders do,
     # instead of building the full item and discarding it at return time.
     if news_already_exists(key):
         return None
     crawl_url = article["crawl_url"]
     comment_url = self._g_comment_url(crawl_url)
     return get_default_news(
         crawl_url=crawl_url,
         # Downstream consumers use the comment URL as the document id.
         docid=comment_url,
         key=key,
         crawl_source=CRAWL_SOURCE,
         start_url=start_url,
         summary=article["summary"],
         publish_time=article["publish_time"],
         title=article["title"],
         start_meta_info=meta,
         comment_url=comment_url,
         comment_queue=COMMENT_SPIDER_NAME + ":start_urls"
     )
Esempio n. 7
0
    def g_news_item(self, article, start_url="", meta=None):
        """Build a NewsItem from a raw article dict.

        Returns None when the item was already crawled.
        """
        news = NewsItem()
        news["docid"] = article["docid"]
        news["crawl_url"] = self._g_crawl_url(news['docid'])
        news["key"] = g_cache_key(news["crawl_url"])
        # Deduplicate against previously crawled items.
        if news_already_exists(news["key"]):
            return None
        news["title"] = article["title"]
        # Fix: the original checked 'ltitle' in `news` — the freshly created
        # item, which can never contain it — so the summary was never set.
        # The long title lives in the incoming article dict.
        if article.get('ltitle'):
            news['summary'] = article['ltitle']
        news["tags"] = list()
        news["publish_time"] = article["ptime"]
        news["content"] = list()
        news["love"] = 0
        news["up"] = 0
        news["down"] = 0

        news["crawl_source"] = CRAWL_SOURCE
        news['original_url'] = ''
        news["original_source"] = article.get('source', '')
        # news["comment_url"] = self._g_comment_url(docid=news["docid"])
        news["start_url"] = start_url
        news["start_meta_info"] = meta
        return news