def g_news_item(self, article, start_url="", meta=None): if article.get("has_video"): return None docid = article["source_url"] crawl_url = self._g_crawl_url(article) key = g_cache_key(crawl_url) if news_already_exists(key): return None news = get_default_news( crawl_url=crawl_url, key=key, title=article["title"], tags=article.get("keywords", "").split(","), summary=article.get("abstract", ""), publish_time=str_from_timestamp(article["publish_time"]), love=article.get("favorite_count", 0), up=article.get("digg_count", 0), down=article.get("bury_count", 0), original_url=article.get("url", ""), original_source=article.get("source", ""), crawl_source=CRAWL_SOURCE, start_url=start_url, start_meta_info=meta, comment_url=self._g_comment_url(docid), comment_queue=COMMENT_SPIDER_NAME + ":start_urls") news["docid"] = news["comment_url"] return news
def parse_item(self, response):
    body = response.body_as_unicode().encode("utf-8")
    extractor = GeneralExtractor(body)
    title, post_date, post_user, summary, content = extractor(
        self.title_param,
        self.post_date_param,
        self.post_user_param,
        self.summary_param,
        self.content_param,
    )
    # Fall back to the spider's own source name when no author is found.
    if not post_user:
        post_user = self.crawl_source

    news = get_default_news(
        crawl_url=response.url,
        key=g_cache_key(response.url),
        title=title,
        publish_time=post_date,
        original_source=post_user,
        original_url=response.url,
        content=content,
        crawl_source=self.crawl_source,
    )

    # Debug output for manual inspection of the extraction result.
    print("*" * 50)
    print("url: %s" % response.url)
    print("title: %s" % title)
    print("post date: %s" % post_date)
    print("post user: %s" % post_user)
    print("summary: %s" % summary)
    show(content)
    print("\n\n")
    # Return the item so downstream pipelines can persist it; the original
    # built it and then discarded it.
    return news

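# parse_item above is a Scrapy response callback. A sketch of how such a
# callback is typically registered in a CrawlSpider; the spider name and the
# URL pattern are illustrative only, not from the project:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ExampleNewsSpider(CrawlSpider):
    name = "example_news"
    start_urls = ["http://example.com/news/"]
    rules = (
        # Follow article links and hand each response to parse_item.
        Rule(LinkExtractor(allow=r"/news/\d+\.html"), callback="parse_item"),
    )
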
def g_news_item(self, article, start_url="", meta=None): if article["ctype"] not in ["news", "picture"]: return None # fixme: only support news now docid = article["docid"] crawl_url = self._g_article_url(article.get("url"), docid) if not crawl_url: return None key = g_cache_key(crawl_url) if news_already_exists(key): return None news = get_default_news(crawl_url=crawl_url, key=key, title=article["title"], summary=article.get("summary", ""), publish_time=article["date"], love=article.get("like", 0), up=article.get("up", 0), original_url=article.get("url", ""), crawl_source=CRAWL_SOURCE, original_source=article.get("source", ""), start_url=start_url, start_meta_info=meta, comment_url=self._g_comment_url(docid), comment_queue=COMMENT_SPIDER_NAME + ":start_urls") news["docid"] = news["comment_url"] return news
def g_news_item(self, article, start_url="", meta=None): """ 生成 news item 对象 :param article: 包含新闻 url, title 字段的字典 :type article: dict :param start_url: 抓取 meta info 的起始 url :type start_url: str :param meta: 附加配置信息 :type meta: dict :return: 新闻 Item :rtype: News.items.NewsItem | None """ crawl_url = article["url"] key = g_cache_key(crawl_url) if news_already_exists(key): return None news = get_default_news(title=article["title"], crawl_url=crawl_url, docid=crawl_url, key=key, crawl_source=self.crawl_source, start_url=start_url, summary=article.get("summary", ""), start_meta_info=meta) return news
def g_news_item(self, article, start_url="", meta=None): docid = article['url'] crawl_url = self._g_article_url(article['url']) if not crawl_url: return None key = g_cache_key(crawl_url) if news_already_exists(key): return None news = get_default_news(crawl_url=crawl_url, key=key, title=article['title'], docid=docid, start_meta_info=meta, crawl_source=CRAWL_SOURCE) return news
def g_news_item(self, article, start_url="", meta=None): crawl_url = article['crawl_url'] key = g_cache_key(crawl_url) news = get_default_news( crawl_url=crawl_url, key=key, title=article['title'], love=article.get("like", 0), up=article.get("up", 0), original_url=article.get("url", ""), crawl_source=CRAWL_SOURCE, start_url=start_url, start_meta_info=meta, docid=crawl_url, ) return news
def g_news_item(self, article, start_url="", meta=None): crawl_url = article["crawl_url"] comment_url = self._g_comment_url(crawl_url) news = get_default_news( crawl_url=crawl_url, docid=comment_url, key=g_cache_key(article["title"].encode("utf-8")), crawl_source=CRAWL_SOURCE, start_url=start_url, summary=article["summary"], publish_time=article["publish_time"], title=article["title"], start_meta_info=meta, comment_url=comment_url, comment_queue=COMMENT_SPIDER_NAME+":start_urls" ) return None if news_already_exists(news["key"]) else news