Example #1
def g_news_item(self, article, start_url="", meta=None):
    # Skip video articles; only plain news is handled here.
    if article.get("has_video"):
        return None
    docid = article["source_url"]
    crawl_url = self._g_crawl_url(article)
    key = g_cache_key(crawl_url)
    # Drop articles that have already been crawled.
    if news_already_exists(key):
        return None
    news = get_default_news(
        crawl_url=crawl_url,
        key=key,
        title=article["title"],
        tags=article.get("keywords", "").split(","),
        summary=article.get("abstract", ""),
        publish_time=str_from_timestamp(article["publish_time"]),
        love=article.get("favorite_count", 0),
        up=article.get("digg_count", 0),
        down=article.get("bury_count", 0),
        original_url=article.get("url", ""),
        original_source=article.get("source", ""),
        crawl_source=CRAWL_SOURCE,
        start_url=start_url,
        start_meta_info=meta,
        comment_url=self._g_comment_url(docid),
        comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
    news["docid"] = news["comment_url"]
    return news
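Every example on this page leans on a small set of shared helpers (g_cache_key, news_already_exists, get_default_news) that the snippets themselves never define. As a rough sketch of the dedup pair, assuming an MD5 key over the URL bytes and a Redis set of seen keys (the Redis connection and set name are hypothetical, not taken from the project):

import hashlib

import redis

_redis = redis.StrictRedis()  # hypothetical connection; defaults to localhost:6379

def g_cache_key(url):
    # Derive a stable cache key from the crawl URL
    # (or, as in Example #7, from the title bytes).
    if isinstance(url, str):
        url = url.encode("utf-8")
    return hashlib.md5(url).hexdigest()

def news_already_exists(key):
    # True if the key is already in the shared set of crawled news.
    return _redis.sismember("news:crawled_keys", key)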
Example #2
def parse_item(self, response):
    # body_as_unicode() is the older Scrapy spelling of response.text;
    # re-encode it to UTF-8 bytes for the extractor.
    body = response.body_as_unicode().encode("utf-8")
    extractor = GeneralExtractor(body)
    title, post_date, post_user, summary, content = extractor(
        self.title_param, self.post_date_param, self.post_user_param,
        self.summary_param, self.content_param)
    if not post_user:
        # Fall back to the spider's crawl source when no author was extracted.
        post_user = self.crawl_source
    news = get_default_news(
        crawl_url=response.url,
        key=g_cache_key(response.url),
        title=title,
        publish_time=post_date,
        original_source=post_user,
        original_url=response.url,
        content=content,
        crawl_source=self.crawl_source,
    )
    print("*" * 50)
    print("url: %s" % response.url)
    print("title: %s" % title)
    print("post date: %s" % post_date)
    print("post user: %s" % post_user)
    print("summary: %s" % summary)
    show(content)
    print("\n\n")
Example #3
def g_news_item(self, article, start_url="", meta=None):
    # FIXME: only news and picture articles are supported for now.
    if article["ctype"] not in ["news", "picture"]:
        return None
    docid = article["docid"]
    crawl_url = self._g_article_url(article.get("url"), docid)
    if not crawl_url:
        return None
    key = g_cache_key(crawl_url)
    # Drop articles that have already been crawled.
    if news_already_exists(key):
        return None
    news = get_default_news(crawl_url=crawl_url,
                            key=key,
                            title=article["title"],
                            summary=article.get("summary", ""),
                            publish_time=article["date"],
                            love=article.get("like", 0),
                            up=article.get("up", 0),
                            original_url=article.get("url", ""),
                            crawl_source=CRAWL_SOURCE,
                            original_source=article.get("source", ""),
                            start_url=start_url,
                            start_meta_info=meta,
                            comment_url=self._g_comment_url(docid),
                            comment_queue=COMMENT_SPIDER_NAME + ":start_urls")
    news["docid"] = news["comment_url"]
    return news
Example #4
def g_news_item(self, article, start_url="", meta=None):
    """
    Build a news item object.
    :param article: dict containing the news "url" and "title" fields
    :type article: dict
    :param start_url: the start URL from which the meta info was crawled
    :type start_url: str
    :param meta: additional configuration info
    :type meta: dict
    :return: the news item
    :rtype: News.items.NewsItem | None
    """
    crawl_url = article["url"]
    key = g_cache_key(crawl_url)
    # Drop articles that have already been crawled.
    if news_already_exists(key):
        return None
    news = get_default_news(title=article["title"],
                            crawl_url=crawl_url,
                            docid=crawl_url,
                            key=key,
                            crawl_source=self.crawl_source,
                            start_url=start_url,
                            summary=article.get("summary", ""),
                            start_meta_info=meta)
    return news
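Example #4 is the only variant that documents its interface, so it is the best reference for how these methods are meant to be called. A hypothetical call site inside a spider's parse callback, assuming the listing page is a JSON feed with a "data" list (the feed shape and field names here are illustrative, not from the project):

import json

def parse(self, response):
    # Hypothetical feed: {"data": [{"url": ..., "title": ...}, ...]}
    articles = json.loads(response.text).get("data", [])
    for article in articles:
        news = self.g_news_item(article, start_url=response.url, meta=response.meta)
        if news is not None:  # None means duplicate or unsupported article
            yield news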
Example #5
def g_news_item(self, article, start_url="", meta=None):
    docid = article["url"]
    crawl_url = self._g_article_url(article["url"])
    if not crawl_url:
        return None
    key = g_cache_key(crawl_url)
    if news_already_exists(key):
        return None
    news = get_default_news(crawl_url=crawl_url,
                            key=key,
                            title=article["title"],
                            docid=docid,
                            start_meta_info=meta,
                            crawl_source=CRAWL_SOURCE)
    return news
Example #6
def g_news_item(self, article, start_url="", meta=None):
    crawl_url = article["crawl_url"]
    key = g_cache_key(crawl_url)
    news = get_default_news(
        crawl_url=crawl_url,
        key=key,
        title=article["title"],
        love=article.get("like", 0),
        up=article.get("up", 0),
        original_url=article.get("url", ""),
        crawl_source=CRAWL_SOURCE,
        start_url=start_url,
        start_meta_info=meta,
        docid=crawl_url,
    )
    return news
Example #7
def g_news_item(self, article, start_url="", meta=None):
    crawl_url = article["crawl_url"]
    comment_url = self._g_comment_url(crawl_url)
    news = get_default_news(
        crawl_url=crawl_url,
        docid=comment_url,
        key=g_cache_key(article["title"].encode("utf-8")),
        crawl_source=CRAWL_SOURCE,
        start_url=start_url,
        summary=article["summary"],
        publish_time=article["publish_time"],
        title=article["title"],
        start_meta_info=meta,
        comment_url=comment_url,
        comment_queue=COMMENT_SPIDER_NAME + ":start_urls"
    )
    # Check for duplicates only after the item has been built.
    return None if news_already_exists(news["key"]) else news
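Two things set this last variant apart: the cache key is derived from the article title rather than the crawl URL, and the duplicate check runs only after the full item has been built. If constructing the item is at all expensive, the check-first shape of Examples #1, #3, #4, and #5 avoids that wasted work; the equivalent early guard (using the helpers as sketched above) would be:

key = g_cache_key(article["title"].encode("utf-8"))
if news_already_exists(key):
    return None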