Code example #1
File: g3_163_com.py Project: xiaol/NewsCrawlerPG
    def parse_news(self, response):
        news = response.meta["news"]
        data = load_json_data(response.body_as_unicode().encode('utf8'))
        # Guard against a failed JSON parse or a missing docid entry.
        data = (data or {}).get(news["docid"])
        if data and data.get("body"):
            data['body'] = '<div id="inner_article">' + data["body"] + "</div>"
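            # 163.com serves this upgrade notice ("your news client version is
            # too old; upgrade to see richer news formats") in place of real
            # content; skip such articles.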
            keyword = u'您的新闻客户端版本太低啦,升级之后就能看到更丰富的新闻形式了'
            if keyword in data['body']:
                return
            content_html = data['body']
            if 'img' in data and data['img']:
                content_html = self._replace_img_tag_in_html(data)
                self.logger.debug("article data after img replacement: %s" % data)
            news['content_html'] = content_html

            extractor = G3News163Extractor(content_html)
            title, post_date, post_user, summary, content = extractor()
            news['content'] = content
            if len(news['content']) == 0:
                return
            else:
                yield news

        else:
            self.logger.warning("can't get content url: %s body: %s" %
                                (response.url, response.body_as_unicode()))
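The helper _replace_img_tag_in_html is not part of this listing. A minimal sketch, assuming the 163.com article API convention where the body holds placeholders such as <!--IMG#0--> and each entry in data['img'] maps a 'ref' placeholder to a 'src' URL (these field names are an assumption, not taken from the project):

    def _replace_img_tag_in_html(self, data):
        # Assumed layout: entry['ref'] is the placeholder text embedded in
        # the body, entry['src'] the image URL it should become.
        body = data["body"]
        for img in data.get("img", []):
            tag = '<img src="%s"/>' % img.get("src", "")
            body = body.replace(img.get("ref", ""), tag)
        return body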
Code example #2
File: __init__.py Project: xiaol/NewsCrawlerPG
    def g_ajax_news_meta_list(self, response):
        """
        Ajax request; the response body is usually JSON. Locate the
        relevant data fields according to the spider's configuration.
        :param response: scrapy response object
        :type response: scrapy.Response
        :return: news metadata parsed from the response
        :rtype: list[dict]
        """
        body = load_json_data(response.body_as_unicode())
        if body is None:
            self.logger.warning("can't get data: url: %s body: %s" %
                                (response.url, response.body_as_unicode()))
            return []
        # The configured path may miss; treat that as an empty list.
        items = self.get_dict_value(body, self.items_xpath) or []
        self.logger.info("item len: %s" % len(items))
        articles = list()
        for item in items:
            article = dict()
            article["title"] = self.get_dict_value(item, self.title_xpath)
            article["url"] = self.get_dict_value(item, self.url_xpath)
            if hasattr(self, "summary_xpath"):
                article["summary"] = self.get_dict_value(
                    item, self.summary_xpath)
            if hasattr(self, "thumb_xpath"):
                article["thumb"] = self.get_dict_value(item, self.thumb_xpath)
            articles.append(article)
        return articles
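get_dict_value resolves a configured key path (items_xpath, title_xpath, and so on) against parsed JSON. A minimal sketch, assuming a dot-separated path convention; the project's actual separator and miss handling may differ:

    def get_dict_value(self, data, path):
        # Walk nested dicts key by key, returning None on the first miss,
        # which matches how the callers above treat absent fields.
        for key in path.split("."):
            if not isinstance(data, dict):
                return None
            data = data.get(key)
        return data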
Code example #3
    def g_news_meta_list(self, response):
        articles = load_json_data(response.body)
        if articles is None:
            self.logger.error("spider has been banned for %s" %
                              response.request.url)
            return []
        else:
            return articles
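Every example relies on load_json_data returning None on a bad payload instead of raising. A minimal sketch consistent with that contract (the real helper in NewsCrawlerPG may additionally handle encodings):

import json

def load_json_data(raw):
    # Parse raw bytes/str as JSON; swallow parse errors so callers can
    # simply branch on None.
    try:
        return json.loads(raw)
    except (ValueError, TypeError):
        return None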
Code example #4
File: toutiao.py Project: xiaol/NewsCrawlerPG
    def g_news_meta_list(self, response):
        data = load_json_data(response.body)
        if data is not None:
            return data.get("data", [])
        else:
            self.logger.warning("can't get data: url: %s body: %s" %
                                (response.url, response.body_as_unicode()))
            return []
Code example #5
File: g3_163_com.py Project: xiaol/NewsCrawlerPG
    def g_news_meta_list(self, response):
        data = load_json_data(response.body)
        if not data:
            self.logger.error("spider has been banned for %s" %
                              response.request.url)
            return []
        # The payload is a JSON object with a single top-level key whose
        # value holds the article list.
        articles = data[next(iter(data))]
        if articles is None:
            self.logger.error("spider has been banned for %s" %
                              response.request.url)
            return []
        else:
            return articles
Code example #6
    def parse_news(self, response):
        news = response.meta["news"]
        data = load_json_data(response.body)
        if data and data.get("content"):
            body = '<div id="inner_article">' + data["content"] + "</div>"
            extractor = News163Extractor(body)
            title, post_date, post_user, summary, content = extractor()
            news["content"] = content
            news["content_html"] = body
            if len(news["content"]) == 0:
                return
            else:
                yield news
        else:
            self.logger.warning("can't get content url: %s body: %s" %
                                (response.url, response.body_as_unicode()))
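Both parse_news callbacks read a partially filled item from response.meta["news"]. A hedged sketch of how such a callback is typically wired up in scrapy (parse_meta_list and the "url" field are illustrative names, not taken from the project):

import scrapy

def parse_meta_list(self, response):
    # Attach each news dict to its detail request so parse_news can
    # enrich it with the extracted content and yield the final item.
    for news in self.g_news_meta_list(response):
        yield scrapy.Request(news["url"], callback=self.parse_news,
                             meta={"news": news})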
Code example #7
File: oushinet.py Project: xiaol/NewsCrawlerPG
    def g_news_meta_list(self, response):
        # Strip the fixed-width JSONP wrapper (callback name and trailing
        # ");") around the JSON payload before parsing.
        data = load_json_data(response.body[5:-3])
        if data is not None:
            ret = list()
            get_list = data.get("getList", "")
            soup = BeautifulSoup(get_list, 'lxml')
            lis = soup.find_all('li')
            for i in lis:
                a = i.find('a')
                if a is None:
                    continue
                item = dict()
                item['url'] = a['href']
                item['title'] = a['title']
                ret.append(item)
            return ret
        else:
            self.logger.warning("can't get data: url: %s body: %s" %
                                (response.url, response.body_as_unicode()))
            return []
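The [5:-3] slice assumes a fixed-width JSONP wrapper around the payload. A hypothetical response matching that assumption (the callback name "show" is illustrative, not taken from oushinet.com):

raw = b'show({"getList": "<li><a href=\'/n/1\' title=\'t1\'>t1</a></li>"});\n'
print(raw[5:-3])  # the bare JSON object, ready for load_json_data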