def parse_news(self, response):
    news = response.meta["news"]
    data = load_json_data(response.body_as_unicode().encode('utf8'))
    # Guard against a missing docid key instead of raising KeyError.
    data = data.get(news['docid']) if data else None
    if data and data.get("body"):
        data['body'] = '<div id="inner_article">' + data["body"] + "</div>"
        # Skip the fallback page served to outdated clients ("your news client
        # version is too old; upgrade to see richer news formats").
        keyword = u'您的新闻客户端版本太低啦,升级之后就能看到更丰富的新闻形式了'
        if keyword in data['body']:
            return
        content_html = data['body']
        if 'img' in data and data['img']:
            content_html = self._replace_img_tag_in_html(data)
        self.logger.debug("article data: %s" % data)
        news['content_html'] = content_html
        extractor = G3News163Extractor(content_html)
        title, post_date, post_user, summary, content = extractor()
        news['content'] = content
        if len(news['content']) == 0:
            return
        else:
            yield news
    else:
        self.logger.warning("can't get content url: %s body: %s"
                            % (response.url, response.body_as_unicode()))
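# Every handler in this section goes through a load_json_data helper that is
# not shown here. A minimal sketch, assuming it is simply a tolerant
# json.loads wrapper that returns None instead of raising on a bad payload:
import json


def load_json_data(raw):
    """Parse a JSON payload, returning None when it cannot be decoded."""
    try:
        return json.loads(raw)
    except (ValueError, TypeError):
        return None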
def g_ajax_news_meta_list(self, response):
    """
    Handle an AJAX request whose response body is usually JSON; locate the
    relevant data fields according to the spider's configuration.

    :param response: scrapy response object
    :type response: scrapy.Response
    :return: parsed news metadata
    :rtype: list[dict]
    """
    body = load_json_data(response.body_as_unicode())
    if body is None:
        self.logger.warning("can't get data: url: %s body: %s"
                            % (response.url, response.body_as_unicode()))
        return []
    items = self.get_dict_value(body, self.items_xpath)
    self.logger.info("item len: %s" % len(items))
    articles = list()
    for item in items:
        article = dict()
        article["title"] = self.get_dict_value(item, self.title_xpath)
        article["url"] = self.get_dict_value(item, self.url_xpath)
        if hasattr(self, "summary_xpath"):
            article["summary"] = self.get_dict_value(item, self.summary_xpath)
        if hasattr(self, "thumb_xpath"):
            article["thumb"] = self.get_dict_value(item, self.thumb_xpath)
        articles.append(article)
    return articles
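# g_ajax_news_meta_list relies on get_dict_value and the configured *_xpath
# attributes, neither of which appears in this section. A minimal sketch,
# assuming the "xpath" is a dotted key path into nested dicts
# (e.g. "data.list" -> body["data"]["list"]); the real helper may differ:
def get_dict_value(self, data, path):
    """Walk a nested dict along a dotted key path; None if any hop is missing."""
    for key in path.split('.'):
        if not isinstance(data, dict):
            return None
        data = data.get(key)
    return data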
def g_news_meta_list(self, response):
    articles = load_json_data(response.body)
    if articles is None:
        self.logger.error("spider has been banned for %s" % response.request.url)
        return []
    else:
        return articles
def g_news_meta_list(self, response):
    data = load_json_data(response.body)
    if data is not None:
        return data.get("data", [])
    else:
        self.logger.warning("can't get data: url: %s body: %s"
                            % (response.url, response.body_as_unicode()))
        return []
def g_news_meta_list(self, response):
    data = load_json_data(response.body)
    # Check for a failed parse before touching the dict; the original indexed
    # data.keys()[0] directly, which raises if data is None and is not
    # portable to Python 3.
    if data is None:
        self.logger.error("spider has been banned for %s" % response.request.url)
        return []
    # The payload is a JSON object with a single top-level key whose name
    # varies per request, so take its first (only) value.
    articles = next(iter(data.values()), None)
    if articles is None:
        self.logger.error("spider has been banned for %s" % response.request.url)
        return []
    else:
        return articles
def parse_news(self, response):
    news = response.meta["news"]
    data = load_json_data(response.body)
    if data and data.get("content"):
        body = '<div id="inner_article">' + data["content"] + "</div>"
        extractor = News163Extractor(body)
        title, post_date, post_user, summary, content = extractor()
        news["content"] = content
        news["content_html"] = body
        if len(news["content"]) == 0:
            return
        else:
            yield news
    else:
        self.logger.warning("can't get content url: %s body: %s"
                            % (response.url, response.body_as_unicode()))
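# Both parse_news variants unpack a five-tuple from calling the extractor
# instance. G3News163Extractor and News163Extractor are defined elsewhere; a
# minimal sketch of the calling convention they must satisfy (the field
# extraction here is a placeholder, not the real parsing logic):
class MinimalExtractor(object):
    def __init__(self, html):
        self.html = html

    def __call__(self):
        # Must return (title, post_date, post_user, summary, content).
        return '', '', '', '', self.html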
def g_news_meta_list(self, response):
    # The endpoint pads the JSON (JSONP-style); the [5:-3] slice strips the
    # wrapper so the remainder parses as plain JSON.
    data = load_json_data(response.body[5:-3])
    if data is not None:
        ret = list()
        # "getList" holds an HTML fragment; default to "" (not []) so
        # BeautifulSoup always receives a string.
        get_list = data.get("getList", "")
        soup = BeautifulSoup(get_list, 'lxml')
        lis = soup.find_all('li')
        for i in lis:
            item = dict()
            item['url'] = i.find('a')['href']
            item['title'] = i.find('a')['title']
            ret.append(item)
        return ret
    else:
        self.logger.warning("can't get data: url: %s body: %s"
                            % (response.url, response.body_as_unicode()))
        return []
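# The fixed [5:-3] slice above is brittle if the callback name ever changes
# length. A hedged alternative sketch (not part of the original spider) that
# unwraps generic "callback({...});" padding with a regex and falls back to
# the raw body when no wrapper is present:
import re


def strip_jsonp(raw):
    """Return the JSON payload inside a JSONP wrapper, or raw unchanged."""
    match = re.match(r'^[^(]*\((.*)\)\s*;?\s*$', raw.strip(), re.S)
    return match.group(1) if match else raw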