# -*- coding: utf-8 -*-
import re
from time import sleep

import requests
from lxml import etree
from pyquery import PyQuery as pq

# CrawlerUtils, DBUtils, Shop, Comment, logger, COOKIES, headers, proxies,
# api_url and page, as well as the get_css / get_tag / get_css_and_px_dict /
# get_last_value helpers, are assumed to be defined elsewhere in the project.


def get_data(url):
    """
    Crawl one shop-listing page and persist every shop found on it.

    :param url: listing-page URL to fetch
    :return: None
    """
    try:
        # Fetch the HTML for the URL.
        con = CrawlerUtils.get_html(url,
                                    headers=CrawlerUtils.get_headers(COOKIES),
                                    proxies=proxies)
        # Digits and text use different class prefixes, so each needs its
        # own regex.
        num_re_compile = "<b><span class=\"(.*?)\"></span>"  # digit classes
        str_re_compile = '<span class="addr"><span class=\"(.*?)\"></span>'  # address classes
        # URL of the obfuscation CSS file.
        css_url = get_css(con)
        # Class prefix used for digits.
        num_svg_tag = get_tag(re.findall(num_re_compile, con))
        # Class prefix used for text.
        str_svg_tag = get_tag(re.findall(str_re_compile, con))
        # Mapping from CSS class name to pixel offsets.
        css_and_px_dict = get_css_and_px_dict(css_url)
        # Parse the HTML with etree.HTML() to get an Element tree.
        doc = etree.HTML(con)
        # All shop nodes on the page.
        shops = doc.xpath('//div[@id="shop-all-list"]/ul/li')
        for shop in shops[1:]:
            # Shop id.
            shop_id = shop.xpath('.//div[@class="tit"]/a')[0].attrib["data-shopid"]
            logger.info("shop_id: %s", shop_id)
            # Shop name.
            shop_name = shop.xpath('.//div[@class="tit"]/a')[0].attrib["title"]
            comment_num = 0
            per_capita_price = taste = service = environment = 0
            # Star rating, encoded in the class name ("sml-str45" -> 45).
            star_num = int(
                str(shop.xpath('.//div[@class="comment"]/span')[0]
                    .attrib["class"]).split("sml-str")[-1])
            # Total number of reviews.
            comment_and_price_datas = shop.xpath('.//div[@class="comment"]')
            for comment_and_price_data in comment_and_price_datas:
                _comment_data = comment_and_price_data.xpath(
                    'a[@class="review-num"]/b/node()')
                comment_num = get_last_value(css_url, _comment_data,
                                             css_and_px_dict, num_svg_tag)
                comment_num = int(comment_num) if comment_num else 0
                if comment_num == 0:
                    # A zero count usually means the anti-crawl page was
                    # served; abort rather than save bad data.
                    return
                # Average price per person; strip the leading currency sign.
                _price = comment_and_price_data.xpath(
                    'a[@class="mean-price"]/b/node()')
                per_capita_price = get_last_value(css_url, _price,
                                                  css_and_px_dict, num_svg_tag)
                per_capita_price = int(per_capita_price[1:]) if per_capita_price else 0
            # Taste, service and environment scores.
            others_num_node = shop.xpath('.//span[@class="comment-list"]/span')
            for others_datas in others_num_node:
                label = others_datas.xpath('text()')
                values = others_datas.xpath('b/node()')
                if label and label[0] == u"口味":  # taste
                    taste = get_last_value(css_url, values,
                                           css_and_px_dict, num_svg_tag)
                    taste = float(taste) if taste else 0
                elif label and label[0] == u"服务":  # service
                    service = get_last_value(css_url, values,
                                             css_and_px_dict, num_svg_tag)
                    service = float(service) if service else 0
                elif label and label[0] == u"环境":  # environment
                    environment = get_last_value(css_url, values,
                                                 css_and_px_dict, num_svg_tag)
                    environment = float(environment) if environment else 0
            # Address.
            _ress = shop.xpath(
                './/div[@class="tag-addr"]/span[@class="addr"]/node()')
            address = get_last_value(css_url, _ress, css_and_px_dict,
                                     str_svg_tag, is_num=False)
            data = {
                'shop_id': shop_id,
                'shop_name': shop_name,
                'address': address,
                'per_capita_price': per_capita_price,
                'total_number_comments': comment_num,
                'stars': int(star_num),
                'taste': taste,
                'surroundings': environment,
                'serve': service,
            }
            logger.info('Saving data: %s' % (data,))
            # Persist the record.
            DBUtils.save_to_db(Shop, data)
    except Exception as e:
        logger.error(e)
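
# A minimal sketch (not used by the crawler) of the decoding trick behind
# get_css_and_px_dict / get_last_value: each obfuscated character is a <span>
# whose CSS class maps to an (x, y) background offset into an SVG glyph
# sheet. The sketch assumes a uniform glyph grid; the real helpers derive
# row thresholds and glyph width from the site's CSS and SVG files, and the
# names _decode_char / svg_rows / px_per_char below are hypothetical.
def _decode_char(css_and_px_dict, cls_name, svg_rows, px_per_char=14):
    # Pixel offsets recorded for this class name in the CSS file.
    x_px, y_px = css_and_px_dict[cls_name]
    # The offsets are negative background positions; dividing by the glyph
    # size yields the column and row inside the glyph sheet.
    col = abs(int(x_px)) // px_per_char
    row = abs(int(y_px)) // px_per_char
    return svg_rows[row][col]

# e.g. _decode_char({'abc1': (-42, -14)}, 'abc1',
#                   ['0123456789', '9876543210']) -> '6'
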
def spiderDazhong(shop_id, now_page):
    """
    Crawl one page of reviews for a shop.

    :param shop_id: id of the shop whose reviews are fetched
    :param now_page: 1-based page number of the review list
    :return: True if the caller should request another page (a next page
             exists, or the proxy was rotated and this page must be
             retried), False when the last page has been reached
    """
    global page
    global proxies
    try:
        url = ("http://www.dianping.com/shop/" + str(shop_id) +
               "/review_all/p" + str(now_page))
        print("url==========", url)
        req = requests.get(url, headers=headers)  # , proxies=proxies
        if CrawlerUtils.get_bs(req.text, 'div#log'):
            # Anti-crawl page detected: rotate the proxy and retry this page.
            proxies = CrawlerUtils.get_proxies(api_url)
            page -= 1
            return True
        # Build the mapping from SVG thresholds to character sets.
        css_url = CrawlerUtils.get_css(req.content.decode('utf-8'))
        str_re_compile = '<span class=\"(.*?)\"></span>'
        addr_tag = CrawlerUtils.get_bs(req.text, 'div.review-truncated-words')
        if addr_tag != []:
            addr_tag = str(addr_tag[0])
        else:
            addr_tag = str(CrawlerUtils.get_bs(req.text, 'div.review-words')[0])
        _tag = CrawlerUtils.get_tag(re.findall(str_re_compile, addr_tag))
        css_and_px_dict = CrawlerUtils.get_css_and_px_dict(css_url)
        doc = pq(req.text)
        if doc:
            # # Alternative: dump to a CSV file instead of the database.
            # out = open('./data/Stu_csv.csv', 'a', newline='', encoding="utf-8")
            # csv_write = csv.writer(out, dialect='excel')
            # shopName = doc("div.review-list-header > h1 > a").text()
            # shopurl = "http://www.dianping.com" + doc("div.review-list-header > h1 > a").attr("href")
            # csv_write.writerow(["shop name", "shop url"])
            # csv_write.writerow([shopName, shopurl])
            # csv_write.writerow(["user", "user id link", "stars", "score text",
            #                     "review", "time", "shop", "images", "is vip",
            #                     "recommended dishes"])
            # Parse the reviews.
            pinglunLi = doc("div.reviews-items > ul > li").items()
            li_num = 1
            for data in pinglunLi:
                # Reviewer name.
                userName = data("div.main-review > div.dper-info > a").text()
                shopName = data(
                    "div.main-review > div.misc-info.clearfix > span.shop").text()
                userID = ''
                user_id_link = data(
                    "div.main-review > div.dper-info > a").attr("href")
                if user_id_link is not None:
                    userID = "http://www.dianping.com" + user_id_link
                # Star rating, taken from the class name ("sml-str10" -> 10).
                star_span = data(
                    "div.review-rank > span.sml-rank-stars.sml-str10.star")
                if star_span:
                    startShop = str(star_span.attr("class")).split(" ")[1].replace("sml-str", "")
                else:
                    startShop = 0
                describeShop = data("div.review-rank > span.score").text()
                # Folded reviews keep the full text in "review-words Hide";
                # fall back to "review-words" otherwise.
                msg_values = etree.HTML(req.content.decode('utf-8')).xpath(
                    '//div[@class="reviews-items"]/ul/li[' + str(li_num) + ']'
                    '/div[@class="main-review"]'
                    '/div[@class="review-words Hide"]/node()')
                if msg_values == []:
                    msg_values = etree.HTML(req.content.decode('utf-8')).xpath(
                        '//div[@class="reviews-items"]/ul/li[' + str(li_num) + ']'
                        '/div[@class="main-review"]'
                        '/div[@class="review-words"]/node()')
                pinglunShop = CrawlerUtils.get_last_value(css_url, msg_values,
                                                          css_and_px_dict,
                                                          _tag, is_num=False)
                if pinglunShop == '':
                    li_num += 1  # keep the li index in step with the loop
                    continue
                timeShop = data(
                    "div.main-review > div.misc-info.clearfix > span.time").text()
                Shop = data(
                    "div.main-review > div.misc-info.clearfix > span.shop").text()
                imgShop = data("div > div.review-pictures > ul > li > a").items()
                imgList = []
                for img in imgShop:
                    imgList.append("http://www.dianping.com" + img.attr("href"))
                vip_span = data("div.main-review > div.dper-info > span")
                is_vip = 1 if vip_span.attr("class") == "vip" else 0
                recommend = data(
                    "div.main-review > div.review-recommend > a").text()
                print("===========", pinglunShop)
                # De-duplication key for this review.
                uuid = CrawlerUtils.get_md5_value(
                    str(shop_id) + pinglunShop + str(timeShop))
                # # Write this review as a CSV row.
                # csv_write.writerow([userName, userID, startShop, describeShop,
                #                     pinglunShop, timeShop, Shop, imgList])
                # print("successful insert csv!")
                print("userName (reviewer): {}\n"
                      "shopName: {}\n"
                      "userID (profile link): {}\n"
                      "startShop (stars): {}\n"
                      "describeShop (score text): {}\n"
                      "pinglunShop (review body): {}\n"
                      "timeShop (review time): {}\n"
                      "Shop (reviewed shop): {}\n"
                      "imgList (review images): {}\n"
                      "recommend (recommended dishes): {}\n"
                      "is_vip: {}\n".format(userName, shopName, userID,
                                            startShop, describeShop,
                                            pinglunShop, timeShop, Shop,
                                            imgList, recommend, is_vip))
                record = {
                    'uuid': uuid,
                    'shop_id': shop_id,
                    'username': userName,
                    'shop_name': shopName,
                    'is_vip': is_vip,
                    'comment': pinglunShop,
                    'recommend': recommend,
                    'stars': startShop,
                    'comment_create_time': timeShop,
                }
                DBUtils.save_to_db(Comment, record)
                li_num += 1
        sleep(1)
        # Stop when there is no NextPage link on this page.
        if not CrawlerUtils.get_tags(req.text, '.NextPage'):
            return False
        return True
    except Exception:
        # Propagate so the caller can decide how to handle failures.
        raise