Example #1
def get_promo_info(shop_id):
    """
    优惠券信息
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?shopId=' + str(
        shop_id) + '&cityId=19&mainCategoryId=2821&_token=' + str(get_token(
        shop_url)) + '&uuid=38af1c67-4a50-3220-06f6-bf9f16e71c41.1611146098&platform=1&partner=150&optimusCode=10' \
                     '&originUrl=' + shop_url
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Handle the captcha, then press Enter to continue: %s', verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']
        shop_address = BeautifulSoup(msg['address'],
                                     'lxml').text + BeautifulSoup(
                                         msg['crossRoad'], 'lxml').text
        shop_number = BeautifulSoup(msg['phoneNo'],
                                    'lxml').text + BeautifulSoup(
                                        msg['phoneNo2'], 'lxml').text
        return [shop_name, shop_address, shop_number]
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
Example #2
def get_basic_hidden_info(shop_id):
    """
    获取基础隐藏信息(名称、地址、电话号、cityid)
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?' \
          'shopId=' + str(shop_id) + '&_token=' + str(get_token(
        shop_url)) + '&tcv=ck9rmnrofg&uuid=6ca1f51a-7653-b987-3cd6-95f3aadb13b8.1619854599&platform=1' \
                     '&partner=150&optimusCode=10&originUrl=' + str(shop_url)
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Handle the captcha, then press Enter to continue: %s', verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']
        shop_address = BeautifulSoup(msg['address'],
                                     'lxml').text + BeautifulSoup(
                                         msg['crossRoad'], 'lxml').text
        shop_number = BeautifulSoup(msg['phoneNo'],
                                    'lxml').text + BeautifulSoup(
                                        msg['phoneNo2'], 'lxml').text
        return [shop_name, shop_address, shop_number]
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
Example #3
def get_basic_hidden_info(shop_id):
    """
    获取基础隐藏信息(名称、地址、电话号、cityid)
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/basicHideInfo?' \
          'shopId=' + str(shop_id) + \
          '&_token=' + str(get_token(shop_url)) + \
          '&tcv=' + str(spider_config.TCV) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + str(shop_url)
    # Retry loop to work around requests that intermittently fail
    retry_time = 5
    while True:
        retry_time -= 1
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Pre-filter captcha responses: only break on a clean 200
            if r_json['code'] == 200:
                break
            if retry_time == 0:
                logger.warning('Replace tcv and uuid')
                exit()
        except:
            pass
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Handle the captcha, then press Enter to continue:', verify_page_url)
        input()
    elif r_json['code'] == 200:
        msg = r_json['msg']['shopInfo']
        shop_name = msg['shopName']

        # Parenthesize each conditional expression; the original precedence
        # attached the `+` to the else-branch instead of joining both parts
        shop_address = (BeautifulSoup(msg['address'], 'lxml').text
                        if msg['address'] is not None else '') + \
                       (BeautifulSoup(msg['crossRoad'], 'lxml').text
                        if msg['crossRoad'] is not None else '')
        shop_number = (BeautifulSoup(msg['phoneNo'], 'lxml').text
                       if msg['phoneNo'] is not None else '') + ', ' + \
                      (BeautifulSoup(msg['phoneNo2'], 'lxml').text
                       if msg['phoneNo2'] is not None else '')
        return {
            '店铺id': shop_id,
            '店铺名': shop_name,
            '店铺地址': shop_address,
            '店铺电话': shop_number
        }
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
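Examples 3, 5, and 12 all inline the same request-retry-and-parse loop. A minimal sketch of how that pattern could be factored out is given here; fetch and decode are hypothetical stand-ins for requests_util.get_requests and requests_util.replace_json_text, which this listing does not define.

import json
import time

def fetch_json_with_retry(fetch, decode, max_retries=5, pause=1.0):
    """Retry until the decoded payload parses as json with code == 200."""
    for _ in range(max_retries):
        try:
            r_json = json.loads(decode(fetch()))
            if r_json.get('code') == 200:
                return r_json
        except ValueError:
            # the response was not valid json; retry
            pass
        time.sleep(pause)
    # mirrors the original give-up message: replace tcv and uuid
    raise RuntimeError('request kept failing; replace tcv and uuid')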
Example #4
def get_review_and_star(shop_id):
    """
    获取评分、人均,评论数
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?shopId=' + str(
        shop_id) + '&cityId=19&mainCategoryId=2821&_token=' + str(get_token(
        shop_url)) + '&uuid=38af1c67-4a50-3220-06f6-bf9f16e71c41.1611146098&platform=1&partner=150&optimusCode=10' \
                     '&originUrl=' + shop_url
    r = requests_util.get_requests(url, request_type='json')
    r_text = requests_util.replace_json_text(r.text, get_font_msg())
    r_json = json.loads(r_text)
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        logger.warning('Handle the captcha, then press Enter to continue: %s', verify_page_url)
        input()
    elif r_json['code'] == 200:
        shop_base_score = r_json['fiveScore']
        score_title_list = r_json['shopScoreTitleList']
        avg_price = BeautifulSoup(r_json['avgPrice'], 'lxml').text
        review_count = BeautifulSoup(r_json['defaultReviewCount'], 'lxml').text
        score_list = []
        for each in r_json['shopRefinedScoreValueList']:
            score_list.append(BeautifulSoup(each, 'lxml').text)
        scores = ''
        for i, score in enumerate(score_list):
            scores = scores + ' ' + score_title_list[i] + score_list[i]
        return [shop_base_score, scores, avg_price, review_count]
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
Example #5
def get_review_and_star(shop_id):
    """
    获取评分、人均,评论数
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/reviewAndStar?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19' \
          '&mainCategoryId=2821' \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + shop_url
    # Retry loop to work around requests that intermittently fail
    while True:
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Pre-filter captcha responses: only break on a clean 200
            if r_json['code'] == 200:
                break
        except:
            pass
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Handle the captcha, then press Enter to continue:', verify_page_url)
        input()
    elif r_json['code'] == 200:
        shop_base_score = r_json['fiveScore']
        score_title_list = r_json['shopScoreTitleList']
        avg_price = BeautifulSoup(r_json['avgPrice'], 'lxml').text
        review_count = BeautifulSoup(r_json['defaultReviewCount'], 'lxml').text
        score_list = []
        for each in r_json['shopRefinedScoreValueList']:
            score_list.append(BeautifulSoup(each, 'lxml').text)
        # scores = ''
        # for i, score in enumerate(score_list):
        #     scores = scores + ' ' + score_title_list[i] + score_list[i]
        scores = {}
        for i, score in enumerate(score_list):
            scores[score_title_list[i]] = score_list[i]
        # return [shop_base_score, scores, avg_price, review_count]
        return {
            '店铺id': shop_id,
            '店铺总分': shop_base_score,
            '店铺评分': scores,
            '人均价格': avg_price,
            '评论总数': review_count
        }
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
Example #6
def download_woff(woff_url, filename):
    """
    下载字体文件
    :param woff_url:
    :param filename:
    :return:
    """
    r = requests_util.get_requests(woff_url, need_header=False)
    with open('./tmp/' + filename, 'wb') as f:
        f.write(r.content)
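A hedged usage sketch for the font pipeline: the woff url below is invented, and the woff -> xml -> json behavior of parse_woff/parse_xml is inferred from how get_search_map_file calls them (Example #9).

create_dir('./tmp')  # cache directory used throughout this listing
# hypothetical font url; real ones are extracted from the page css
download_woff('https://s3plus.meituan.net/v1/example/font.woff', 'font.woff')
parse_woff('font.woff')  # assumed to emit ./tmp/font.xml
parse_xml('font.xml')    # assumed to emit the json glyph mapping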
Example #7
    def get_detail_font_mapping(self, shop_id):
        """
        Get the detail page's font mapping; no parsing needed, just the
        encrypted font map, for json use
        @param shop_id:
        @return:
        """
        url = 'http://www.dianping.com/shop/' + str(shop_id)
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        # Some flagged IPs (like mine, dammit!) may need a cookie to be allowed access
        if r.status_code == 403:
            r = requests_util.get_requests(url,
                                           request_type='no proxy, cookie')
            if r.status_code == 403:
                logger.error('Use a proxy, buddy')
                exit()
        text = r.text
        file_map = get_search_map_file(text)
        cache.search_font_map = file_map
        return file_map
Example #8
    def search(self, key_word, only_need_first=True, needed_pages=50):
        """
        搜索
        :param key_word: 关键字
        :param only_need_first: 只需要第一条
        :param needed_pages: 需要多少页
        :return:
        """
        # Todo: detail pages and reviews are not needed, just search-page results; no cookie needed
        assert isinstance(key_word, str)
        assert key_word is not None and key_word.strip() != ''
        if self.custom_search_url != '':
            key_word = self.custom_search_url
        logger.info('Start searching: ' + key_word)
        # header = self.get_header()
        for i in tqdm(range(1, needed_pages + 1), desc='pages'):
            # If only the first result is needed, break out of the page loop
            if only_need_first is True and i != 1:
                break

            url = 'http://www.dianping.com/search/keyword/' + str(
                self.location_id) + '/' + str(
                    self.channel_id) + '_' + str(key_word) + '/p' + str(i)
            if self.custom_search_url != '':
                url = self.custom_search_url + str(i)
            r = requests_util.get_requests(url)
            # r = requests.get(url, headers=header)
            text = r.text
            # Fetch the encrypted font mapping files
            file_map = get_search_map_file(text)
            # Replace the encrypted characters
            text = requests_util.replace_search_html(text, file_map)

            # Parse the page
            html = BeautifulSoup(text, 'lxml')
            shop_all_list = html.select('.shop-list')[0].select('li')

            search_res = []
            for shop in shop_all_list:
                try:
                    image_path = shop.select('.pic')[0].select('a')[0].select(
                        'img')[0]['src']
                except:
                    image_path = '-'
                try:
                    shop_id = shop.select('.txt')[0].select('.tit')[0].select(
                        'a')[0]['data-shopid']
                except:
                    shop_id = '-'
                try:
                    detail_url = shop.select('.txt')[0].select(
                        '.tit')[0].select('a')[0]['href']
                except:
                    detail_url = '-'
                try:
                    name = shop.select('.txt')[0].select('.tit')[0].select(
                        'a')[0].text.strip()
                except:
                    name = '-'
                # Two star formats: some pages show an exact star score, others an icon
                # Parse the icon
                try:
                    star_point = \
                        shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                            1].split('_')[1]
                    star_point = float(star_point) / 10
                    star_point = str(star_point)
                except:
                    star_point = '-'
                # Parse the exact star score
                try:
                    star_point = \
                        shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                    star_point = float(star_point)
                    star_point = str(star_point)
                except:
                    pass
                try:
                    review_number = shop.select('.txt')[0].select(
                        '.comment')[0].select('.review-num')[0].text.replace(
                            '\n', '')
                except:
                    review_number = '-'
                try:
                    mean_price = shop.select('.txt')[0].select('.comment')[
                        0].select('.mean-price')[0].select('b')[0].text
                except:
                    mean_price = '¥0'
                try:
                    tags = shop.select('.txt')[0].select(
                        '.tag-addr')[0].select('.tag')
                    tag1 = tags[0].text.replace('\n', ' ').strip()
                    tag2 = tags[1].text.replace('\n', ' ').strip()
                except:
                    tag1 = '-'
                    tag2 = '-'
                try:
                    addr = shop.select('.txt')[0].select(
                        '.tag-addr')[0].select('.addr')[0].text.replace(
                            '\n', ' ').strip()
                except:
                    addr = '-'
                try:
                    recommend = shop.select('.recommend')[0].text.replace(
                        '\n', ' ').strip()
                except:
                    recommend = '-'
                try:
                    commend_list = shop.select(
                        '.comment-list')[0].text.replace('\n', ' ').strip()
                except:
                    commend_list = '-'
                one_step_search_res = [
                    shop_id, name, star_point, review_number, mean_price, tag1,
                    tag2, addr, recommend, commend_list, image_path,
                    detail_url, 1, 1
                ]  # the last two fields are search success flags
                # this data structure is currently unused
                search_res.append(one_step_search_res)
                # Only the first result is needed; break
                if only_need_first is True:
                    break
                # Parse the detail page
                if self.need_detail == '1':
                    try:
                        detail = Detail().get_detail(shop_id)
                        print('\n' + ','.join(detail) + '\n')
                        self.saver.save_data([detail], 'detail')
                    except:
                        # Set the failure flag
                        one_step_search_res[-2] = 0
                        logger.warning('Failed to fetch detail info, id: ' + shop_id)
                        # map(str, ...) because the trailing flags are ints
                        print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                        if self.jump_wait is False:
                            print(
                                'Check the browser and handle the captcha; enter y to continue or n to skip this check:',
                                'http://www.dianping.com/shop/' + str(shop_id))
                            # read the answer once; the original called input() twice
                            choice = input()
                            if choice == 'y':
                                continue
                            elif choice == 'n':
                                self.jump_wait = True
                else:
                    print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                # Parse the review page
                if self.need_comment == '1':
                    try:
                        review = Review().get_review(shop_id)
                        print('Fetched', len(review), 'reviews for', name)
                        self.saver.save_data(review, 'review')
                    except:
                        # Set the failure flag
                        one_step_search_res[-1] = 0
                        logger.warning('Failed to fetch reviews, id: ' + shop_id)

                # Save the data
                self.saver.save_data([one_step_search_res], 'search')
        logger.info('Finished parsing: ' + key_word)
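The method above (and Example #13 below) wraps every field in its own try/except with a '-' fallback. Under the same assumptions (BeautifulSoup nodes, blanket exception handling), the pattern could be collapsed into a helper like this sketch; select_text is a name invented here, not part of the original code.

def select_text(node, *selectors, default='-'):
    # Walk a chain of css selectors, taking the first match at each hop,
    # and return `default` if any hop is missing
    try:
        for sel in selectors:
            node = node.select(sel)[0]
        return node.text.strip()
    except (IndexError, AttributeError):
        return default

# e.g.: name = select_text(shop, '.txt', '.tit', 'a')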
Example #9
def get_search_map_file(page_source):
    """
    获取搜索页映射文件
    :param page_source: 页面源码
    :return:
    """
    # Create the temp cache directory
    create_dir('./tmp')
    # The json mapping to return
    return_file_map = {}
    # If no font css file can be parsed out of the page, we are being blocked or the cookie has expired
    try:
        font_base_url = re.findall(' href="(//s3plus.meituan.net/v1/.*?)">',
                                   page_source)[0]
    except:
        global_logger.warning("Cookie expired or access restricted; update the cookie or complete Dianping's slider captcha")
        sys.exit()
    # global_logger.info('Updating the search page encrypted font mapping files')
    font_base_url = 'https:' + font_base_url
    # header = get_header()
    # r = requests.get(font_base_url, headers=header)
    r = requests_util.get_requests(url=font_base_url, need_header=False)
    text = r.text
    woff_urls = re.findall(r',url\("(.*?\.woff"\).*?\{)', text)

    # Raise the logger level: woff parsing emits noisy, irrelevant logs; suppress them
    logger = logging.getLogger()
    logger.setLevel(logging.WARNING)

    # Process the woff links in the css
    # NB: plain substring membership checks are kept from the original logic,
    # so e.g. 'num' also matches urls whose class name contains 'shopNum',
    # and 'review' also matches 'reviewTag'
    font_keys = ['address', 'shopNum', 'tagName', 'reviewTag', 'num',
                 'dishname', 'shopdesc', 'review', 'hours']
    for each in woff_urls:
        for key in font_keys:
            if key not in each:
                continue
            map_woff_url = 'https:' + re.findall(r'(//.*?woff)', each)[0]
            # Derive the cache file name from the url
            file_name = map_woff_url[-13:-5]
            return_file_map[key] = './tmp/' + file_name + '.json'
            # If the json file already exists, skip parsing this url entirely
            if os.path.exists('./tmp/' + file_name + '.json'):
                break
            # Download the font file, then parse it and drop the intermediates
            download_woff(map_woff_url, file_name + '.woff')
            parse_woff(file_name + '.woff')
            parse_xml(file_name + '.xml')
            os.remove('./tmp/' + file_name + '.woff')
            os.remove('./tmp/' + file_name + '.xml')
    # Restore the logger level
    logger = logging.getLogger()
    logger.setLevel(logging.INFO)

    # global_logger.info('Encrypted font mapping files fetched')
    return return_file_map
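For reference, the returned mapping pairs each font class with the cached json file that decodes it. A plausible shape, with invented file names, would be:

return_file_map = {
    'address': './tmp/7643bc91.json',
    'shopNum': './tmp/0e8ae2f1.json',
    'num': './tmp/0e8ae2f1.json',  # 'num' substring-matches the shopNum url too
}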
Example #10
def get_review_map_file(page_source):
    """
    获取评论页加密文件
    :param page_source:
    :return:
    """
    create_dir('./tmp')
    # If no font css file can be parsed out of the page, we are being blocked or the cookie has expired
    try:
        css_url = 'https:' + re.findall(
            ' href="(//s3plus.meituan.net/v1/.*?)">', page_source)[0]
    except:
        global_logger.warning("Cookie expired or access restricted; update the cookie or complete Dianping's slider captcha")
        sys.exit()
    # Download the css file
    r = requests_util.get_requests(css_url, need_header=False)
    with open('./tmp/review_css.css', 'wb') as f:
        f.write(r.content)
    # Parse the css file
    css_role = re.findall(r'.(.*?)\{background:-(.*?)px -(.*?)px;}', r.text,
                          re.S)
    css_loc = []

    for each in css_role:
        # Skip svg entries that the regex also captures from the css
        if '[' in each[0]:
            continue
        css_loc.append([each[0], int(float(each[1])), int(float(each[2]))])

    # Parse the svg fonts
    svg_url = re.findall(
        r'\[class\^="(.*?)"\].*?url\((//s3plus.meituan.net/v1/.*?)\)', r.text,
        re.S)
    svg_map = {}
    return_svg_name = {}
    for each in svg_url:
        url = 'https:' + each[1]
        r = requests_util.get_requests(url, need_header=False)
        svg_name = each[1][-18:-3] + 'json'
        # Check for a cached json file to save parsing time
        if os.path.exists('./tmp/' + svg_name):
            return_svg_name[each[0]] = './tmp/' + svg_name
            continue

        # Font type, used to distinguish the height/width offsets of the different fonts
        if '#333' in r.text:
            font_height_offset = 23
            font_weight_offset = 0
        elif '#666' in r.text:
            font_height_offset = 15
            font_weight_offset = 0
        else:
            global_logger.warning('Review page font changed; try patching the code or contact the author')
            sys.exit()
        # Parse the first file format
        re_font_loc = re.findall('<path id="(.*?)" d="M0 (.*?) H600"/>',
                                 r.text)
        font_loc = {}
        for i in range(len(re_font_loc)):
            font_loc[int(re_font_loc[i][1])] = i + 1
        font_list = re.findall('>(.*?)</textPath>', r.text)
        # If the first format fails to parse, try the second file format
        if len(font_loc) == 0:
            font_loc = {}
            font_list = []
            font_loc_tmp = re.findall('<text x=".*?" y="(.*?)">(.*?)</text>',
                                      r.text)
            for i in range(len(font_loc_tmp)):
                font_loc[int(font_loc_tmp[i][0])] = i + 1
                font_list.append(font_loc_tmp[i][1])

        # Todo: svg_map keeps the previous storage layout, which is redundant now, but it stays for simplicity; to be cleaned up in a future refactor
        svg_map[each[0]] = [
            font_loc, font_list, font_height_offset, font_weight_offset,
            svg_name, each[0]
        ]

        css_map_result = {}
        css_key = each[0][:3]

        # Match this font's entries in the parsed css
        for each_css in css_loc:
            if each_css[0][:len(each[0])] != each[0]:
                continue
            loc_x, loc_y = each_css[1], each_css[2]
            # Height/width offsets for this font
            font_height_offset, font_weight_offset = svg_map[css_key][
                2], svg_map[css_key][3]
            # Compute the character position
            loc_x_line, loc_y_line = (
                loc_x + font_weight_offset) // 14, svg_map[css_key][0][
                    loc_y + font_height_offset]
            # Look up the character
            css_value = svg_map[css_key][1][loc_y_line - 1][loc_x_line]
            css_map_result[each_css[0]] = css_value
        # Save the json file
        with open('./tmp/' + str(svg_map[css_key][4]), 'w',
                  encoding='utf-8') as f:
            json.dump(css_map_result, f, ensure_ascii=False)
        return_svg_name[str(
            svg_map[css_key][5])] = './tmp/' + str(svg_map[css_key][4])

    return return_svg_name
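A worked example of the coordinate lookup above, with invented numbers: suppose a css rule reads .abc12{background:-42px -1512px;} and the svg uses the #666 font (font_height_offset = 15, font_weight_offset = 0, glyphs 14px wide).

loc_x, loc_y = 42, 1512
font_height_offset, font_weight_offset = 15, 0
font_loc = {1527: 1}            # svg path at y=1527 is text line 1
font_list = ['大众点评示例行']  # the glyphs on that line
loc_x_line = (loc_x + font_weight_offset) // 14    # -> 3
loc_y_line = font_loc[loc_y + font_height_offset]  # -> 1
css_value = font_list[loc_y_line - 1][loc_x_line]  # -> '评'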
Example #11
    def get_detail(self, shop_id, request_type='proxy, cookie'):
        url = 'http://www.dianping.com/shop/' + str(shop_id)
        r = requests_util.get_requests(url, request_type=request_type)
        if r.status_code == 403:
            print('Check the browser, handle the captcha, and replace the cookie; enter y to lift the block:',
                  'http://www.dianping.com/shop/' + str(shop_id))
            import time
            while input() != 'y':
                time.sleep(1)
            requests_util.update_cookie()
            r = requests_util.get_requests(url, request_type=request_type)
        text = r.text
        # Fetch the encrypted font mapping files
        file_map = get_search_map_file(text)
        # Replace the encrypted characters
        text = requests_util.replace_search_html(text, file_map)
        # Parse the page
        html = BeautifulSoup(text, 'lxml')
        """
        解析格式1(一般餐饮居多)
        """
        # 基础信息
        main_info = html.select('.main')[0]

        shop_name = '-'
        review_count = '-'
        avg_price = '-'
        score = '-'
        address = '-'
        phone = '-'
        other_info = '-'
        try:
            base_info = main_info.select('#basic-info')[0]
            try:
                shop_name = base_info.select('.shop-name')[0].text
                # Strip title suffixes, e.g.: 手机扫码 优惠买单
                remove_a = base_info.select('a')
                for each in remove_a:
                    shop_name = shop_name.replace(each.text, '')
                shop_name = shop_name.strip()
            except:
                shop_name = '-'
            try:
                brief_info = main_info.select('.brief-info')[0]
                # Todo: served by a separate json endpoint with js-encrypted params, to be solved later; the field is kept since other parsing paths can sometimes fill it
                # try:
                #     score = brief_info.select('.star-wrapper')[0].select('.mid-score')[0].text.strip()
                # except:
                #     score = None
                try:
                    review_count = brief_info.select(
                        '#reviewCount')[0].text.strip()
                except:
                    review_count = '-'
                try:
                    avg_price = brief_info.select(
                        '#avgPriceTitle')[0].text.strip()
                except:
                    avg_price = '-'

                # Todo: prefer the value from the info endpoint; this one may be inaccurate, since the dynamic part is returned via json
                # try:
                #     comment_score = brief_info.select('#comment_score')[0].text.strip()
                # except:
                #     comment_score = None

                try:
                    address = main_info.find(attrs={
                        'itemprop': 'street-address'
                    }).text.strip()
                except:
                    address = '-'

                try:
                    phone = main_info.select('.tel')[0].text.strip()
                except:
                    phone = '-'

                try:
                    other_info = main_info.select('.other')[0].text.replace(
                        '修改', '').strip()
                except:
                    other_info = '-'
            except:
                # Todo: show a manual slider-unlock prompt in the frontend
                # self.get_detail(shop_id)
                pass
            # Todo: promo info (separate endpoint, js-encrypted)
            # try:
            #     sale_info = ''
            #     sales = main_info.select('#sales')
            #     for sale in sales:
            #         for tag in sale.select('.item'):
            #             try:
            #                 title = tag.select('.title')[0].text
            #                 price = tag.select('.price')[0].text
            #                 del_price = tag.select('.del-price')[0].text
            #                 sale_info += title + '\t' + price + '\t' + del_price + '\n'
            #             except:
            #                 continue
            # except:
            #     sales = None
        except:
            # Fall back to the other parsing format
            pass
        """
        解析格式2(一般酒店居多)
        """
        # Todo 这种解析方式没有加密,会在解析加密文件时报错,反正这种格式数量不多,暂时不做更改了
        # if shop_name is '-':
        #     # 名称解析不到,换一种解析方式
        #     try:
        #         base_info = html.select('base-info')[0]
        #         try:
        #             shop_name = base_info.select('.hotel-title')[0].text
        #         except:
        #             shop_name = None
        #         try:
        #             address = base_info.find(attrs={'itemprop': 'address'}).text.strip()
        #         except:
        #             address = None
        #         try:
        #             score = base_info.select('.hotel-scope')[0].select('.score')[0].text
        #         except:
        #             score = None
        #     except:
        #         # Todo: show a manual slider-unlock prompt in the frontend
        #         # self.get_detail(shop_id)
        #         pass
        #     pass
        detail_info = {
            '店铺id': shop_id,
            '店铺名': shop_name,
            '评论总数': review_count,
            '人均价格': avg_price,
            '店铺地址': address,
            '店铺电话': phone,
            '其他信息': other_info
        }
        return detail_info
Example #12
def get_basic_review(shop_id):
    """
    获取评分、人均,评论数
    @param shop_id:
    @return:
    """
    assert len(shop_id) == len('H2noKWCDigM0H9c1')
    shop_url = get_shop_url(shop_id)
    url = 'http://www.dianping.com/ajax/json/shopDynamic/allReview?' \
          'shopId=' + str(shop_id) + \
          '&cityId=19' \
          '&shopType=10' \
          '&tcv=' + str(spider_config.TCV) + \
          '&_token=' + str(get_token(shop_url)) + \
          '&uuid=' + str(spider_config.UUID) + \
          '&platform=1' \
          '&partner=150' \
          '&optimusCode=10' \
          '&originUrl=' + shop_url
    # Retry loop to work around requests that intermittently fail
    while True:
        r = requests_util.get_requests(url, request_type='proxy, no cookie')
        r_text = requests_util.replace_json_text(r.text, get_font_msg())
        try:
            r_json = json.loads(r_text)
            # Pre-filter captcha responses: only break on a clean 200
            if r_json['code'] == 200:
                break
        except:
            pass
    # Captcha handling
    if r_json['code'] == 406:
        verify_page_url = r_json['customData']['verifyPageUrl']
        print('Handle the captcha, then press Enter to continue:', verify_page_url)
        input()
        # return the retried result; the original discarded the recursive call
        return get_basic_review(shop_id)
    elif r_json['code'] == 200:
        # Get the review tags and the count of each tag
        summaries = []
        for summary in r_json['summarys']:
            summaries.append({
                '描述': summary['summaryString'],
                '个数': summary['summaryCount']
            })

        # Get the review-count info
        all_review_count = r_json['reviewCountAll']
        review_with_pic_count = r_json['reviewCountPic']
        good_review_count = r_json['reviewCountGood']
        mid_review_count = r_json['reviewCountCommon']
        bad_review_count = r_json['reviewCountBad']

        # Get the featured review details
        reviews = []
        for review in r_json['reviewAllDOList']:
            # Basic review info
            review_info = review['reviewDataVO']
            review_id = review_info['reviewData']['reviewId']
            review_star = review_info['reviewData']['star']
            review_body = BeautifulSoup(review_info['reviewData']['reviewBody'], 'lxml').text
            review_vote_count = review_info['reviewData']['voteCount']
            review_reply_count = review_info['reviewData']['replyCount']
            review_view_count = review_info['reviewData']['viewCount']

            # Liked dishes
            if review_info['reviewData']['extInfoList'] is not None:
                review_like_dish = review_info['reviewData']['extInfoList'][0]['values']
            else:
                review_like_dish = []

            review_avg_price = review_info['reviewData']['avgPrice']
            review_publish_time = review_info['addTimeVO']
            # Merchant reply
            review_merchant_reply = review_info['followNoteString']

            # User review pictures
            if review['picList'] is not None:
                review_pic_list = []
                for each_pic in review['picList']:
                    review_pic_list.append(each_pic['bigPicture'])
            else:
                review_pic_list = []

            # Get user info
            review_username = review['user']['userNickName']
            user_id = review['user']['userId']

            # each_review = [shop_id, review_id, user_id, review_username, review_star, review_body, review_vote_count,
            #                review_reply_count, review_view_count, review_avg_price, review_like_dish,
            #                review_publish_time, review_merchant_reply, review_pic_list]
            each_review = {
                '店铺id': shop_id,
                '评论id': review_id,
                '用户id': user_id,
                '用户名': review_username,
                '用户打分': review_star,
                '评论内容': review_body,
                '点赞个数': review_vote_count,
                '回复个数': review_reply_count,
                '浏览次数': review_view_count,
                '人均价格': review_avg_price,
                '喜欢的菜': review_like_dish,
                '发布时间': review_publish_time,
                '商家回复': review_merchant_reply,
                '评论图片': review_pic_list,
            }
            reviews.append(each_review)

        # Recommended dishes
        dish_tag_list = r_json['dishTagStrList']

        # return [summaries, all_review_count, good_review_count, mid_review_count, bad_review_count,
        #         review_with_pic_count, reviews, dish_tag_list]
        return {
            '店铺id': shop_id,
            '评论摘要': summaries,
            '评论总数': all_review_count,
            '好评个数': good_review_count,
            '中评个数': mid_review_count,
            '差评个数': bad_review_count,
            '带图评论个数': review_with_pic_count,
            '精选评论': reviews,
            '推荐菜': dish_tag_list,
        }
    else:
        logger.warning('Unexpected json response code; try patching the code and opening a PR, or file an issue')
Example #13
    def search(self, search_url, request_type='proxy, cookie'):
        """
        搜索
        :param key_word: 关键字
        :param only_need_first: 只需要第一条
        :param needed_pages: 需要多少页
        :return:
        """
        r = requests_util.get_requests(search_url, request_type=request_type)
        text = r.text
        # Fetch the encrypted font mapping files
        file_map = get_search_map_file(text)
        # Replace the encrypted characters
        text = requests_util.replace_search_html(text, file_map)

        # Parse the page
        html = BeautifulSoup(text, 'lxml')
        shop_all_list = html.select('.shop-list')[0].select('li')

        search_res = []
        for shop in shop_all_list:
            try:
                image_path = shop.select('.pic')[0].select('a')[0].select('img')[0]['src']
            except:
                image_path = '-'
            try:
                shop_id = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['data-shopid']
            except:
                shop_id = '-'
            try:
                detail_url = shop.select('.txt')[0].select('.tit')[0].select('a')[0]['href']
            except:
                detail_url = '-'
            try:
                name = shop.select('.txt')[0].select('.tit')[0].select('a')[0].text.strip()
            except:
                name = '-'
            # Two star formats: some pages show an exact star score, others an icon
            # Parse the icon
            try:
                star_point = \
                    shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                        1].split('_')[1]
                star_point = float(star_point) / 10
                star_point = str(star_point)
            except:
                star_point = '-'
            # Parse the exact star score
            try:
                star_point = \
                    shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                star_point = float(star_point)
                star_point = str(star_point)
            except:
                pass
            try:
                review_number = shop.select('.txt')[0].select('.comment')[0].select('.review-num')[0].text.replace(
                    '\n', '')
            except:
                review_number = '-'
            try:
                mean_price = shop.select('.txt')[0].select('.comment')[0].select('.mean-price')[0].select('b')[
                    0].text
            except:
                mean_price = '¥0'
            try:
                tags = shop.select('.txt')[0].select('.tag-addr')[0].select('.tag')
                tag1 = tags[0].text.replace('\n', ' ').strip()
                tag2 = tags[1].text.replace('\n', ' ').strip()
            except:
                tag1 = '-'
                tag2 = '-'
            try:
                addr = shop.select('.txt')[0].select('.tag-addr')[0].select('.addr')[0].text.replace('\n',
                                                                                                     ' ').strip()
            except:
                addr = '-'
            try:
                recommend = shop.select('.recommend')[0].text.replace('\n', ' ').strip()
            except:
                recommend = '-'
            try:
                comment_list = shop.select('.comment-list')[0].text.replace('\n', ' ').strip()
            except:
                comment_list = '-'
            one_step_search_res = {
                '店铺id': shop_id,
                '店铺名': name,
                '评论个数': review_number,
                '人均价格': mean_price,
                '标签1': tag1,
                '标签2': tag2,
                '店铺地址': addr,
                '详情链接': detail_url,
                '图片链接': image_path,
                '详细评分': comment_list,
                '推荐菜': recommend,
                '店铺均分': star_point,
            }
            search_res.append(one_step_search_res)
            # yield one_step_search_res
        return search_res
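A hedged usage sketch, assuming this method lives on a Search-like class as in Example #8; the class name and search url are illustrative only.

searcher = Search()  # hypothetical enclosing class
results = searcher.search('http://www.dianping.com/search/keyword/1/0_咖啡/p1')
for shop in results:
    print(shop['店铺id'], shop['店铺名'], shop['店铺均分'])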
Example #14
    def get_review(self, shop_id):
        all_pages = -1
        cur_pages = 1
        all_review = []
        while all_pages == -1 or all_pages > 0:
            url = 'http://www.dianping.com/shop/' + str(
                shop_id) + '/review_all/p' + str(cur_pages)
            # Requesting p1 triggers a captcha, so the first page is handled separately
            if cur_pages == 1:
                url = 'http://www.dianping.com/shop/' + str(
                    shop_id) + '/review_all'
            r = requests_util.get_requests(url, request_type='review')
            if r.status_code == 403:
                logger.warning('Review page request banned')
                raise Exception

            text = r.text
            # Fetch the encrypted font mapping files
            file_map = get_review_map_file(text)
            # Replace the encrypted characters
            text = requests_util.replace_review_html(text, file_map)
            html = BeautifulSoup(text, 'lxml')
            # Update the page count
            if all_pages == -1:
                all_pages = min(
                    int(html.select('.reviews-pages')[0].select('a')[-2].text),
                    int(self.pages_needed))

            reviews = html.select('.reviews-items')[0].select('.main-review')
            for review in reviews:
                # single_review = []
                try:
                    user_name = review.select('.name')[0].text.strip()
                except:
                    user_name = '-'
                try:
                    score = review.select('.score')[0].text.replace(
                        ' ', '').replace('\n', ' ').strip()
                except:
                    score = '-'
                try:
                    review_text = review.select('.review-words')[0].text.replace(' ', ''). \
                        replace('收起评价', '').replace('\r', ' ').replace('\n', ' ').strip()
                except:
                    review_text = '-'
                try:
                    like = review.select('.review-recommend')[0].text.replace(' ', '').\
                        replace('\r', ' ').replace('\n', ' ').strip()
                except:
                    like = '-'
                try:
                    time = review.select('.time')[0].text.strip()
                except:
                    time = '-'
                try:
                    review_id = review.select('.actions')[0].select(
                        'a')[0].attrs['data-id']
                except:
                    review_id = '-'
                all_review.append([
                    review_id, shop_id, user_name, score, review_text, like,
                    time
                ])
            cur_pages += 1
            all_pages -= 1
        return all_review
Example #15
    def get_review(self, shop_id, request_type='proxy, cookie'):
        all_pages = -1
        cur_pages = 1
        all_review = []
        while all_pages == -1 or all_pages > 0:
            url = 'http://www.dianping.com/shop/' + str(
                shop_id) + '/review_all/p' + str(cur_pages)
            # Requesting p1 triggers a captcha, so the first page is handled separately
            if cur_pages == 1:
                url = 'http://www.dianping.com/shop/' + str(
                    shop_id) + '/review_all'
            r = requests_util.get_requests(url, request_type=request_type)
            if r.status_code == 403:
                logger.warning('Review page request banned')
                raise Exception

            text = r.text
            # Fetch the encrypted font mapping files
            file_map = get_review_map_file(text)
            # Replace the encrypted characters
            text = requests_util.replace_review_html(text, file_map)
            html = BeautifulSoup(text, 'lxml')
            # Update the page count
            if all_pages == -1:
                all_pages = min(
                    int(html.select('.reviews-pages')[0].select('a')[-2].text),
                    int(self.pages_needed))
                # Things that only need to be parsed once, such as review counts, also go here
                summaries = []
                for summary in html.select('.content')[0].select('span'):
                    tag_string = summary.text.strip().replace('\n', '').split()
                    string = tag_string[0]
                    count = tag_string[1][1:-1]
                    summaries.append({
                        '描述': string,
                        '个数': count,
                    })
                # Various review counts
                review_with_pic_count = html.select('.filter-pic')[0].select(
                    '.count')[0].text[1:-1]
                good_review_count = html.select('.filter-good')[0].select(
                    '.count')[0].text[1:-1]
                mid_review_count = html.select('.filter-middle')[0].select(
                    '.count')[0].text[1:-1]
                bad_review_count = html.select('.filter-bad')[0].select(
                    '.count')[0].text[1:-1]
                try:
                    all_review_count = int(good_review_count) + int(
                        mid_review_count) + int(bad_review_count)
                except:
                    all_review_count = '-'

            reviews = html.select('.reviews-items')[0].select('.main-review')
            for review in reviews:
                try:
                    review_username = review.select('.name')[0].text.strip()
                except:
                    review_username = '-'

                try:
                    user_id = review.select('.name')[0]['href'].split('/')[-1]
                except:
                    user_id = '-'

                try:
                    review_score_detail = {}
                    review_avg_price = ''
                    review_score_detail_temp = review.select(
                        '.score')[0].text.replace(' ', '').replace(
                            '\n', ' ').strip().split()
                    for each in review_score_detail_temp:
                        if '人均' in each:
                            review_avg_price = each.split(':')[1].replace(
                                '元', '')
                        else:
                            temp = each.split(':')
                            review_score_detail[temp[0]] = temp[1]
                except:
                    review_score_detail = {}
                    review_avg_price = ''

                try:
                    review_text = review.select('.review-words')[0].text.replace(' ', ''). \
                        replace('收起评价', '').replace('\r', ' ').replace('\n', ' ').strip()
                except:
                    review_text = '-'
                try:
                    review_like_dish = review.select('.review-recommend')[0].text.replace(' ', ''). \
                                           replace('\r', ' ').replace('\n', ' ').strip()[5:].split()
                except:
                    review_like_dish = []
                try:
                    review_publish_time = review.select(
                        '.time')[0].text.strip()
                except:
                    review_publish_time = '-'
                try:
                    review_id = review.select('.actions')[0].select(
                        'a')[0].attrs['data-id']
                except:
                    review_id = '-'

                try:
                    review_pic_list = []
                    review_pic_list_temp = review.select(
                        '.review-pictures')[0].select('a')
                    for each in review_pic_list_temp:
                        url = each['href']
                        review_pic_list.append('http://www.dianping.com' +
                                               str(url))
                except:
                    review_pic_list = []

                try:
                    review_merchant_reply = review.select(
                        '.shop-reply-content')[0].text.strip()
                except:
                    review_merchant_reply = ''

                each_review = {
                    '店铺id': shop_id,
                    '评论id': review_id,
                    '用户id': user_id,
                    '用户名': review_username,
                    '用户打分': review_score_detail,
                    '评论内容': review_text,
                    '人均价格': review_avg_price,
                    '喜欢的菜': review_like_dish,
                    '发布时间': review_publish_time,
                    '商家回复': review_merchant_reply,
                    '评论图片': review_pic_list,
                }
                all_review.append(each_review)
            cur_pages += 1
            all_pages -= 1
        return_data = {
            '店铺id': shop_id,
            '评论摘要': summaries,
            '评论总数': all_review_count,
            '好评个数': good_review_count,
            '中评个数': mid_review_count,
            '差评个数': bad_review_count,
            '带图评论个数': review_with_pic_count,
            '精选评论': all_review,
        }
        return return_data
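A minimal driver sketch, assuming get_review lives on a Review-like class (as in Example #8) with a pages_needed attribute bounding pagination; the shop id is the sample one from the asserts above.

reviewer = Review()  # hypothetical enclosing class
reviewer.pages_needed = 3
data = reviewer.get_review('H2noKWCDigM0H9c1')
print(data['评论总数'], 'reviews in total;', len(data['精选评论']), 'fetched')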