Example #2
def get_font_msg():
    """
    获取加密字体映射文件,如果常规流程,这一步应该是由search中完成并存入缓存。
    如果冷启动,一次search更新缓存
    @return:
    """
    if cache.search_font_map != {}:
        return cache.search_font_map
    else:
        Detail().get_detail('l3BEUN08X4TT52bm', just_need_map=True)
        return cache.search_font_map
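
A minimal usage sketch for the helper above. Hedged: cache is this project's cache module, and the glyph-translation line is a hypothetical illustration of what the map is used for, not code from the repo.

font_map = get_font_msg()  # warm path: returned straight from cache.search_font_map
# Cold start: the call above first runs one Detail fetch to populate the cache.
# The map translates obfuscated glyphs back to readable characters, e.g.:
# decoded = ''.join(font_map.get(ch, ch) for ch in obfuscated_text)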
Example #3
def get_font_msg():
    """
    获取加密字体映射文件,如果常规流程,这一步应该是由search中完成并存入缓存。
    如果冷启动,一次search更新缓存
    @return:
    """
    if cache.search_font_map != {}:
        return cache.search_font_map
    else:
        Detail().get_detail_font_mapping('H2noKWCDigM0H9c1')
        return cache.search_font_map
Example #4
import argparse

# The parser construction above this fragment was truncated on the source
# page; the flags below are reconstructed from the args.* attributes used
# in __main__ and are assumptions, not the original definitions.
parser = argparse.ArgumentParser()
parser.add_argument('--normal', type=int, default=0, help='run keyword search')
parser.add_argument('--detail', type=int, default=0, help='crawl shop detail')
parser.add_argument('--review', type=int, default=0, help='crawl shop reviews')
parser.add_argument('--shop_id',
                    required=False,
                    default='',
                    help='custom shop id')
args = parser.parse_args()
if __name__ == '__main__':
    # args.review = 1
    # args.normal = 0
    # args.shop_id = 'l8QDQukrl2tXhzmY'
    if args.normal == 1:
        keyword = global_config.getRaw('detail', 'keyword')
        need_first = global_config.getRaw('detail', 'need_first') == 'True'
        need_pages = int(global_config.getRaw('detail', 'need_pages'))

        s = Search()
        s.search(keyword, need_first, need_pages)
    if args.detail == 1:
        from function.detail import Detail

        shop_id = args.shop_id
        logger.info('Crawling detail for shop id: ' + shop_id)
        d = Detail()
        d.get_detail(shop_id)
    if args.review == 1:
        from function.review import Review

        shop_id = args.shop_id
        logger.info('Crawling reviews for shop id: ' + shop_id)
        r = Review()
        r.get_review(shop_id)
class Controller:
    """
    Controller for the whole program: selects the crawling strategy and
    aggregates and stores the results.
    """
    def __init__(self):
        self.s = Search()
        self.d = Detail()
        self.r = Review()

        # Initialize the base URL
        if spider_config.SEARCH_URL == '':
            keyword = spider_config.KEYWORD
            channel_id = spider_config.CHANNEL_ID
            city_id = spider_config.LOCATION_ID
            self.base_url = 'http://www.dianping.com/search/keyword/' + str(
                city_id) + '/' + str(channel_id) + '_' + str(keyword) + '/p'
        else:
            self.base_url = spider_config.SEARCH_URL

    def main(self):
        """
        调度
        @return:
        """
        # Todo  其实这里挺犹豫是爬取完搜索直接详情还是爬一段详情一段
        #       本着稀释同类型访问频率的原则,暂时采用爬一段详情一段
        # 调用搜索
        for page in tqdm(range(1, spider_config.NEED_SEARCH_PAGES + 1),
                         desc='search pages'):
            # Build the search URL
            search_url, request_type = self.get_search_url(page)
            # each_search_res = self.s.search(search_url, request_type)
            """
            {
                '店铺id': -,
                '店铺名': -,
                '评论个数': -,
                '人均价格': -,
                '标签1': -,
                '标签2': -,
                '店铺地址': -,
                '详情链接': -,
                '图片链接': -,
                '详细评分': -,
                '推荐菜': -,
                '店铺均分': -,
            }
            """
            search_res = self.s.search(search_url, request_type)
            for each_search_res in tqdm(search_res, desc='detail crawl'):
                each_detail_res = {}
                each_review_res = {}
                # Crawl shop detail
                if spider_config.NEED_DETAIL:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_PHONE_DETAIL:
                        """
                        {
                            '店铺id': -,
                            '店铺名': -,
                            '评论总数': -,
                            '人均价格': -,
                            '店铺地址': -,
                            '店铺电话': -,
                            '其他信息': -
                        }
                        """
                        each_detail_res = self.d.get_detail(shop_id)
                        # Format adaptation across crawler versions
                        each_detail_res.update({
                            '店铺总分': '-',
                            '店铺评分': '-',
                        })
                    else:
                        """
                        {
                            '店铺id': -,
                            '店铺名': -,
                            '店铺地址': -,
                            '店铺电话': -,
                            '店铺总分': -,
                            '店铺评分': -,
                            '人均价格': -,
                            '评论总数': -,
                        }
                        """
                        hidden_info = get_basic_hidden_info(shop_id)
                        review_and_star = get_review_and_star(shop_id)
                        each_detail_res.update(hidden_info)
                        each_detail_res.update(review_and_star)
                        # Format adaptation across crawler versions
                        each_detail_res.update({'其他信息': '-'})
                # Crawl reviews
                if spider_config.NEED_REVIEW:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_REVIEW_DETAIL:
                        """
                        {
                            '店铺id': -,
                            '评论摘要': -,
                            '评论总数': -,
                            '好评个数': -,
                            '中评个数': -,
                            '差评个数': -,
                            '带图评论个数': -,
                            '精选评论': -,
                        }
                        """
                        each_review_res = self.r.get_review(shop_id)
                        each_review_res.update({'推荐菜': '-'})
                    else:
                        """
                        {
                            '店铺id': -,
                            '评论摘要': -,
                            '评论总数': -,
                            '好评个数': -,
                            '中评个数': -,
                            '差评个数': -,
                            '带图评论个数': -,
                            '精选评论': -,
                            '推荐菜': -,
                        }
                        """
                        each_review_res = get_basic_review(shop_id)
                self.saver(each_search_res, each_detail_res, each_review_res)

    def get_review(self, shop_id, detail=False):
        if detail:
            each_review_res = self.r.get_review(shop_id)
        else:
            each_review_res = get_basic_review(shop_id)
        saver.save_data(each_review_res, 'review')

    def get_detail(self, shop_id, detail=False):
        each_detail_res = {}
        if detail:
            each_detail_res = self.d.get_detail(shop_id)
            # Format adaptation across crawler versions
            each_detail_res.update({
                '店铺总分': '-',
                '店铺评分': '-',
            })
        else:
            hidden_info = get_basic_hidden_info(shop_id)
            review_and_star = get_review_and_star(shop_id)
            each_detail_res.update(hidden_info)
            each_detail_res.update(review_and_star)
            # Format adaptation across crawler versions
            each_detail_res.update({'其他信息': '-'})
        saver.save_data(each_detail_res, 'detail')

    def get_search_url(self, cur_page):
        """
        获取搜索链接
        @param cur_page:
        @return:
        """
        if cur_page == 1:
            # return self.base_url[:-2], 'no proxy, no cookie'
            return self.base_url[:-2], 'proxy, cookie'
        else:
            return self.base_url + str(cur_page), 'proxy, cookie'

    def saver(self, each_search_res, each_detail_res, each_review_res):
        # save search
        saver.save_data(each_search_res, 'search')
        # save detail
        if spider_config.NEED_DETAIL:
            saver.save_data(each_detail_res, 'detail')

        # save review
        if spider_config.NEED_REVIEW:
            saver.save_data(each_review_res, 'review')
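
A minimal driver sketch for the class above. Hedged: it assumes spider_config and the saver module are configured the way Controller already expects; nothing here is from the original repo.

if __name__ == '__main__':
    controller = Controller()
    # Crawls NEED_SEARCH_PAGES of search results; per shop, optionally
    # crawls detail and review data, then persists everything via saver.
    controller.main()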
Example #6
    def search(self, key_word, only_need_first=True, needed_pages=50):
        """
        搜索
        :param key_word: 关键字
        :param only_need_first: 只需要第一条
        :param needed_pages: 需要多少页
        :return:
        """
        # Todo: when neither detail nor review pages are needed, the first
        #       search page does not require a cookie
        assert isinstance(key_word, str)
        assert key_word.strip() != ''
        if self.custom_search_url != '':
            key_word = self.custom_search_url
        logger.info('Starting search: ' + key_word)
        # header = self.get_header()
        for i in tqdm(range(1, needed_pages + 1), desc='pages'):
            # Only the first result is needed: skip every page after the first
            if only_need_first is True and i != 1:
                break

            url = 'http://www.dianping.com/search/keyword/' + str(
                self.location_id) + '/' + str(
                    self.channel_id) + '_' + str(key_word) + '/p' + str(i)
            if self.custom_search_url != '':
                url = self.custom_search_url + str(i)
            r = requests_util.get_requests(url)
            # r = requests.get(url, headers=header)
            text = r.text
            # Fetch the font obfuscation map
            file_map = get_search_map_file(text)
            # Substitute obfuscated glyphs in the HTML using the map
            text = requests_util.replace_search_html(text, file_map)

            # Parse the page
            html = BeautifulSoup(text, 'lxml')
            shop_all_list = html.select('.shop-list')[0].select('li')

            search_res = []
            for shop in shop_all_list:
                try:
                    image_path = shop.select('.pic')[0].select('a')[0].select(
                        'img')[0]['src']
                except:
                    image_path = '-'
                try:
                    shop_id = shop.select('.txt')[0].select('.tit')[0].select(
                        'a')[0]['data-shopid']
                except:
                    shop_id = '-'
                try:
                    detail_url = shop.select('.txt')[0].select(
                        '.tit')[0].select('a')[0]['href']
                except:
                    detail_url = '-'
                try:
                    name = shop.select('.txt')[0].select('.tit')[0].select(
                        'a')[0].text.strip()
                except:
                    name = '-'
                # Two star formats exist: some pages show a detailed star
                # score, others only an icon.
                # Parse the icon first
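                # Hypothetical illustration: a span class list such as
                # ['star', 'star_45'] gives 'star_45'.split('_')[1] == '45',
                # which the division by 10 below turns into 4.5.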
                try:
                    star_point = \
                        shop.select('.txt')[0].select('.comment')[0].select('.star_icon')[0].select('span')[0]['class'][
                            1].split('_')[1]
                    star_point = float(star_point) / 10
                    star_point = str(star_point)
                except:
                    star_point = '-'
                # Parse the detailed star score (overrides the icon value when present)
                try:
                    star_point = \
                        shop.select('.txt')[0].select('.comment')[0].select('.star_score')[0].text
                    star_point = float(star_point)
                    star_point = str(star_point)
                except:
                    pass
                try:
                    review_number = shop.select('.txt')[0].select(
                        '.comment')[0].select('.review-num')[0].text.replace(
                            '\n', '')
                except:
                    review_number = '-'
                try:
                    mean_price = shop.select('.txt')[0].select('.comment')[
                        0].select('.mean-price')[0].select('b')[0].text
                except:
                    mean_price = '¥0'
                try:
                    tags = shop.select('.txt')[0].select(
                        '.tag-addr')[0].select('.tag')
                    tag1 = tags[0].text.replace('\n', ' ').strip()
                    tag2 = tags[1].text.replace('\n', ' ').strip()
                except:
                    tag1 = '-'
                    tag2 = '-'
                try:
                    addr = shop.select('.txt')[0].select(
                        '.tag-addr')[0].select('.addr')[0].text.replace(
                            '\n', ' ').strip()
                except:
                    addr = '-'
                try:
                    recommend = shop.select('.recommend')[0].text.replace(
                        '\n', ' ').strip()
                except:
                    recommend = '-'
                try:
                    commend_list = shop.select(
                        '.comment-list')[0].text.replace('\n', ' ').strip()
                except:
                    commend_list = '-'
                one_step_search_res = [
                    shop_id, name, star_point, review_number, mean_price, tag1,
                    tag2, addr, recommend, commend_list, image_path,
                    detail_url, 1, 1
                ]  # the last two entries are search success flags
                # This aggregate list is currently unused
                search_res.append(one_step_search_res)
                # Only the first result is needed; stop here
                if only_need_first is True:
                    break
                # Crawl the detail page
                if self.need_detail == '1':
                    try:
                        detail = Detail().get_detail(shop_id)
                        print('\n' + ','.join(detail) + '\n')
                        self.saver.save_data([detail], 'detail')
                    except:
                        # Set the failure flag
                        one_step_search_res[-2] = 0
                        logger.warning('Failed to fetch detail, shop id: ' + shop_id)
                        # map(str, ...): the list also holds int flags
                        print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                        if self.jump_wait is False:
                            print(
                                'Check the browser and solve the captcha, then '
                                "enter 'y' to continue or 'n' to skip this check:",
                                'http://www.dianping.com/shop/' + str(shop_id))
                            # read the operator's answer once
                            answer = input()
                            if answer == 'y':
                                continue
                            elif answer == 'n':
                                self.jump_wait = True
                else:
                    print('\n' + ','.join(map(str, one_step_search_res)) + '\n')
                # Crawl the review page
                if self.need_comment == '1':
                    try:
                        review = Review().get_review(shop_id)
                        print('Fetched', len(review), 'reviews for', name)
                        self.saver.save_data(review, 'review')
                    except:
                        # Set the failure flag
                        one_step_search_res[-1] = 0
                        logger.warning('Failed to fetch reviews, shop id: ' + shop_id)

                # Save the data
                self.saver.save_data([one_step_search_res], 'search')
        logger.info('Finished parsing: ' + key_word)
Example #7
    # Search().search('一方', only_need_first=False, needed_pages=2)

    # debug review font parse
    # header = get_header()
    # url = 'http://www.dianping.com/shop/i24HGIrTSjD3Tcyy/review_all'
    # r = requests.get(url, headers=header)
    # get_review_map_file(r.text)

    # debug requests utils
    # from utils.requests_utils import requests_util
    # # print(requests_util.parse_stop_time('5,10;20,100'))
    # requests_util.get_requests('http://www.baidu.com')
    # print(1)
    # requests_util.get_requests('http://www.baidu.com')
    # print(2)
    # requests_util.get_requests('http://www.baidu.com')
    # print(3)
    # requests_util.get_requests('http://www.baidu.com')
    # print(4)
    # requests_util.get_requests('http://www.baidu.com')
    # print(5)
    # requests_util.get_requests('http://www.baidu.com')
    # print(6)
    # requests_util.get_requests('http://www.baidu.com')

    # debug detail
    from function.detail import Detail

    Detail().get_detail('k55CTXmrQdpFgFaf')