Example #2
import argparse

# The opening lines of this example were truncated; the parser setup below is
# reconstructed from the args.* attributes used in __main__. Imports for
# global_config, logger and Search were presumably part of the cut-off header.
parser = argparse.ArgumentParser()
parser.add_argument('--normal', type=int, required=False, default=0, help='crawl search results')
parser.add_argument('--detail', type=int, required=False, default=0, help='crawl shop detail')
parser.add_argument('--review', type=int, required=False, default=0, help='crawl shop reviews')
parser.add_argument('--shop_id',
                    type=str,
                    required=False,
                    default='',
                    help='custom shop id')
args = parser.parse_args()
if __name__ == '__main__':
    # args.review = 1
    # args.normal = 0
    # args.shop_id = 'l8QDQukrl2tXhzmY'
    if args.normal == 1:
        keyword = global_config.getRaw('detail', 'keyword')
        need_first = global_config.getRaw('detail', 'need_first') == 'True'
        need_pages = int(global_config.getRaw('detail', 'need_pages'))

        s = Search()
        s.search(keyword, need_first, need_pages)
    if args.detail == 1:
        from function.detail import Detail

        shop_id = args.shop_id
        logger.info('Crawling detail for shop id: ' + shop_id)
        d = Detail()
        d.get_detail(shop_id)
    if args.review == 1:
        from function.review import Review

        shop_id = args.shop_id
        logger.info('Crawling reviews for shop id: ' + shop_id)
        r = Review()
        r.get_review(shop_id)
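
A usage sketch for the CLI above (flag names follow the reconstructed parser; the script filename is assumed, and the shop id is the sample one from the commented debug lines):

# python main.py --normal 1                             # crawl search results from config
# python main.py --detail 1 --shop_id l8QDQukrl2tXhzmY  # crawl one shop's detail page
# python main.py --review 1 --shop_id l8QDQukrl2tXhzmY  # crawl one shop's reviews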

Example #3


class Controller:
    """
    Controller for the whole program.
    Chooses the crawl strategy and aggregates/stores the results.
    """
    def __init__(self):
        self.s = Search()
        self.d = Detail()
        self.r = Review()

        # Build the base search URL from config
        if spider_config.SEARCH_URL == '':
            keyword = spider_config.KEYWORD
            channel_id = spider_config.CHANNEL_ID
            city_id = spider_config.LOCATION_ID
            self.base_url = ('http://www.dianping.com/search/keyword/'
                             + str(city_id) + '/' + str(channel_id)
                             + '_' + str(keyword) + '/p')
        else:
            self.base_url = spider_config.SEARCH_URL

    def main(self):
        """
        Dispatch the whole crawl.
        @return:
        """
        # TODO: undecided between finishing the whole search first and then
        #       crawling details, or interleaving search and detail crawling.
        #       To dilute the frequency of same-type requests, interleaving
        #       is used for now.
        # Run the search
        for page in tqdm(range(1, spider_config.NEED_SEARCH_PAGES + 1),
                         desc='search pages'):
            # Build the URL for this page
            search_url, request_type = self.get_search_url(page)
            """
            {
                '店铺id': -,
                '店铺名': -,
                '评论个数': -,
                '人均价格': -,
                '标签1': -,
                '标签2': -,
                '店铺地址': -,
                '详情链接': -,
                '图片链接': -,
                '详细评分': -,
                '推荐菜': -,
                '店铺均分': -,
            }
            """
            search_res = self.s.search(search_url, request_type)
            for each_search_res in tqdm(search_res, desc='detail crawl'):
                each_detail_res = {}
                each_review_res = {}
                # Crawl shop detail
                if spider_config.NEED_DETAIL:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_PHONE_DETAIL:
                        """
                        {
                            '店铺id': -,
                            '店铺名': -,
                            '评论总数': -,
                            '人均价格': -,
                            '店铺地址': -,
                            '店铺电话': -,
                            '其他信息': -
                        }
                        """
                        each_detail_res = self.d.get_detail(shop_id)
                        # Normalize the record schema across crawl modes
                        each_detail_res.update({
                            '店铺总分': '-',
                            '店铺评分': '-',
                        })
                    else:
                        """
                        {
                            '店铺id': -,
                            '店铺名': -,
                            '店铺地址': -,
                            '店铺电话': -,
                            '店铺总分': -,
                            '店铺评分': -,
                            '人均价格': -,
                            '评论总数': -,
                        }
                        """
                        hidden_info = get_basic_hidden_info(shop_id)
                        review_and_star = get_review_and_star(shop_id)
                        each_detail_res.update(hidden_info)
                        each_detail_res.update(review_and_star)
                        # Normalize the record schema across crawl modes
                        each_detail_res.update({'其他信息': '-'})
                # Crawl reviews
                if spider_config.NEED_REVIEW:
                    shop_id = each_search_res['店铺id']
                    if spider_config.NEED_REVIEW_DETAIL:
                        """
                        {
                            '店铺id': -,
                            '评论摘要': -,
                            '评论总数': -,
                            '好评个数': -,
                            '中评个数': -,
                            '差评个数': -,
                            '带图评论个数': -,
                            '精选评论': -,
                        }
                        """
                        each_review_res = self.r.get_review(shop_id)
                        each_review_res.update({'推荐菜': '-'})
                    else:
                        """
                        {
                            '店铺id': -,
                            '评论摘要': -,
                            '评论总数': -,
                            '好评个数': -,
                            '中评个数': -,
                            '差评个数': -,
                            '带图评论个数': -,
                            '精选评论': -,
                            '推荐菜': -,
                        }
                        """
                        each_review_res = get_basic_review(shop_id)
                self.saver(each_search_res, each_detail_res, each_review_res)

    def get_review(self, shop_id, detail=False):
        if detail:
            each_review_res = self.r.get_review(shop_id)
        else:
            each_review_res = get_basic_review(shop_id)
        saver.save_data(each_review_res, 'review')

    def get_detail(self, shop_id, detail=False):
        each_detail_res = {}
        if detail:
            each_detail_res = self.d.get_detail(shop_id)
            # Normalize the record schema across crawl modes
            each_detail_res.update({
                '店铺总分': '-',
                '店铺评分': '-',
            })
        else:
            hidden_info = get_basic_hidden_info(shop_id)
            review_and_star = get_review_and_star(shop_id)
            each_detail_res.update(hidden_info)
            each_detail_res.update(review_and_star)
            # Normalize the record schema across crawl modes
            each_detail_res.update({'其他信息': '-'})
        saver.save_data(each_detail_res, 'detail')

    def get_search_url(self, cur_page):
        """
        Build the search URL for a given page.
        @param cur_page:
        @return:
        """
        if cur_page == 1:
            # page 1 carries no page suffix, so strip the trailing '/p'
            # return self.base_url[:-2], 'no proxy, no cookie'
            return self.base_url[:-2], 'proxy, cookie'
        else:
            return self.base_url + str(cur_page), 'proxy, cookie'
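        # Sketch of how the URL composes (sample config values assumed): with
        # base_url = 'http://www.dianping.com/search/keyword/2/10_coffee/p',
        #   get_search_url(1) returns ('http://www.dianping.com/search/keyword/2/10_coffee', 'proxy, cookie')
        #   get_search_url(3) returns ('http://www.dianping.com/search/keyword/2/10_coffee/p3', 'proxy, cookie')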

    def saver(self, each_search_res, each_detail_res, each_review_res):
        """
        Persist one record set: search results always, detail and review
        results only when enabled in spider_config.
        """
        # save search
        saver.save_data(each_search_res, 'search')
        # save detail
        if spider_config.NEED_DETAIL:
            saver.save_data(each_detail_res, 'detail')

        # save review
        if spider_config.NEED_REVIEW:
            saver.save_data(each_review_res, 'review')
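
A minimal sketch of driving the Controller above, assuming spider_config is already populated:

if __name__ == '__main__':
    controller = Controller()
    controller.main()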
Example #4
cookie = global_config.getRaw('config', 'cookie')
ua = global_config.getRaw('config', 'user-agent')


def get_header():
    """
    获取请求头
    :return:
    """
    header = {'User-Agent': ua, 'Cookie': cookie}
    return header
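
# Usage sketch: wiring get_header() into a request (URL reused from the debug
# block below; the timeout is illustrative, not part of the original).
# import requests
# r = requests.get('http://www.dianping.com/shop/i24HGIrTSjD3Tcyy/review_all',
#                  headers=get_header(), timeout=10)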


if __name__ == '__main__':
    # debug search
    Search().search('一方', only_need_first=False, needed_pages=10)

    # debug review font parse
    # header = get_header()
    # url = 'http://www.dianping.com/shop/i24HGIrTSjD3Tcyy/review_all'
    # r = requests.get(url, headers=header)
    # get_review_map_file(r.text)

    # debug requests utils
    # from utils.requests_utils import requests_util
    # print(requests_util.parse_stop_time('5,10;20,100'))
    # for i in range(20):
    #     print(i)
    #     requests_util.get_requests('http://www.baidu.com')

    # debug detail