Ejemplo n.º 1
0
                    required=False,
                    default='',
                    help='custom shop id')
args = parser.parse_args()
if __name__ == '__main__':
    # Script entry point: each mode below runs when its CLI flag equals 1.
    # Debug overrides -- uncomment to force a mode without passing CLI flags.
    # args.review = 1
    # args.normal = 0
    # args.shop_id = 'l8QDQukrl2tXhzmY'
    if args.normal == 1:
        # Search mode: parameters come from the 'detail' config section.
        keyword = global_config.getRaw('detail', 'keyword')
        # Compare by value. The previous `is 'True'` tested object identity,
        # which is not guaranteed to hold for a string read at runtime (and
        # raises a SyntaxWarning on Python 3.8+), so the flag could never be
        # reliably enabled.
        need_first = global_config.getRaw('detail', 'need_first') == 'True'
        need_pages = int(global_config.getRaw('detail', 'need_pages'))

        s = Search()
        s.search(keyword, need_first, need_pages)
    if args.detail == 1:
        # Detail mode: crawl the detail page of the shop given by --shop_id.
        from function.detail import Detail

        shop_id = args.shop_id
        logger.info('爬取店铺id:' + shop_id + '详情')
        d = Detail()
        d.get_detail(shop_id)
    if args.review == 1:
        # Review mode: crawl the reviews of the shop given by --shop_id.
        from function.review import Review

        shop_id = args.shop_id
        logger.info('爬取店铺id:' + shop_id + '评论')
        r = Review()
        r.get_review(shop_id)
Ejemplo n.º 2
0
class Controller:
    """
    Top-level controller of the crawler.

    Selects what to crawl (search pages, shop details, shop reviews),
    collects the per-shop results and passes them to the module-level
    ``saver`` for storage.
    """

    def __init__(self):
        """Create the crawler components and resolve the base search URL."""
        self.s = Search()
        self.d = Detail()
        self.r = Review()

        # Initialise the base URL. Without an explicit SEARCH_URL it is built
        # from the configured city, channel and keyword and ends with '/p' so
        # that a page number can be appended.
        # NOTE(review): a configured SEARCH_URL is presumably expected to end
        # with '/p' as well, because get_search_url() strips the last two
        # characters for page 1 -- confirm against the config documentation.
        if spider_config.SEARCH_URL == '':
            keyword = spider_config.KEYWORD
            channel_id = spider_config.CHANNEL_ID
            city_id = spider_config.LOCATION_ID
            self.base_url = 'http://www.dianping.com/search/keyword/' + str(
                city_id) + '/' + str(channel_id) + '_' + str(keyword) + '/p'
        else:
            self.base_url = spider_config.SEARCH_URL

    def main(self):
        """
        Run the full crawl: search pages, then detail/review per shop.

        For every search page (1..NEED_SEARCH_PAGES) the search results are
        fetched, and each result is immediately enriched with detail and/or
        review data (depending on NEED_DETAIL / NEED_REVIEW) and saved.

        @return: None
        """
        # TODO: undecided whether to crawl all search pages first and then all
        #       details, or to interleave them. To dilute the frequency of
        #       same-type requests, search and detail are interleaved for now.
        for page in tqdm(range(1, spider_config.NEED_SEARCH_PAGES + 1),
                         desc='搜索页数'):
            # Assemble the URL for this result page.
            search_url, request_type = self.get_search_url(page)
            # Per the original notes, each search result is a dict with keys:
            # '店铺id', '店铺名', '评论个数', '人均价格', '标签1', '标签2',
            # '店铺地址', '详情链接', '图片链接', '详细评分', '推荐菜', '店铺均分'.
            search_res = self.s.search(search_url, request_type)
            for each_search_res in tqdm(search_res, desc='详细爬取'):
                each_detail_res = {}
                each_review_res = {}
                # Crawl shop details.
                if spider_config.NEED_DETAIL:
                    shop_id = each_search_res['店铺id']
                    each_detail_res = self._fetch_detail(
                        shop_id, spider_config.NEED_PHONE_DETAIL)
                # Crawl shop reviews.
                if spider_config.NEED_REVIEW:
                    shop_id = each_search_res['店铺id']
                    each_review_res = self._fetch_review(
                        shop_id, spider_config.NEED_REVIEW_DETAIL)
                self.saver(each_search_res, each_detail_res, each_review_res)

    def _fetch_detail(self, shop_id, detail):
        """
        Collect the detail record of one shop in the unified format.

        @param shop_id: id of the shop to crawl
        @param detail: truthy to crawl the full detail page via ``self.d``;
                       falsy to use the lighter basic-info helpers
        @return: dict with the detail fields; keys that the chosen source does
                 not provide are padded with '-' so both variants share one
                 schema
        """
        if detail:
            # Per the original notes this yields: '店铺id', '店铺名',
            # '评论总数', '人均价格', '店铺地址', '店铺电话', '其他信息'.
            each_detail_res = self.d.get_detail(shop_id)
            # Format adaptation between the two crawl variants.
            each_detail_res.update({
                '店铺总分': '-',
                '店铺评分': '-',
            })
            return each_detail_res

        # Per the original notes the combined result holds: '店铺id',
        # '店铺名', '店铺地址', '店铺电话', '店铺总分', '店铺评分',
        # '人均价格', '评论总数'.
        each_detail_res = {}
        hidden_info = get_basic_hidden_info(shop_id)
        review_and_star = get_review_and_star(shop_id)
        each_detail_res.update(hidden_info)
        each_detail_res.update(review_and_star)
        # Format adaptation between the two crawl variants.
        each_detail_res.update({'其他信息': '-'})
        return each_detail_res

    def _fetch_review(self, shop_id, detail):
        """
        Collect the review record of one shop in the unified format.

        @param shop_id: id of the shop to crawl
        @param detail: truthy to crawl the full review page via ``self.r``;
                       falsy to use ``get_basic_review``
        @return: dict with the review fields. Per the original notes both
                 variants carry '店铺id', '评论摘要', '评论总数', '好评个数',
                 '中评个数', '差评个数', '带图评论个数', '精选评论'; the
                 detailed variant is padded with '推荐菜': '-', which the
                 basic variant already provides.
        """
        if detail:
            each_review_res = self.r.get_review(shop_id)
            # Format adaptation between the two crawl variants.
            each_review_res.update({'推荐菜': '-'})
            return each_review_res
        return get_basic_review(shop_id)

    def get_review(self, shop_id, detail=False):
        """
        Crawl and save the reviews of a single shop.

        Uses the same record format as ``main``; in particular the detailed
        variant is padded with '推荐菜': '-' so that standalone runs and full
        runs store identically shaped 'review' records.

        @param shop_id: id of the shop to crawl
        @param detail: whether to crawl the detailed review page
        @return: None
        """
        each_review_res = self._fetch_review(shop_id, detail)
        saver.save_data(each_review_res, 'review')

    def get_detail(self, shop_id, detail=False):
        """
        Crawl and save the detail record of a single shop.

        @param shop_id: id of the shop to crawl
        @param detail: whether to crawl the detailed shop page
        @return: None
        """
        each_detail_res = self._fetch_detail(shop_id, detail)
        saver.save_data(each_detail_res, 'detail')

    def get_search_url(self, cur_page):
        """
        Build the search URL for a result page.

        @param cur_page: 1-based page number
        @return: tuple ``(url, request_type)``; for page 1 the last two
                 characters of the base URL are dropped, otherwise the page
                 number is appended to the base URL
        """
        if cur_page == 1:
            # Earlier variant, kept for reference:
            # return self.base_url[:-2], 'no proxy, no cookie'
            return self.base_url[:-2], 'proxy, cookie'
        return self.base_url + str(cur_page), 'proxy, cookie'

    def saver(self, each_search_res, each_detail_res, each_review_res):
        """
        Persist the results collected for one shop.

        The search record is always saved; detail and review records are
        saved only when the corresponding NEED_* switch is enabled.

        @param each_search_res: search record of the shop
        @param each_detail_res: detail record (may be an empty dict)
        @param each_review_res: review record (may be an empty dict)
        @return: None
        """
        # ``saver`` below is the module-level storage object, not this method.
        saver.save_data(each_search_res, 'search')
        if spider_config.NEED_DETAIL:
            saver.save_data(each_detail_res, 'detail')
        if spider_config.NEED_REVIEW:
            saver.save_data(each_review_res, 'review')