def __init__(self):
    """Prepare the crawl-strategy components and resolve the base search URL."""
    # Strategy components: search listing, shop detail, shop reviews.
    self.s = Search()
    self.d = Detail()
    self.r = Review()
    # An explicitly configured search URL wins; otherwise assemble one from
    # the configured city / channel / keyword triple, ending in '/p' so a
    # page number can be appended later.
    if spider_config.SEARCH_URL != '':
        self.base_url = spider_config.SEARCH_URL
    else:
        self.base_url = (
            'http://www.dianping.com/search/keyword/'
            + str(spider_config.LOCATION_ID) + '/'
            + str(spider_config.CHANNEL_ID) + '_'
            + str(spider_config.KEYWORD) + '/p'
        )
type=str, required=False, default='', help='custom shop id') args = parser.parse_args() if __name__ == '__main__': # args.review = 1 # args.normal = 0 # args.shop_id = 'l8QDQukrl2tXhzmY' if args.normal == 1: keyword = global_config.getRaw('detail', 'keyword') need_first = True if global_config.getRaw( 'detail', 'need_first') is 'True' else False need_pages = int(global_config.getRaw('detail', 'need_pages')) s = Search() s.search(keyword, need_first, need_pages) if args.detail == 1: from function.detail import Detail shop_id = args.shop_id logger.info('爬取店铺id:' + shop_id + '详情') d = Detail() d.get_detail(shop_id) if args.review == 1: from function.review import Review shop_id = args.shop_id logger.info('爬取店铺id:' + shop_id + '评论') r = Review() r.get_review(shop_id)
class Controller():
    """
    Program controller: selects the crawl strategy and aggregates/stores results
    (original: 整个程序的控制器, 用来进行爬取策略选择以及数据汇总存储).
    """

    def __init__(self):
        # Strategy components: search listing, shop detail, shop reviews.
        self.s = Search()
        self.d = Detail()
        self.r = Review()
        # Base URL: an explicitly configured SEARCH_URL wins; otherwise it is
        # assembled from city / channel / keyword, ending in '/p' so a page
        # number can be appended (see get_search_url).
        if spider_config.SEARCH_URL == '':
            self.base_url = (
                'http://www.dianping.com/search/keyword/'
                + str(spider_config.LOCATION_ID) + '/'
                + str(spider_config.CHANNEL_ID) + '_'
                + str(spider_config.KEYWORD) + '/p'
            )
        else:
            self.base_url = spider_config.SEARCH_URL

    def main(self):
        """
        Scheduling entry point: crawl each search page, then immediately crawl
        detail/review for each hit (interleaving dilutes the request frequency
        against any single endpoint type).
        @return:
        """
        for page in tqdm(range(1, spider_config.NEED_SEARCH_PAGES + 1), desc='搜索页数'):
            search_url, request_type = self.get_search_url(page)
            # Each result dict carries: 店铺id / 店铺名 / 评论个数 / 人均价格 /
            # 标签1 / 标签2 / 店铺地址 / 详情链接 / 图片链接 / 详细评分 /
            # 推荐菜 / 店铺均分.
            search_res = self.s.search(search_url, request_type)
            for each_search_res in tqdm(search_res, desc='详细爬取'):
                each_detail_res = {}
                each_review_res = {}
                if spider_config.NEED_DETAIL:
                    each_detail_res = self._fetch_detail(
                        each_search_res['店铺id'], spider_config.NEED_PHONE_DETAIL)
                if spider_config.NEED_REVIEW:
                    each_review_res = self._fetch_review(
                        each_search_res['店铺id'], spider_config.NEED_REVIEW_DETAIL)
                self.saver(each_search_res, each_detail_res, each_review_res)

    def _fetch_detail(self, shop_id, detail):
        """
        Fetch one shop's detail dict; pads missing keys so both crawl paths
        produce the same schema (multi-version format adaptation).
        """
        each_detail_res = {}
        if detail:
            # Phone-version detail page: has 其他信息 but no score fields.
            each_detail_res = self.d.get_detail(shop_id)
            each_detail_res.update({'店铺总分': '-', '店铺评分': '-'})
        else:
            # Hidden-info path: has score fields but no 其他信息.
            each_detail_res.update(get_basic_hidden_info(shop_id))
            each_detail_res.update(get_review_and_star(shop_id))
            each_detail_res.update({'其他信息': '-'})
        return each_detail_res

    def _fetch_review(self, shop_id, detail):
        """
        Fetch one shop's review dict; pads missing keys so both crawl paths
        produce the same schema (multi-version format adaptation).
        """
        if detail:
            each_review_res = self.r.get_review(shop_id)
            each_review_res.update({'推荐菜': '-'})
        else:
            each_review_res = get_basic_review(shop_id)
        return each_review_res

    def get_review(self, shop_id, detail=False):
        """Crawl one shop's reviews and save them."""
        # CONSISTENCY FIX: shares _fetch_review with main(), so the detail path
        # now also carries the '推荐菜' placeholder key (previously missing here).
        saver.save_data(self._fetch_review(shop_id, detail), 'review')

    def get_detail(self, shop_id, detail=False):
        """Crawl one shop's detail info and save it."""
        saver.save_data(self._fetch_detail(shop_id, detail), 'detail')

    def get_search_url(self, cur_page):
        """
        Build the search URL for a page.
        @param cur_page: 1-based page number
        @return: (url, request_type); page 1 drops the trailing '/p' suffix.
        """
        if cur_page == 1:
            # return self.base_url[:-2], 'no proxy, no cookie'
            return self.base_url[:-2], 'proxy, cookie'
        return self.base_url + str(cur_page), 'proxy, cookie'

    def saver(self, each_search_res, each_detail_res, each_review_res):
        """Persist one search row plus its optional detail/review rows."""
        # NOTE: `saver` below resolves to the module-level saver object; this
        # method only shadows that name as a class attribute, not in this scope.
        saver.save_data(each_search_res, 'search')
        if spider_config.NEED_DETAIL:
            saver.save_data(each_detail_res, 'detail')
        if spider_config.NEED_REVIEW:
            saver.save_data(each_review_res, 'review')
# Request credentials pulled once from the global config at import time.
cookie = global_config.getRaw('config', 'cookie')
ua = global_config.getRaw('config', 'user-agent')


def get_header():
    """
    Build the HTTP request headers (configured UA + cookie).
    :return: dict suitable for a requests `headers=` argument
    """
    return {'User-Agent': ua, 'Cookie': cookie}


if __name__ == '__main__':
    # debug search
    Search().search('一方', only_need_first=False, needed_pages=10)

    # debug review font parse
    # header = get_header()
    # url = 'http://www.dianping.com/shop/i24HGIrTSjD3Tcyy/review_all'
    # r = requests.get(url, headers=header)
    # get_review_map_file(r.text)

    # debug requests utils
    # from utils.requests_utils import requests_util
    # print(requests_util.parse_stop_time('5,10;20,100'))
    # for i in range(20):
    #     print(i)
    #     requests_util.get_requests('http://www.baidu.com')

    # debug detail