class ParseDeliveymode(object): def __init__(self,resource_path, city, date,save_date_short,db): ''' :param resource_path: 城市数据文件路径 :param city: 城市拼音 :param date: 保存日期 :param save_date_short:保存日期简写 :param db: 数据库配置信息 ''' self.resource_path = resource_path self.city = city self.date = date self.save_date_short = save_date_short self.db = db self.sql = "insert into t_e_delivery_mode_city_pre_%s values" % save_date_short self.batch = BatchSql(self.sql) def parse_data(self, data): try: rest_id = str(data['param']) if len(rest_id) <= 11: data = data['data'].get('delivery_mode') delever_id = getMapValue(data,"id") delever_text = getMapValue(data, "text") param = [self.city, self.date,rest_id,delever_text,delever_id] self.batch.addBatch(param) if self.batch.getSize() > 100000: self.db.update(self.batch) self.batch.cleanBatch() else: print("错误数据", data) except Exception as e: print(traceback.print_exc()) print('解析数据报错\n错误数据:{}'.format(data)) def run(self): f_name = os.path.join(self.resource_path, 'rest_info.pickle') print("解析文件:", f_name) with open(f_name, "rb+") as f: while True: try: data = pickle.load(f) self.parse_data(data) except EOFError as e: print("文件读取完成", e) break except Exception as e2: print("报错", e2) break if self.batch.getSize() > 0: self.db.update(self.batch) self.batch.cleanBatch()
class ParseRatingTagObject(ParseDataObject): """ category.pickle数据解析 """ def __init__(self, resource_path, city, date, rest_area, err_rest_ids, db, is_test=False): ParseDataObject.__init__(self, resource_path, is_test) self.err_rest_ids = err_rest_ids self.city = city self.date = date self.rest_area = rest_area self.db = db sql = "insert into t_e_rest_rating_tag_import values" self.batch1 = BatchSql(sql) def parse(self, generator=None): generator = generator if generator else self.GeneratorPickleData() try: for data in generator: self.parse_data(data) self.insertAll() except: print("解析数据报错:", traceback.print_exc()) def insert(self, batch, param): batch.addBatch(param) if batch.getSize() > 100000: self.db.update(batch) batch.cleanBatch() def insertAll(self): if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() def parse_data(self, data): try: rest_id = data['param'] if rest_id in self.err_rest_ids: return if len(rest_id) <= 11: for item in data["data"]: param1 = [ self.city, self.date, self.rest_area, rest_id, getMapValue(item, "name"), getMapValue(item, "count") ] self.insert(self.batch1, param1) else: print("错误数据", data) except Exception as e: print("解析数据报错", e)
def deal_quan_category(city,date): ''' 处理店圈的菜品类信息,一级品类,二级品类信息 :param city: :param date: :return: ''' sql = """ select DISTINCT(t1.category_id_level1),t1.category_name_level1,0 pid from t_e_rest_category_city_1712 t1 where t1.city = '%s' and t1.date = '%s' union select DISTINCT(t1.category_id_level2),t1.category_name_level2,t1.category_id_level1 pid from t_e_rest_category_city_1712 t1 where t1.city = '%s' and t1.date = '%s' """%(city,date,city,date) result = db2.queryForList(sql, None) print(result) sql2 = "insert into quan_category values" batch = BatchSql(sql2) for item in result: quan_category = [] quan_category.extend([0,item[1],date,item[0],item[2],time,time]) print(quan_category) batch.addBatch(quan_category) if batch.getSize() > 10000: db1.update(batch) batch.cleanBatch() db1.update(batch)
def deal_quan_monitor_activity_data(city,date,all_shopid_list): ''' 处理店圈店铺的活动信息 :param city: 所属城市 :param date: 数据日期 :param all_shopid_list: 店圈店铺的ID列表 :return: ''' sql=""" select t1.rest_id,t1.description,t1.active_type from t_e_rest_active_city_1712 t1 where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s """%(city,date,all_shopid_list) sql2 = "insert into quan_monitor_activity_data values" batch = BatchSql(sql2) result = db2.queryForList(sql, None) print(result) for shoplist in all_shopid_list: all_activity = [] quan_monitor_activity_data = [] for item in result: activity = dict(type=item[2],content=item[1]) if item[0] == shoplist: all_activity.append(activity) all_activity = json.dumps(all_activity) quan_monitor_activity_data.extend([0,date,shoplist,str(all_activity),time,time]) print(quan_monitor_activity_data) batch.addBatch(quan_monitor_activity_data) if batch.getSize() > 10000: db1.update(batch) batch.cleanBatch() db1.update(batch)
def deal_quan_shop(city,date,all_shopid_list,cate_list): ''' 处理店圈店铺的基本数据 :param city: 店圈所属城市 :param date: 数据分析日期 :param all_shopid_list: 店圈数据的店铺列表的数据 :param cate_list: 店圈数据的品类列表 :return: ''' sql = """ select t1.rest_id,t1.rest_name,t1.delivery_id,t1.longitude,t1.latitude,t1.is_new from t_e_rest_list_city_1712 t1 where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s """%(city,date,all_shopid_list) sql2 = "insert into quan_shop values" batch = BatchSql(sql2) result = db2.queryForList(sql, None) print(result) for item in result: for cate in cate_list: if item[0] == cate[0]: quan_shop = [] quan_shop.extend([0,item[0],1,date,item[1],item[2],item[3],item[4],item[5],cate[1],time,time]) print(quan_shop) batch.addBatch(quan_shop) if batch.getSize() > 1000: db1.update(batch) batch.cleanBatch() db1.update(batch)
class ParseDeliveryMode(ParseDataObject): """ category.pickle数据解析 """ def __init__(self, resource_path, city, date, rest_area, db, is_test=False): ParseDataObject.__init__(self, resource_path, is_test) self.city = city self.date = date self.rest_area = rest_area self.db = db sql = "insert into t_e_delivery_mode_import values" self.batch1 = BatchSql(sql) def parse(self, generator=None): generator = generator if generator else self.GeneratorPickleData() try: for data in generator: self.parse_data(data) self.insertAll() except: print("解析数据报错:", traceback.print_exc()) def insert(self, batch, param): batch.addBatch(param) if batch.getSize() > 100000: self.db.update(batch) batch.cleanBatch() def insertAll(self): if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() def parse_data(self, data): # print(data) try: rest_id = str(data['param']) if len(rest_id) <= 11: data = data['data'].get('delivery_mode') delever_id = getMapValue(data, "id") delever_text = getMapValue(data, "text") param1 = [ self.city, self.date, self.rest_area, rest_id, delever_text, delever_id ] self.insert(self.batch1, param1) else: print("错误数据", data) except Exception as e: print(traceback.print_exc()) print('解析数据报错\n错误数据:{}'.format(data))
def deal_quan_monitor_hot_sku_data(city,date,all_shopid_list): ''' 处理店圈sku数据,9个店铺大约110万条数据 :param city: 所属城市 :param date: 数据日期 :param all_shopid_list: 店圈店铺ID列表 :return: ''' sql=""" select t1.rest_id,t1.food_name,t1.price,t1.food_id,t1.food_month_sales,t1.has_activity,t2.delivery_fee from t_e_rest_menu_level2_unique_city_nj_1712 t1, t_e_rest_list_city_1712 t2 where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s and t1.food_month_sales != 0 and t1.city = t2.city and t1.date = t2.date and t1.rest_id = t2.rest_id """%(city,date,all_shopid_list) sql2 = 'insert into quan_monitor_hot_sku_data values' batch = BatchSql(sql2) result = db2.queryForList(sql, None) print(result) for shop_list in all_shopid_list: for item in result: quan_monitor_hot_sku_data = [] if shop_list == item[0]: if item[5] == 1: #特价菜 quan_monitor_hot_sku_data.extend([0,date,shop_list,item[1],item[2],item[2],item[4],time,time]) print(quan_monitor_hot_sku_data) batch.addBatch(quan_monitor_hot_sku_data) elif item[5] == 0: #满减菜品 sub_price = get_sub_price(city,date,shop_list,item[2]) if sub_price == None: sub_price = 0 true_price = item[2]-sub_price quan_monitor_hot_sku_data.extend([0,date,shop_list,item[1],item[2],true_price,item[4],time,time]) print(quan_monitor_hot_sku_data) batch.addBatch(quan_monitor_hot_sku_data) if batch.getSize() > 10000: db1.update(batch) batch.cleanBatch() db1.update(batch)
def deal_quan_monitor_shop_data(city,date,all_shopid_list): ''' 处理店圈的配送费用和配送时间的信息 :param city: 店圈所属城市 :param date: 爬取数据的日期 :param all_shopid_list: 所有店圈的店铺列表信息 :return: ''' sql = """ select t1.rest_id,t1.order_month_sales,t2.deliver_time,t1.delivery_fee,t1.min_delivery_price,t2.overall_score from t_e_rest_list_city_1712 t1, t_e_rest_score_city_1712 t2 where t1.city = '%s' and t1.date = '%s' and t1.rest_id IN %s and t1.rest_id = t2.rest_id and t1.city = t2.city and t1.date = t2.date """%(city,date,all_shopid_list) sql2 = "insert into quan_monitor_shop_data values" batch = BatchSql(sql2) result = db2.queryForList(sql, None) for item in result: quan_monitor_shop_data = [] score = float('%.1f'%float(item[5])) quan_monitor_shop_data.extend([0,date,item[0],item[1],item[2],item[3],item[4],score,time,time]) print(quan_monitor_shop_data) batch.addBatch(quan_monitor_shop_data) if batch.getSize() > 10000: db1.update(batch) batch.cleanBatch() db1.update(batch)
class ParseHotword(object): def __init__(self, resource_path, city, date, save_date_short, err_rest_ids, db): ''' :param resource_path: 城市数据文件路径 :param city: 城市拼音 :param date: 保存日期 :param save_date_short:保存日期简写 :param err_rest_ids: 错误店铺ID :param db: 数据库配置信息 ''' self.resource_path = resource_path self.city = city self.date = date self.save_date_short = save_date_short self.err_rest_ids = err_rest_ids self.db = db self.sql = "insert into t_e_hot_search_word_city_%s values" % save_date_short self.batch = BatchSql(self.sql) def parse_data(self, data): try: rest_id = str(data["param"][0]) if rest_id in self.err_rest_ids: return if len(rest_id) <= 11: la = data["param"][1] lo = data["param"][2] for item in data["data"]: search_word = getMapValue(item, "search_word") if search_word != '-999': self.batch.addBatch([ self.city, self.date, rest_id, la, lo, search_word ]) if self.batch.getSize() > 100000: self.db.update(self.batch) self.batch.cleanBatch() else: print("错误数据", data) print('错误数据\n{}'.format(data)) except Exception as e: print("解析数据报错", e) print('解析数据报错\n错误数据:{}'.format(data)) def run(self): f_name = os.path.join(self.resource_path, 'hot_word.pickle') print("解析文件:", f_name) with open(f_name, "rb+") as f: while True: try: data = pickle.load(f) self.parse_data(data) except EOFError as e: print("文件读取完成", e) break except Exception as e2: print("报错", e2) break if self.batch.getSize() > 0: self.db.update(self.batch) self.batch.cleanBatch()
class ParseCategoryObject(ParseDataObject): """ category.pickle数据解析 """ def __init__(self, resource_path, city, date, rest_area, db, is_test=False): ParseDataObject.__init__(self, resource_path, is_test) self.err_rest_ids = set() self.rest_category = defaultdict(set) self.city = city self.date = date self.rest_area = rest_area self.db = db sql1 = "insert into t_e_rest_list_import values" sql2 = "insert into t_e_rest_active_import values" sql3 = "insert into t_e_rest_category_import values" sql4 = "insert into t_e_rest_open_time_import values" sql5 = "insert into t_e_rest_money_off_import values" sql6 = "insert into t_e_rest_money_off_avg_import values" self.batch1 = BatchSql(sql1) self.batch2 = BatchSql(sql2) self.batch3 = BatchSql(sql3) self.batch4 = BatchSql(sql4) self.batch5 = BatchSql(sql5) self.batch6 = BatchSql(sql6) def parse(self, generator=None): generator = generator if generator else self.GeneratorPickleData() try: for data in generator: self.parse_data(data) self.insertAll() except: print("解析数据报错:", traceback.print_exc()) def insert(self, batch, param): batch.addBatch(param) if batch.getSize() > 100000: self.db.update(batch) batch.cleanBatch() def insertAll(self): if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() if self.batch2.getSize() > 0: self.db.update(self.batch2) self.batch2.cleanBatch() if self.batch3.getSize() > 0: self.db.update(self.batch3) self.batch3.cleanBatch() if self.batch4.getSize() > 0: self.db.update(self.batch4) self.batch4.cleanBatch() if self.batch5.getSize() > 0: self.db.update(self.batch5) self.batch5.cleanBatch() if self.batch6.getSize() > 0: self.db.update(self.batch6) self.batch6.cleanBatch() def parse_data(self, data): try: category_item = data['param'][2] category_level2_id = str(category_item.get("id2")) item = data["data"] rest_id = getMapValue(item, "id") lat = float(getMapValue(item, "latitude")) lng = float(getMapValue(item, "longitude")) if len(rest_id) > 11: print('错误店铺id\nrest_id: {}'.format(rest_id)) # elif not (self.polygonHellp.is_location_in_polygon(lng, lat)): # self.err_rest_ids.add(rest_id) # print('错误地址\nrest_id: {}, lat: {}, lng: {}, name: {}\n' # 'address: {}'.format(rest_id, lat, lng, getMapValue(item, "name"), # getMapValue(item, "address"))) else: # 店铺去重 if not self.rest_category[rest_id]: param1 = [ self.city, self.date, self.rest_area, rest_id, getMapValue(item, "name"), getMapValue(item, "phone", '/'), getMapValue(item, "address"), getMapValue(item, "average_cost"), [1, 0][getMapValue(item, "delivery_mode") == '-999'], getMapValue(item, "float_delivery_fee"), getMapValue(item, "float_minimum_order_amount"), [0, 1][getMapValue(item, "is_new") == 'True'], [0, 1][getMapValue(item, "is_premium") == 'True'], lat, lng, getMapValue(item, "recent_order_num"), '-999' ] self.insert(self.batch1, param1) opening_hours = getMapValue(item, "opening_hours") arr = json.loads(opening_hours.replace("\'", "\"")) if not isinstance(arr, list): print('错误数据:', rest_id, opening_hours) else: for s in arr: times = s.split("/") param4 = [ self.city, self.date, self.rest_area, rest_id, *times ] self.insert(self.batch4, param4) # TODO 平均满减折扣有问题 discount_rate_list = [] for active in item.get('activities'): attribute = getMapValue(active, "attribute") active_type = getMapValue(active, "icon_name") param2 = [ self.city, self.date, self.rest_area, rest_id, getMapValue(active, "description"), active_type ] self.insert(self.batch2, param2) if active_type == '减' and attribute != '-999': arr = json.loads(attribute) for key, value in arr.items(): if isinstance(value, dict): sub_price = value.get("1") if sub_price == 0: sub_price = value.get("0") else: sub_price = value key = round(float(key), 2) sub_price = round(float(sub_price), 2) discount_rate = round(sub_price / key, 4) discount_rate_list.append(discount_rate) param5 = [ self.city, self.date, self.rest_area, rest_id, key, sub_price, discount_rate ] self.insert(self.batch5, param5) # TODO 平均满减折扣有问题 if len(discount_rate_list): avg_discount_rate = sum(discount_rate_list) / len( discount_rate_list) avg_discount_rate = round(avg_discount_rate, 4) param6 = [ self.city, self.date, self.rest_area, rest_id, avg_discount_rate ] self.insert(self.batch6, param6) # 店铺对应品类去重 if category_level2_id not in self.rest_category[rest_id]: self.rest_category[rest_id].add(category_level2_id) category_level1_id = category_item.get('id1') category_level1_name = category_item.get('name') category_level2_name = category_item.get('name2') param3 = [ self.city, self.date, self.rest_area, category_level1_id, category_level1_name, category_level2_id, category_level2_name, rest_id, getMapValue(item, "name") ] self.insert(self.batch3, param3) except Exception: # print("解析数据报错", traceback.format_exc()) print('解析数据报错\n错误数据:{}'.format(data))
class ParseScore(object): def __init__(self,resource_path, city, date,save_date_short, err_rest_ids,db): ''' :param resource_path: 城市数据文件路径 :param city: 城市拼音 :param date: 保存日期 :param save_date_short:保存日期简写 :param err_rest_ids: 错误店铺ID :param db: 数据库配置文件 ''' self.resource_path = resource_path self.city = city self.date = date self.save_date_short = save_date_short self.err_rest_ids = err_rest_ids self.db = db self.sql = "insert into t_e_rest_score_city_%s values" %save_date_short self.batch = BatchSql(self.sql) def parse_data(self,data): try: rest_id = data['param'] if rest_id in self.err_rest_ids: return if len(rest_id) <= 11: self.batch.addBatch([self.city, self.date, rest_id, getMapValue(data['data'], "compare_rating"), getMapValue(data['data'], "deliver_time"), getMapValue(data['data'], "food_score"), getMapValue(data['data'], "overall_score"), getMapValue(data['data'], "service_score")]) if self.batch.getSize() > 100000: self.db.update(self.batch) self.batch.cleanBatch() else: print("错误数据", data) print('错误数据\n{}'.format(data)) except Exception as e: print("解析数据报错", e) print('解析数据报错\n错误数据:{}'.format(data)) def run(self): f_name = os.path.join(self.resource_path, 'score.pickle') print("解析文件:", f_name) with open(f_name, "rb+") as f: while True: try: data = pickle.load(f) self.parse_data(data) except EOFError as e: print("文件读取完成", e) break except Exception as e2: print("报错", e2) break if self.batch.getSize() > 0: self.db.update(self.batch) self.batch.cleanBatch()
class ParseMenu(object): def __init__(self, resource_path, city, date, city_short, save_date_short, err_rest_ids, db): ''' :param resource_path: 数据文件路径 :param city: 城市拼音 :param date: 保存日期 :param city_short: 城市简写 :param save_date_short: 保存日期简写 :param err_rest_ids: 错误城市id :param db: 数据库配置信息 ''' self.resource_path = resource_path self.city = city self.date = date self.city_short = city_short self.save_date_short = save_date_short self.err_rest_ids = err_rest_ids self.db = db self.sql1 = "insert into t_e_rest_menu_level1_city_%s values" % save_date_short self.sql2 = "insert into t_e_rest_menu_level2_city_%s_%s values" % ( city_short, save_date_short) self.sql3 = "insert into t_e_rest_menu_level2_unique_city_%s_%s values" % ( city_short, save_date_short) self.sql4 = "insert into t_e_rest_menu_discount_city_%s values" % save_date_short self.batch1 = BatchSql(self.sql1) self.batch2 = BatchSql(self.sql2) self.batch3 = BatchSql(self.sql3) self.batch4 = BatchSql(self.sql4) self.rest_food = defaultdict(set) def parse_data(self, data): try: rest_id = getMapValue(data, 'param') if rest_id in self.err_rest_ids: return for item in data["data"]: # 验证是否是错误数据 try: foods = item['foods'][::-1] except KeyError: print('错误数据\n{}'.format(item)) continue menu_name = getMapValue(item, 'name') self.batch1.addBatch( [self.city, self.date, rest_id, menu_name]) if self.batch1.getSize() > 100000: self.db.update(self.batch1) self.batch1.cleanBatch() for item2 in foods: food_id = getMapValue(item2, 'virtual_food_id') food_price_current = getMapValue(item2["specfoods"][0], 'price') food_price_primary = getMapValue(item2["specfoods"][0], 'original_price') has_activity = '0' if food_price_primary not in ['-999', '0']: has_activity = '1' self.batch2.addBatch([ self.city, self.date, rest_id, food_id, getMapValue(item2, 'name'), getMapValue(item2, 'month_sales'), getMapValue(item2, 'rating'), getMapValue(item2, 'satisfy_count'), food_price_current, has_activity, menu_name ]) if self.batch2.getSize() > 100000: self.db.update(self.batch2) self.batch2.cleanBatch() if food_id not in self.rest_food[rest_id]: self.rest_food[rest_id].add(food_id) self.batch3.addBatch([ self.city, self.date, rest_id, food_id, getMapValue(item2, 'name'), getMapValue(item2, 'month_sales'), getMapValue(item2, 'rating'), getMapValue(item2, 'satisfy_count'), food_price_current, has_activity, menu_name ]) if self.batch3.getSize() > 100000: self.db.update(self.batch3) self.batch3.cleanBatch() if food_price_primary not in ['-999', '0']: discount = '%.6f' % (float(food_price_current) / float(food_price_primary)) self.batch4.addBatch([ self.city, self.date, rest_id, food_id, food_price_primary, food_price_current, discount ]) if self.batch4.getSize() > 100000: self.db.update(self.batch4) self.batch4.cleanBatch() except Exception as e: print("解析数据报错", e) print('解析数据报错\n错误数据:{}'.format(data)) def run(self): f_name = os.path.join(self.resource_path, 'menu.pickle') print("解析文件:", f_name) with open(f_name, "rb+") as f: while 1: try: data = pickle.load(f) self.parse_data(data) # break except EOFError as e: print("文件读取完成", e) break except Exception as e2: print("报错", e2) break if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() if self.batch2.getSize() > 0: self.db.update(self.batch2) self.batch2.cleanBatch() if self.batch3.getSize() > 0: self.db.update(self.batch3) self.batch3.cleanBatch() if self.batch4.getSize() > 0: self.db.update(self.batch4) self.batch4.cleanBatch()
class ParseMenuObject(ParseDataObject): """ category.pickle数据解析 """ def __init__(self, resource_path, city, date, rest_area, err_rest_ids, db, is_test=False): ParseDataObject.__init__(self, resource_path, is_test) self.err_rest_ids = err_rest_ids self.rest_food = defaultdict(set) self.city = city self.date = date self.rest_area = rest_area self.db = db sql1 = "insert into t_e_rest_menu_level1_import values" sql2 = "insert into t_e_rest_menu_level2_import values" sql3 = "insert into t_e_rest_menu_level2_unique_import values" sql4 = "insert into t_e_rest_menu_discount_import values" self.batch1 = BatchSql(sql1) self.batch2 = BatchSql(sql2) self.batch3 = BatchSql(sql3) self.batch4 = BatchSql(sql4) def parse(self, generator=None): generator = generator if generator else self.GeneratorPickleData() try: for data in generator: self.parse_data(data) self.insertAll() except: print("解析数据报错:", traceback.print_exc()) def insert(self, batch, param): batch.addBatch(param) if batch.getSize() > 100000: self.db.update(batch) batch.cleanBatch() def insertAll(self): if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() if self.batch2.getSize() > 0: self.db.update(self.batch2) self.batch2.cleanBatch() if self.batch3.getSize() > 0: self.db.update(self.batch3) self.batch3.cleanBatch() if self.batch4.getSize() > 0: self.db.update(self.batch4) self.batch4.cleanBatch() def parse_data(self, data): try: rest_id = getMapValue(data, 'param') # TODO err_rest_ids需要传入 if rest_id in self.err_rest_ids: return for item in data["data"]: # 验证是否是错误数据 try: foods = item['foods'][::-1] except KeyError: print('错误数据\n{}'.format(item)) continue menu_name = getMapValue(item, 'name') param1 = [ self.city, self.date, self.rest_area, rest_id, menu_name ] self.insert(self.batch1, param1) for item2 in foods: food_id = getMapValue(item2, 'virtual_food_id') food_price_current = getMapValue(item2["specfoods"][0], 'price') food_price_primary = getMapValue(item2["specfoods"][0], 'original_price') has_activity = '0' if food_price_primary not in ['-999', '0']: has_activity = '1' param2 = [ self.city, self.date, self.rest_area, rest_id, food_id, getMapValue(item2, 'name'), getMapValue(item2, 'month_sales'), getMapValue(item2, 'rating'), getMapValue(item2, 'satisfy_count'), food_price_current, has_activity, menu_name ] self.insert(self.batch2, param2) if food_id not in self.rest_food[rest_id]: self.rest_food[rest_id].add(food_id) param3 = [ self.city, self.date, self.rest_area, rest_id, food_id, getMapValue(item2, 'name'), getMapValue(item2, 'month_sales'), getMapValue(item2, 'rating'), getMapValue(item2, 'satisfy_count'), food_price_current, has_activity, menu_name ] self.insert(self.batch3, param3) if food_price_primary not in ['-999', '0']: discount = '%.6f' % (float(food_price_current) / float(food_price_primary)) param4 = [ self.city, self.date, self.rest_area, rest_id, food_id, food_price_primary, food_price_current, discount ] self.insert(self.batch4, param4) except Exception as e: print("解析数据报错", traceback.print_exc()) print('解析数据报错\n错误数据:{}'.format(data))
class ParseCategory(object): def __init__(self, resource_path, city, city_cn, date, date_short, db): ''' :param resource_path: 城市数据文件路径 :param city: 城市拼音 :param city_cn: 城市中文 :param date: 保存日期 :param date_short: 保存日期简写 :param db: 数据配置 :return: ''' self.resource_path = resource_path self.city = city self.city_cn = city_cn self.date = date self.date_short = date_short self.db = db self.sql1 = "insert into t_e_rest_list_city_pre_%s values" % date_short self.sql2 = "insert into t_e_rest_active_city_%s values" % date_short self.sql3 = "insert into t_e_rest_category_city_%s values" % date_short self.sql4 = "insert into t_e_rest_open_time_city_%s values" % date_short self.sql5 = "insert into t_e_rest_money_off_city_%s values" % date_short self.sql6 = "insert into t_e_rest_money_off_avg_city_%s values" % date_short self.batch1 = BatchSql(self.sql1) self.batch2 = BatchSql(self.sql2) self.batch3 = BatchSql(self.sql3) self.batch4 = BatchSql(self.sql4) self.batch5 = BatchSql(self.sql5) self.batch6 = BatchSql(self.sql6) self.err_rest_ids = set() self.rest_category = defaultdict(set) self.category_level1_list = [] self.category_level2_list = [] def get_data_by_level2_id(self, category_level2_id): category_level1_id, category_level1_name, category_level2_name = '-999', '-999', '-999' for item in self.category_level2_list: if category_level2_id == str(item.get('id2')): category_level2_name = str(item.get('name2')) category_level1_id = str(item.get('id1')) category_level1_name = str(item.get('name')) break # for item in category_level1_list: # if category_level1_id == str(item[0]): # category_level1_name = str(item[1]) # break return category_level1_id, category_level1_name, category_level2_id, category_level2_name def parse_data(self, data, m): ''' :param data: 城市数据 :param city: 城市拼音 :param date: 保存日期 :param m: 城市地图信息 :param date_short: 保存日期简写 :param db: 数据库配置信息 :return: ''' try: category_item = data['param'][2] category_level2_id = str(category_item.get("id2")) item = data["data"] rest_id = getMapValue(item, "id") lat = float(getMapValue(item, "latitude")) lng = float(getMapValue(item, "longitude")) if len(rest_id) > 11: print('错误店铺id\nrest_id: {}'.format(rest_id)) elif not (m.is_in_city(lng, lat)): self.err_rest_ids.add(rest_id) print('错误地址\nrest_id: {}, lat: {}, lng: {}, name: {}\n' 'address: {}'.format(rest_id, lat, lng, getMapValue(item, "name"), getMapValue(item, "address"))) else: # 店铺去重 if not self.rest_category[rest_id]: self.batch1.addBatch([ self.city, self.date, rest_id, getMapValue(item, "name"), getMapValue(item, "phone", '/'), getMapValue(item, "address"), getMapValue(item, "average_cost"), [1, 0][getMapValue(item, "delivery_mode") == '-999'], getMapValue(item, "float_delivery_fee"), getMapValue(item, "float_minimum_order_amount"), [0, 1][getMapValue(item, "is_new") == 'True'], [0, 1][getMapValue(item, "is_premium") == 'True'], lat, lng, getMapValue(item, "recent_order_num"), '-999' ]) if self.batch1.getSize() > 10000: self.db.update(self.batch1) self.batch1.cleanBatch() opening_hours = getMapValue(item, "opening_hours") arr = json.loads(opening_hours.replace("\'", "\"")) if not isinstance(arr, list): print('错误数据:', rest_id, opening_hours) print('错误数据\nrest_id: {}, opening_hours: {}'.format( rest_id, opening_hours)) else: for s in arr: times = s.split("/") self.batch4.addBatch( [self.city, self.date, rest_id, *times]) if self.batch4.getSize() > 100000: self.db.update(self.batch4) self.batch4.cleanBatch() # TODO 平均满减折扣有问题 discount_rate_list = [] for active in item.get('activities'): attribute = getMapValue(active, "attribute") active_type = getMapValue(active, "icon_name") self.batch2.addBatch([ self.city, self.date, rest_id, getMapValue(active, "description"), active_type ]) if self.batch2.getSize() > 100000: self.db.update(self.batch2) self.batch2.cleanBatch() if active_type == '减' and attribute != '-999': arr = json.loads(attribute) for key, value in arr.items(): if isinstance(value, dict): sub_price = value.get("1") if sub_price == 0: sub_price = value.get("0") else: sub_price = value key = round(float(key), 2) sub_price = round(float(sub_price), 2) discount_rate = round(sub_price / key, 4) discount_rate_list.append(discount_rate) self.batch5.addBatch([ self.city, self.date, rest_id, key, sub_price, discount_rate ]) if self.batch5.getSize() > 100000: self.db.update(self.batch5) self.batch5.cleanBatch() # TODO 平均满减折扣有问题 if len(discount_rate_list): avg_discount_rate = sum(discount_rate_list) / len( discount_rate_list) avg_discount_rate = round(avg_discount_rate, 4) self.batch6.addBatch( [self.city, self.date, rest_id, avg_discount_rate]) if self.batch6.getSize() > 100000: self.db.update(self.batch6) self.batch6.cleanBatch() # 店铺对应品类去重 if category_level2_id not in self.rest_category[rest_id]: self.rest_category[rest_id].add(category_level2_id) #TODO 取消了单独的品类文件,品类信息和店铺信息保存一起,可直接获取该店铺对应品类信息 # category_level1_id, category_level1_name, category_level2_id, category_level2_name = get_data_by_level2_id(category_level2_id) category_level1_id = category_item.get('id1') category_level1_name = category_item.get('name') category_level2_name = category_item.get('name2') self.batch3.addBatch([ self.city, self.date, category_level1_id, category_level1_name, category_level2_id, category_level2_name, rest_id, getMapValue(item, "name") ]) if self.batch3.getSize() > 100000: self.db.update(self.batch3) self.batch3.cleanBatch() except Exception as e: print("解析数据报错", traceback.format_exc()) print('解析数据报错\n错误数据:{}'.format(data)) def run(self): from ShopMonitor.elm.ElmWebParse.ElmMapUtil import ElmMapUtil m = ElmMapUtil(self.city_cn) f_name = os.path.join(self.resource_path, 'category.pickle') print("解析rest文件:", f_name) with open(f_name, "rb+") as f: while 1: try: data = pickle.load(f) self.parse_data(data, m) # parse_data(data, city, date,m,date_short,db) except EOFError as e: print("rest文件读取完成", e) break except Exception as e2: print("报错", e2) break if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() if self.batch2.getSize() > 0: self.db.update(self.batch2) self.batch2.cleanBatch() if self.batch3.getSize() > 0: self.db.update(self.batch3) self.batch3.cleanBatch() if self.batch4.getSize() > 0: self.db.update(self.batch4) self.batch4.cleanBatch() if self.batch5.getSize() > 0: self.db.update(self.batch5) self.batch5.cleanBatch() if self.batch6.getSize() > 0: self.db.update(self.batch6) self.batch6.cleanBatch() print('rest_list数:', len(self.rest_category.keys())) print('rest_category数:', sum([len(x) for x in self.rest_category.values()])) print(self.city_cn, "错误店铺数:", len(self.err_rest_ids)) # with open("err_rest_ids_" + city_cn + ".txt", "w") as f: # for item in err_rest_ids: # f.write(str(item) + '\n') return self.err_rest_ids
class ParseRatingObject(ParseDataObject): """ category.pickle数据解析 """ def __init__(self, resource_path, city, date, rest_area, db, is_test=False): ParseDataObject.__init__(self, resource_path, is_test) self.err_rest_ids = set() self.city = city self.date = date self.rest_area = rest_area self.db = db sql1 = "insert into t_e_rest_rating_import values" sql2 = "insert into t_e_rest_food_rating_import values" self.batch1 = BatchSql(sql1) self.batch2 = BatchSql(sql2) def parse(self, generator=None): generator = generator if generator else self.GeneratorPickleData() try: for data in generator: self.parse_data(data) self.insertAll() except: print("解析数据报错:", traceback.print_exc()) def insert(self,batch,param): batch.addBatch(param) if batch.getSize() > 100000: self.db.update(batch) batch.cleanBatch() def insertAll(self): if self.batch1.getSize() > 0: self.db.update(self.batch1) self.batch1.cleanBatch() if self.batch2.getSize() > 0: self.db.update(self.batch2) self.batch2.cleanBatch() def parse_data(self,data): print("data",data) try: rest_id = str(data["param"]) if len(rest_id) <= 11: item1 = data["data"] print('item1',item1) rating_id = str(uuid.uuid1()) print("rating_id",rating_id) rate_time = getMapValue(item1, "rated_at") rating_text = getMapValue(item1, "rating_text") reply_text = getMapValue(item1, "reply_text") rating_star = getMapValue(item1, "rating_star") username = getMapValue(item1, "username") param1 = [self.city, self.date,self.rest_area,rest_id,rating_id, rate_time, rating_text,reply_text,rating_star,username] self.insert(self.batch1,param1) for item2 in item1["item_ratings"]: food_id = getMapValue(item2, "food_id") food_name = getMapValue(item2,"food_name") image_hash = getMapValue(item2, "image_hash") param2 = [self.city, self.date,self.rest_area, rest_id, rating_id,rate_time, food_id,image_hash,food_name,rating_star,rating_text,reply_text] self.insert(self.batch2, param2) else: print("错误数据", data) except Exception as e: print(traceback.print_exc()) print('解析数据报错\n错误数据:{}'.format(data))