def get_chaos_menu():
    '''
    Look for a shop whose parsed menu contains one hard-coded dish name.

    input:
        none; reads '../data/res.out_json_offline', one JSON object per line
        whose keys are source tags ('eleme', 'baidu', 'meituan') and whose
        values are shop ids
    output:
        sys.stdout  the running line count every 2000 lines, then the
        (tag, id) pair of the first shop whose dish names contain the target
    raises:
        KeyError    if a line carries a key that is not a known source tag
        SystemExit  via sys.exit() as soon as a match has been printed
    '''
    sql = "select menu from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_waimai')
    cursor = mysql_obj['cursor']
    bd_parser = BdWmParser()
    mt_parser = MtWmParser()
    elm_parser = ElemeParser()
    # NOTE(review): never used below; kept only in case constructing it has
    # side effects -- confirm and remove.
    menu_merger_obj = MenuMerger()
    tb_dic = {
        'eleme': 'eleme_shop',
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop',
    }
    parser_dic = {
        'eleme': elm_parser,
        'baidu': bd_parser,
        'meituan': mt_parser,
    }
    ss = u'𠧧鸭腿'
    # `with` closes the input file on every exit path, including sys.exit().
    with open('../data/res.out_json_offline', 'r') as fin:
        for num, line in enumerate(fin, 1):
            if num % 2000 == 0:
                print (num)
            dic = json.loads(line.strip())
            for tag, _id in dic.items():
                tb_name = tb_dic[tag]
                cursor.execute(sql.format(tb_name, _id))
                res = cursor.fetchone()
                if not res:
                    # no row for this id in this source table
                    continue
                parser = parser_dic[tag]
                ori_ls = parser.parse_one_menu(res['menu'])
                name_set = set(parser.get_all_food(ori_ls))
                if ss in name_set:
                    print (tag, _id)
                    # NOTE(review): the original indentation was lost; the exit
                    # is assumed to belong to the match branch (stop at the
                    # first hit) -- confirm.
                    sys.exit()
def parse_specific_menu():
    '''
    Fetch, parse and print the menu of a shop named on stdin.

    input:
        sys.stdin   one JSON object per line mapping a source tag
                    ('baidu', 'meituan', 'eleme') to a shop id
    output:
        sys.stdout  the parsed menu dumped as JSON (ensure_ascii=False) and
                    encoded to utf8
    raises:
        KeyError    if a key of the JSON object is not a known source tag
        SystemExit  via sys.exit() after a menu has been printed
    '''
    bd_parser = BdWmParser()
    mt_parser = MtWmParser()
    elm_parser = ElemeParser()
    src_tb_dic = {
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop',
        'eleme': 'eleme_shop',
    }
    tb_parser_dic = {
        'baidu_waimai_shop': bd_parser,
        'meituan_waimai_shop': mt_parser,
        'eleme_shop': elm_parser,
    }
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_online')
    cursor = mysql_obj['cursor']
    # The template never changes, so build it once outside the loops.
    sql = "select menu from `{}` where id='{}' limit 1"
    for line in sys.stdin:
        shop_dic = json.loads(line.strip())
        for src, _id in shop_dic.items():
            src_tb = src_tb_dic[src]
            parser = tb_parser_dic[src_tb]
            cursor.execute(sql.format(src_tb, _id))
            # Use a separate name for the row so the dict being iterated is
            # not shadowed.
            row = cursor.fetchone()
            # fetchone() gives None when nothing matched; treat that like a
            # row without a menu instead of crashing on `.get`.
            menu = row.get('menu', '') if row else ''
            menu_ls = parser.parse_one_menu(menu)
            print (json.dumps(menu_ls, ensure_ascii=False).encode('utf8'))
            # NOTE(review): the original indentation was lost; the exit is
            # assumed to follow the print directly (stop after the first
            # menu) -- confirm.
            sys.exit()
def menu_cluster(fn='merged_shop.json'):
    '''
    Cluster the dishes of every merged shop.

    NOTE(review): this file defines a second function with the same name
    further down; the later definition replaces this one -- confirm which
    of the two is meant to survive.

    input:
        fn  string  name of the merged-shop file, one merged shop per line
            {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output:
        prints one JSON object per shop (encoded to utf8):
            {'id': <shop id>, 'foods': {<cluster label>: [food_dic, ...]}}
        every food_dic additionally gets the keys '__source' and '__id'
    raises:
        KeyError  if a line has no 'id' key
    '''
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_online')
    cursor = mysql_obj['cursor']
    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()
    tb_dic = {
        'eleme': 'eleme_shop',
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop'
    }
    # `with` makes sure the input file is closed even if a row fails midway.
    with open(fn, 'r') as fin:
        for line in fin:
            dic = json.loads(line.strip())
            feat_ls = []
            for tag, _id in dic.items():
                # keys such as "id" are not source tags and have no table
                if tag not in tb_dic:
                    continue
                cursor.execute(sql.format(tb_dic[tag], _id))
                res = cursor.fetchone()
                feat_ls.extend(feat_gen.generate_feature_with_food_dic(res, tag))
            label_ls = cluster_obj.cluster(feat_ls)
            # OrderedDict keeps clusters in the order their label first appears.
            foods = OrderedDict()
            for feat, label in zip(feat_ls, label_ls):
                # the fourth item is unpacked but not needed here
                src, shop_id, food_dic, _ = feat[:4]
                food_dic['__source'] = src
                food_dic['__id'] = shop_id
                foods.setdefault(label, []).append(food_dic)
            res_dic = {'id': dic['id'], 'foods': foods}
            print(json.dumps(res_dic, ensure_ascii=False).encode('utf8'))
def menu_cluster(fn = '../menu_fusion/res_dic.json'):
    '''
    Cluster the dishes of merged shops and print them as tab-separated rows.

    NOTE(review): an earlier function in this file has the same name; this
    later definition replaces it -- confirm which of the two is meant to
    survive.

    input:
        fn  string  name of the merged-shop file, one merged shop per line
            {"baidu":'', "eleme":'', "meituan":'', "id":''}
    output:
        prints, per shop, one tab-joined line per feature row with the
        cluster label appended as the last column, sorted by that label,
        followed by an empty line; stops after 10 shops
    '''
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_online')
    cursor = mysql_obj['cursor']
    feat_gen = FeatureGenerator()
    cluster_obj = SimpleCluster()
    tb_dic = {
        'eleme': 'eleme_shop',
        'baidu': 'baidu_waimai_shop',
        'meituan': 'meituan_waimai_shop',
    }
    num = 0
    # The original read `open(, 'r')`, a syntax error; the file to read is
    # the `fn` argument.  `with` also closes it on the early `break`.
    with open(fn, 'r') as fin:
        for line in fin:
            dic = json.loads(line.strip())
            feat_ls = []
            for tag, _id in dic.items():
                # the documented "id" key is not a source tag; skip it rather
                # than fail on the table lookup
                if tag not in tb_dic:
                    continue
                tb_name = tb_dic[tag]
                cursor.execute(sql.format(tb_name, _id))
                res = cursor.fetchone()
                feat_ls.extend(feat_gen.generate_feature_with_food_dic(res, tb_name))
            label_ls = cluster_obj.cluster(feat_ls)
            out_ls = []
            for feat, label in zip(feat_ls, label_ls):
                feat.append(str(label))
                # anything that is not already a str is encoded to utf8 so
                # the row can be joined
                out_ls.append([ss if isinstance(ss, str) else ss.encode('utf8')
                               for ss in feat])
            # group rows of the same cluster together (stable sort on label)
            out_ls.sort(key=lambda x: x[-1])
            for row in out_ls:
                print ('\t'.join(row))
            print ('')
            num += 1
            # presumably a debugging cap: only the first 10 shops are printed
            if num == 10:
                break
def get_unlabeled_data():
    '''
    Print the dishes of sampled shops, one dish per line.

    input:
        sys.stdin   \t split, exactly 8 columns per line
            id cnt name type city meituan_id eleme_id baidu_id
        lines with a different column count are skipped; a shop id is only
        processed the first time it is seen
    output:
        sys.stdout  \t split, one line per dish
            id name type city food_name source_id source_table
        an empty line is printed after each shop; stops after 50 distinct
        shop ids
    '''
    sql = "select * from `{}` where id='{}'"
    mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_online')
    cursor = mysql_obj['cursor']
    bd_parser = BdWmParser()
    mt_parser = MtWmParser()
    elm_parser = ElemeParser()
    # tables and parsers in the same order as the id columns m_id, e_id, b_id
    tb_ls = ['meituan_waimai_shop', 'eleme_shop', 'baidu_waimai_shop']
    parser_ls = [mt_parser, elm_parser, bd_parser]
    id_set = set()
    for line in sys.stdin:
        # Only strip the line ending: a plain strip() would also eat trailing
        # tabs, so a row whose last id column is empty would lose a field
        # and be skipped.
        ls = line.rstrip('\r\n').split('\t')
        try:
            _id, cnt, name, typ, city, m_id, e_id, b_id = ls
        except ValueError:
            # wrong number of columns
            continue
        if _id in id_set:
            continue
        id_set.add(_id)
        for a_id, tb, parser in zip([m_id, e_id, b_id], tb_ls, parser_ls):
            if not a_id:
                # shop has no counterpart in this source
                continue
            cursor.execute(sql.format(tb, a_id))
            row = cursor.fetchone()
            # fetchone() gives None when nothing matched; treat that like a
            # row without a menu instead of crashing on `.get`.
            menu = row.get('menu', '') if row else ''
            for food in parser.get_all_food_from_menu(menu):
                str_ls = [_id, name, typ, city, food.encode('utf8'), a_id, tb]
                print ('\t'.join(str_ls))
        # NOTE(review): the original indentation was lost; the blank
        # separator is assumed to be printed once per shop -- confirm.
        print ('')
        if len(id_set) >= 50:
            break
def get_sample_shop_from_sample_file(): ''' 通过采样文件获取同一商家在不同app中的id 输入: sys.stdin <<<<<<< HEAD 720e49b2f4c6991ff4b3b6500fd815ba 1 印象柳螺柳州螺蛳粉•匠心制造 waimai meituan_id cnt brand_name type 输出: sys.stdout meituan_id cnt brand_name meituan_id eleme_id baidu_id ======= 720e49b2f4c6991ff4b3b6500fd815ba 1 印象柳螺柳州螺蛳粉•匠心制造 waimai meituan_id cnt brand_name type 输出: sys.stdout meituan_id cnt brand_name meituan_id eleme_id baidu_id >>>>>>> 4cca5f0e4264c66366b590f7a182ce39f800a3b1 ''' sql = "select * from `std_shop` where id='{}' limit 1" mysql_obj = get_mysql_obj(os.sep.join([conf_dir, 'db.conf']), 'mysql_waimai') conn = mysql_obj['conn'] cursor = mysql_obj['cursor'] for line in sys.stdin: