def brand_reg(self, s_name, brand_lst, brand_id):
    """Collect the brand aliases from ``brand_lst`` that occur in ``s_name``.

    Args:
        s_name: The text to search; each alias is tested with ``alias in s_name``.
        brand_lst: Iterable of candidate alias strings for one brand.
        brand_id: Identifier paired with every recognised alias in the result.

    Returns:
        A list of ``(recognised_name, brand_id)`` tuples, in ``brand_lst``
        order. Aliases that do not occur in ``s_name`` are dropped. When
        ``tool.is_all_eng(alias)`` and ``tool.is_own_eng(s_name)`` are both
        truthy, the alias is passed to ``self.english_brand_recognition`` and
        is dropped if that call returns ``None``; otherwise its return value
        is used as the recognised name. In every other case the alias itself
        is used.
    """
    r_lst = []
    for ext_bname in brand_lst:
        # Both original branches required a substring hit, so test it first
        # and avoid the helper calls for aliases that cannot match.
        if ext_bname not in s_name:
            continue
        if tool.is_all_eng(ext_bname) and tool.is_own_eng(s_name):
            reg_bname = self.english_brand_recognition(ext_bname, s_name)
            if reg_bname is None:
                continue
        else:
            reg_bname = ext_bname
        r_lst.append((reg_bname, brand_id))
    # NOTE: a post-processing step, self.rule_opt(s_name, r_lst), used to be
    # applied to the result here and is currently disabled.
    return r_lst
def english_brand_extension(self, brand_name):
    """Expand a brand name into a "/"-joined string of spelling variants.

    The variants are meant to be stored directly among the recall brands.

    Case 1 - remove special characters:
        A.H.C/爱和纯 -> AHC爱和纯 -> A.H.C/爱和纯/AHC爱和纯
        A.O.史密斯   -> AO史密斯  -> A.O.史密斯/AO史密斯
    Case 2 - remove whitespace inside English names:
        MAKE UP FOR EVER -> MAKEUPFOREVER
        COLOR KEY        -> COLORKEY
        a b c/某某某      -> abc/a b c/某某某/abc某某某

    A single trailing parenthesised qualifier is handled first, e.g.
    ``Hisense/海信(黑电)``: the qualifier is promoted to a leading brand part
    when ``tool.is_all_eng`` accepts it and is discarded otherwise.

    Args:
        brand_name: Raw brand string; parts are separated by "/".

    Returns:
        The unique variants joined with "/". Variants keep the order in which
        they are generated, so the result is stable across runs.
    """

    def _single_brand_ext(tmp_b_name):
        """Return the name plus its whitespace-free and dot-free forms."""
        no_space = re.sub(r"[\s]+", "", tmp_b_name)
        # NOTE(review): the original chained a second, identical
        # .replace(".", ""); if a full-width dot variant was intended it
        # should be added here - confirm against real brand data.
        no_dot = tmp_b_name.replace(".", "")
        # dict.fromkeys de-duplicates while keeping first-seen order.
        return list(dict.fromkeys([tmp_b_name, no_space, no_dot]))

    # Normalise full-width parentheses to ASCII, then drop every ")" so that
    # "name(qualifier)" splits into exactly [name, qualifier] on "(".
    normalized = (
        brand_name.strip()
        .replace("\uff08", "(")
        .replace("\uff09", ")")
        .replace(")", "")
    )
    paren_parts = normalized.split("(")
    if len(paren_parts) == 2:
        base, qualifier = paren_parts
        if tool.is_all_eng(qualifier):
            ok_brand_name = qualifier + "/" + base
        else:
            ok_brand_name = base
    else:
        # Zero or several "(": fall back to the untouched input.
        ok_brand_name = brand_name

    brand_lst = ok_brand_name.strip().split("/")
    if len(brand_lst) == 1:
        re_brand_lst = _single_brand_ext(brand_lst[0])
    else:
        en_brand_lst = []
        ch_brand_lst = []
        other_brand_lst = []
        for part in brand_lst:
            if tool.is_all_eng(part):
                en_brand_lst.append(part)
            elif tool.is_all_chinese(part):
                ch_brand_lst.append(part)
            else:
                other_brand_lst.append(part)

        en_brand_ext_lst = []
        for en_name in en_brand_lst:
            en_brand_ext_lst += _single_brand_ext(en_name)

        mix_brand_lst = []
        # Every English variant glued to every Chinese part, in both orders.
        for en_variant in en_brand_ext_lst:
            for ch_name in ch_brand_lst:
                mix_brand_lst.append(en_variant + ch_name)
                mix_brand_lst.append(ch_name + en_variant)
        # Every unordered pair of the original (unexpanded) English parts,
        # concatenated in both orders.
        for i, first in enumerate(en_brand_lst):
            for second in en_brand_lst[i + 1:]:
                mix_brand_lst.append(first + second)
                mix_brand_lst.append(second + first)

        re_brand_lst = (
            mix_brand_lst + en_brand_ext_lst + ch_brand_lst + other_brand_lst
        )

    return "/".join(dict.fromkeys(re_brand_lst))
def brand_info_loading(self):
    """Read the brand info file and build the brand lookup tables.

    ``self._brand_info_file`` is opened as UTF-8 text and read line by line.
    Blank lines, lines starting with ``#`` and lines that do not split into
    exactly six tab-separated fields are skipped. Every other line is
    unpacked as::

        brand_id, original_brand_name, brand_name, cat1_id, cat1, gmv

    If ``brand_name`` is a key of ``self._exchange_brand_dict`` it is replaced
    by the mapped value before further processing.

    Returns:
        A tuple of eight dicts, in this order:

        * ``brand_idx_dict``: ``"<alias>|<flag>"`` -> list of brand ids
          (unique, first-seen order). Aliases are the elements returned by
          ``tool.brand_dealing(brand_name)``; ``flag`` is ``"0"`` when
          ``tool.is_all_eng(alias)`` is truthy and ``"1"`` otherwise.
        * ``idx_ori_brand_dict``: brand id -> ``brand_name`` as read from the
          file (before the exchange mapping).
        * ``name_ori_brand_dict``: brand id -> ``original_brand_name``, except
          for the hard-coded id ``'10698337'``.
        * ``brand_cat1_dict``: brand id -> list of ``cat1_id`` values, for
          rows whose ``cat1`` is not ``"NULL"``.
        * ``cat1_brand_dict``: ``cat1_id`` -> list of ``"<brand_id>|<name>"``
          strings, most recently read row first, for rows whose ``cat1`` is
          not ``"NULL"``.
        * ``cat1_clean_brand_dict``: ``cat1_id`` -> list of the
          ``"<alias>|<flag>"`` keys produced for that category (duplicates
          kept).
        * ``brand_gmv_dict``: brand id -> ``gmv`` as a float rounded to three
          decimals.
        * ``cat1_dict``: ``cat1_id`` -> ``cat1``.

    Raises:
        OSError: If the brand info file cannot be opened.
        ValueError: If a six-field row has a ``gmv`` that ``float`` rejects.
    """
    brand_idx_dict = {}
    idx_ori_brand_dict = {}
    name_ori_brand_dict = {}
    brand_cat1_dict = {}
    cat1_brand_dict = {}
    cat1_clean_brand_dict = {}
    brand_gmv_dict = {}
    cat1_dict = {}

    with open(self._brand_info_file, "r", encoding="utf-8") as f1:
        for line in f1:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            fields = [field.strip() for field in line.split("\t")]
            if len(fields) != 6:
                continue
            b_id, b_name_ori, b_name, cat1_id, cat1, gmv = fields

            # id -> names
            idx_ori_brand_dict[b_id] = b_name
            if b_id == '10698337':
                # Hard-coded override of the original name for this brand id.
                name_ori_brand_dict[b_id] = 'Xiaomi/小米'
            else:
                name_ori_brand_dict[b_id] = b_name_ori
            cat1_dict[cat1_id] = cat1

            if b_name in self._exchange_brand_dict:
                b_name = self._exchange_brand_dict[b_name]
            r_brand_set = tool.brand_dealing(b_name)
            brand_gmv_dict[b_id] = round(float(gmv), 3)

            # alias -> brand ids
            for alias in r_brand_set:
                # Skip single-character and purely numeric aliases.
                if len(alias) == 1 or tool.is_number(alias):
                    continue
                is_eng = tool.is_all_eng(alias)
                # Skip English aliases shorter than three characters.
                if is_eng and len(alias) < 3:
                    continue
                # Skip aliases listed for deletion.
                if alias in self._del_brand_dict:
                    continue
                key = "%s|%s" % (alias, "0" if is_eng else "1")
                ids = brand_idx_dict.setdefault(key, [])
                if b_id not in ids:
                    ids.append(b_id)
                cat1_clean_brand_dict.setdefault(cat1_id, []).append(key)

            # category <-> brand, only for rows with a real category
            if cat1 != "NULL":
                brand_cat1_dict.setdefault(b_id, []).append(cat1_id)
                # Newest entry goes first, matching the established ordering.
                cat1_brand_dict.setdefault(cat1_id, []).insert(
                    0, "%s|%s" % (b_id, b_name)
                )

    return brand_idx_dict, idx_ori_brand_dict, name_ori_brand_dict,\
        brand_cat1_dict, cat1_brand_dict, \
        cat1_clean_brand_dict, brand_gmv_dict, cat1_dict
def _is_all_eng(self, s1):
    """Return whatever ``tool.is_all_eng`` returns for ``s1``."""
    verdict = tool.is_all_eng(s1)
    return verdict