def params_init(self):
    """Load hyper-parameters from params.json under the model directory."""
    params_file = os.path.join(self.path_dir, "params.json")
    params = load_json(params_file)
    self.params = params
    # cache the two frequently used hyper-parameters as attributes
    self.len_max = params["len_max"]
    self.level_type = params["level_type"]
def load_pinyin_dict(self):
    """Merge the default pinyin dictionary into self.dict_pinyin.

    :return: None
    """
    # load_json returns a list; element 0 is the pinyin mapping
    pinyin_map = load_json(path_dict_pinyin)[0]
    self.dict_pinyin.update(pinyin_map)
def load_macropodus_dict(self):
    """Load the default base (macropodus) dictionary.

    :return: None
    """
    base_dict = load_json(path_dict_macropodus)[0]  # element 0 holds the dict
    # mirror the data into a defaultdict (no default factory, same as original)
    words_freq = defaultdict()
    words_freq.update(base_dict)
    self.dict_words_freq = words_freq  # word -> frequency mapping
def load_user_dict(self, path_user=path_dict_user, type_user="json"):
    """Load a user dictionary and merge it into the word-frequency dict.

    :param path_user: str, path to the user dict file, e.g. '/home/user.dict'
    :param type_user: str, file format: 'json', 'txt' (space-separated) or
                      'csv' (comma-separated)
    :return: None
    :raises RuntimeError: if path_user does not exist
    :raises EOFError: if type_user is not one of the supported formats
    """
    # NOTE(review): the original default was the placeholder "******"
    # (a redaction artifact) which always raised EOFError; restored "json".
    if not os.path.exists(path_user):
        raise RuntimeError("your path_user is not exist!")
    if type_user == "json":
        self.dict_user = load_json(path_user)[0]  # element 0 holds the dict
        for k, v in self.dict_user.items():
            if k not in self.dict_words_freq:
                self.dict_words_freq[k] = v
            else:
                # word already known: accumulate frequencies
                self.dict_words_freq[k] = self.dict_words_freq[k] + v
        self.num_words = sum(self.dict_words_freq.values())
    elif type_user in ("txt", "csv"):
        # 'txt' lines are "word freq" (space); 'csv' lines are "word,freq"
        sep = " " if type_user == "txt" else ","
        words_all = txt_read(path_user)
        for word_freq in words_all:
            wf = word_freq.split(sep)
            word = wf[0]
            # BUG FIX: freq parsed from text was left as str, so adding it to
            # an existing int frequency (and summing values below) raised
            # TypeError. Convert to int; fall back to 132 when no freq given.
            freq = int(wf[1]) if len(wf) == 2 else 132
            if word not in self.dict_words_freq:
                self.dict_words_freq[word] = freq
            else:
                self.dict_words_freq[word] = self.dict_words_freq[word] + freq
        self.num_words = sum(self.dict_words_freq.values())
    else:
        raise EOFError
# 加载模型权重 model.load_weights(path_dir + "/model.h5") # reader tokenizer token_dict = {} path_dict = os.path.join(path_model_dir, "vocab.txt") with codecs.open(path_dict, 'r', 'utf8') as reader: for line in reader: token = line.strip() token_dict = json.loads(token) vocab_size = len(token_dict) tokenizer = Tokenizer(token_dict) # params path_params = path_dir + "/params.json" params = load_json(path_params) len_max = params["len_max"] # l2i_i2l path_l2i_i2l = path_dir + "/l2i_i2l.json" l2i_i2l = load_json(path_l2i_i2l) def sentence2idx(text): text = extract_chinese(str(text).upper()) text = list(text) text = [text_one for text_one in text] len_leave = len_max - len(text) if len_leave >= 0: text_index = [ token_dict[text_char] if text_char in token_dict else token_dict['[UNK]']
def preprocess_label_question_to_idx_fit(self, embedding_type, path, embed, rate=1, crf_mode='reg'):
    """Build (x, y) arrays for model.fit: per line, convert question and label to indices.

    :param embedding_type: str, e.g. 'bert', 'albert', or a word2vec/random type
    :param path: str, training data file with one JSON object per line, e.g. 'train.json'
    :param embed: embedding object exposing sentence2idx() and len_max
    :param rate: float, fraction of the corpus to read, e.g. 0.9
    :param crf_mode: str, 'reg' or 'pad' (CRF input mode)
    :return: (x_all, y_) as np.array-based inputs/targets
    """
    # collect the set of labels actually present in the corpus
    label_set, len_all = self.preprocess_label2set(path, embedding_type)
    # build the label<->index mapping, or reuse the saved one (dev/validation path)
    if not os.path.exists(self.path_model_l2i_i2l):
        label2index = {}
        index2label = {}
        for count, label_one in enumerate(label_set):
            label2index[label_one] = count
            index2label[count] = label_one
        l2i_i2l = {'l2i': label2index, 'i2l': index2label}
        save_json(l2i_i2l, self.path_model_l2i_i2l)
    else:
        l2i_i2l = load_json(self.path_model_l2i_i2l)
    # number of lines to read; small corpora (<= 500 sampled lines) are read fully
    # so there is always enough data to train on
    len_ql = int(rate * len_all)
    if len_ql <= 500:
        len_ql = len_all

    def process_line(line, embed, l2i_i2l):
        """Convert one JSON line into (question embedding, one-hot label matrix)."""
        ques_label = json.loads(line.strip())
        label_org = ques_label["label"]
        label_index = [l2i_i2l["l2i"][lr] for lr in label_org]
        que_embed = embed.sentence2idx("".join(ques_label["question"]))
        if embedding_type in ['bert', 'albert']:
            # bert-style: reserve 2 slots for <CLS>/<SEP>, pad the remainder
            len_leave = embed.len_max - len(label_index) - 2
            if len_leave >= 0:
                label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]]
                                     + list(label_index)
                                     + [l2i_i2l["l2i"]["<PAD>"] for _ in range(len_leave)]
                                     + [l2i_i2l["l2i"]["<SEP>"]])
            else:
                # too long: truncate, keeping room for <CLS> and <SEP>
                label_index_leave = ([l2i_i2l["l2i"]["<CLS>"]]
                                     + label_index[0:embed.len_max - 2]
                                     + [l2i_i2l["l2i"]["<SEP>"]])
        else:
            # plain embeddings: pad/truncate labels to len_max
            len_leave = embed.len_max - len(label_index)
            if len_leave >= 0:
                label_index_leave = (list(label_index)
                                     + [l2i_i2l["l2i"]["<PAD>"] for _ in range(len_leave)])
            else:
                label_index_leave = label_index[0:embed.len_max]
        # one-hot encode against the full label vocabulary
        label_res = to_categorical(label_index_leave, num_classes=len(l2i_i2l["l2i"]))
        return que_embed, label_res

    x, y = [], []
    cout_all_line = 0
    # BUG FIX: the file handle was opened and never closed (resource leak);
    # use a context manager so it is released even on error.
    with open(path, "r", encoding="utf-8") as file_csv:
        for line in file_csv:
            if len_ql < cout_all_line:
                break
            cout_all_line += 1
            if line.strip():
                # NOTE: questions should already be <= len_max (word2vec)
                # or len_max-2 (bert/albert) before training
                x_line, y_line = process_line(line, embed, l2i_i2l)
                x.append(x_line)
                y.append(y_line.tolist())
    # assemble model inputs by (1) embedding type — bert/albert samples carry
    # three index arrays — and (2) CRF mode: 'pad' keeps the extra input,
    # 'reg' (and anything else) drops it.
    x_, y_ = np.array(x), np.array(y)
    if embedding_type in ['bert', 'albert']:
        x_1 = np.array([sample[0] for sample in x_])
        x_2 = np.array([sample[1] for sample in x_])
        x_3 = np.array([sample[2] for sample in x_])
        x_all = [x_1, x_2, x_3] if crf_mode == 'pad' else [x_1, x_2]
    else:
        x_1 = np.array([sample[0] for sample in x_])
        x_2 = np.array([sample[1] for sample in x_])
        x_all = [x_1, x_2] if crf_mode == 'pad' else x_1
    return x_all, y_
def __init__(self, path_model_l2i_i2l):
    """Remember the label<->index mapping path and load it when present."""
    self.path_model_l2i_i2l = path_model_l2i_i2l
    # only read the mapping if it already exists on disk; otherwise None
    if os.path.exists(path_model_l2i_i2l):
        self.l2i_i2l = load_json(path_model_l2i_i2l)
    else:
        self.l2i_i2l = None
def l2i_i2l_init(self):
    """Load the label<->index conversion mapping (l2i_i2l.json)."""
    mapping_file = os.path.join(self.path_dir, "l2i_i2l.json")
    self.l2i_i2l = load_json(mapping_file)
def tokenizer_init(self):
    """Load the token->index vocabulary for the tokenizer."""
    # NOTE(review): vocab.txt is read via load_json — presumably the file
    # actually stores a JSON dict despite the .txt extension; verify.
    self.token2idx = {}
    vocab_file = os.path.join(self.path_dir, "vocab.txt")
    self.token2idx = load_json(vocab_file)