def createLocalWCDict(trainFile, min_count_w=2, min_count_c=2,
                      global_dict_path="data/atec/training-2-2.json"):
    """Build dynamic id-to-word/char and word/char-to-id dictionaries that
    depend on the given training split, reusing the ids from the global
    dictionary."""
    global_dict = loadDict(global_dict_path)
    global_w_v2i = global_dict["word"]["v2i"]
    global_c_v2i = global_dict["char"]["v2i"]

    data = read_cut_file(trainFile, with_label=True)
    sentences = data["sent1w"] + data["sent2w"]
    sentences_c = data["sent1c"] + data["sent2c"]
    savePath = os.path.join(os.path.dirname(trainFile),
                            os.path.basename(trainFile).split(".")[0])

    # Count word and char frequencies over both sentence columns.
    words, chars = {}, {}
    for sentence in sentences:
        for word in sentence:
            words[word] = words.get(word, 0) + 1
    for sentence in sentences_c:
        for char in sentence:
            chars[char] = chars.get(char, 0) + 1

    print("Size for text words: ", len(words))
    print("Size for global words: ", len(global_w_v2i))
    print("Size for text chars: ", len(chars))
    print("Size for global chars: ", len(global_c_v2i))

    # Keep tokens above the frequency thresholds and prepend the special tokens.
    vocab = ['<pad>', '<unk>'] + [w for w in words if words[w] >= min_count_w]
    vocab_c = ['<pad>', '<unk>'] + [c for c in chars if chars[c] >= min_count_c]

    # Word-level mappings: the local vocabulary reuses the global ids.
    v2i, i2v = {}, {}
    for word in vocab:
        idx = global_w_v2i[word]
        v2i[word] = idx
        i2v[idx] = word
    print("id for <pad>: ", v2i['<pad>'])
    print("id for <unk>: ", v2i['<unk>'])
    print("total vocab size: ", len(v2i))
    w_dict = {"v2i": v2i, "i2v": i2v}

    # Char-level mappings, built the same way.
    v2i, i2v = {}, {}
    for char in vocab_c:
        idx = global_c_v2i[char]
        v2i[char] = idx
        i2v[idx] = char
    print("id for <pad>: ", v2i['<pad>'])
    print("id for <unk>: ", v2i['<unk>'])
    print("total vocab size: ", len(v2i))
    c_dict = {"v2i": v2i, "i2v": i2v}

    d = {"word": w_dict, "char": c_dict}
    saveDict(d, savePath + "-" + str(min_count_w) + "-" + str(min_count_c) + ".json")
    return d
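
# Illustrative sketch (not part of the original module): how a fold-local
# dictionary produced by createLocalWCDict might be consumed to map tokenised
# words to global ids. The helper name `encode_words` and the example paths
# are assumptions for demonstration only.
def encode_words(sentence_words, local_dict):
    """Map a list of word tokens to global ids, falling back to <unk>."""
    v2i = local_dict["word"]["v2i"]
    unk = v2i['<unk>']
    return [v2i.get(w, unk) for w in sentence_words]

# Example (assumes the split file and the global dictionary already exist):
#   local_dict = createLocalWCDict("../data/atec/10/train0.csv", 2, 2,
#                                  "../data/atec/training-2-2.json")
#   ids = encode_words(["如何", "还款"], local_dict)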

if __name__ == "__main__":
    # createGlobalWCDict("../data/atec/training.csv", 2, 2)
    # createLocalWCDict("../data/atec/10/train0.csv", 2, 2, "../data/atec/training-2-2.json")
    global_dict = loadDict("../data/atec/training-2-2.json")
    # Inspect the longest words in the global vocabulary
    # (longest lengths observed: 9, 7, 6, 6, 6, 6, 5).
    c_len = {}
    for w in global_dict["word"]["v2i"].keys():
        c_len[w] = len(w)
    print(sorted(c_len.items(), key=lambda x: x[1], reverse=True))

def load_global_embedding_matrix(wv_path_w, wv_path_c,
                                 global_dict_path="data/atec/training-2-2.json"):
    """Load (or build and cache) word- and char-level embedding matrices aligned
    with the global dictionary, together with their OOV masks."""
    wv_path_list = [wv_path_w, wv_path_c]

    # Infer the embedding type ("glove", "word2vec", "fasttext") from the parent
    # directory of each path; an int path means random initialisation instead.
    try:
        wv_type_w = wv_path_w.split("/")[-2]
    except AttributeError:
        wv_type_w = None
    assert wv_type_w in ["glove", "word2vec", "fasttext", None]
    try:
        wv_type_c = wv_path_c.split("/")[-2]
    except AttributeError:
        wv_type_c = None
    assert wv_type_c in ["glove", "word2vec", "fasttext", None]
    wv_type_list = [wv_type_w, wv_type_c]

    # Infer the embedding file name (without extension) for each level.
    try:
        wv_name_w = wv_path_w.split("/")[-1].split(".")[0]
    except AttributeError:
        wv_name_w = None
    try:
        wv_name_c = wv_path_c.split("/")[-1].split(".")[0]
    except AttributeError:
        wv_name_c = None
    wv_name_list = [wv_name_w, wv_name_c]

    embeddings_list = []
    for i, wv_level in enumerate(["word", "char"]):
        wv_path = wv_path_list[i]
        if isinstance(wv_path, str):
            wv_type = wv_type_list[i]
            assert wv_type in ["glove", "word2vec", "fasttext"]
            wv_name = wv_name_list[i].replace("wc", wv_level)
            embed_path = global_dict_path.replace(
                ".json", "_" + wv_type + '_' + wv_name + ".npy")
            embed_path_oov = global_dict_path.replace(
                ".json", "_" + wv_type + '_' + wv_name + ".oov.npy")
            if os.path.exists(embed_path):
                # Reuse the cached embedding matrix and OOV mask.
                embeddings = np.load(embed_path)
                oov_mask = np.load(embed_path_oov)
                assert embeddings.shape[0] == oov_mask.shape[0]
            else:
                # Random init, then overwrite rows found in the pretrained
                # vectors; rows that stay random are marked in the OOV mask.
                oov_mask = []
                i2v = loadDict(global_dict_path)[wv_level]["i2v"]
                vocab_size = len(i2v)
                embedding_size = int(wv_name.split("-")[-1])
                embeddings = np.random.uniform(low=-0.1, high=0.1,
                                               size=(vocab_size, embedding_size))
                if wv_type == "word2vec":
                    model = models.Word2Vec.load(wv_path)
                else:
                    model = models.KeyedVectors.load_word2vec_format(
                        wv_path, binary=False)
                n_oov = 0
                for j in range(vocab_size):
                    word = i2v[str(j)]
                    try:
                        embeddings[j] = model[word]
                        oov_mask.append(0)
                    except KeyError:
                        n_oov += 1
                        print("Not in wv: id: %d, vocab: %s" % (j, word))
                        oov_mask.append(1)
                print("Size for oov: %d!" % n_oov)
                np.save(embed_path, embeddings)
                oov_mask = np.asarray(oov_mask, dtype=int).reshape(
                    (vocab_size, 1))
                np.save(embed_path_oov, oov_mask)
            embeddings_list.append((embeddings, oov_mask))
        elif isinstance(wv_path, int):
            # Random embeddings of the requested size; nothing is treated as OOV.
            i2v = loadDict(global_dict_path)[wv_level]["i2v"]
            vocab_size = len(i2v)
            embedding_size = int(wv_path)
            embeddings = np.random.uniform(low=-0.1, high=0.1,
                                           size=(vocab_size, embedding_size))
            oov_mask = np.asarray([0] * vocab_size, dtype=int).reshape(
                (vocab_size, 1))
            embeddings_list.append((embeddings, oov_mask))
        else:
            print("Unsupported type for wv_path!")
    return embeddings_list[0], embeddings_list[1]
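
# Illustrative usage sketch (not part of the original module). The file layout
# below is an assumption consistent with the parsing above: the parent directory
# names the vector type, the file name contains "wc" (replaced by the level) and
# ends with the embedding dimension. Passing an int instead of a path yields
# randomly initialised embeddings of that size.
#
#   word_emb, char_emb = load_global_embedding_matrix(
#       "wv/word2vec/atec-wc-300.txt",   # hypothetical pretrained word vectors
#       100,                             # random 100-d char embeddings
#       global_dict_path="data/atec/training-2-2.json")
#   word_matrix, word_oov_mask = word_emb
#   char_matrix, char_oov_mask = char_emb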