def build_embedding(embed_file, targ_vocab, wv_dim):
    """Build an embedding matrix for ``targ_vocab`` from a pretrained vector file.

    Every row starts as a uniform random draw from [-1, 1), row 0 is then
    zeroed, and each vocabulary word that also appears in ``embed_file``
    (after ``normalize_text``) has its row replaced by the vector from the file.

    Args:
        embed_file: Path to a UTF-8 text file with one entry per line: the
            word string followed by ``wv_dim`` whitespace-separated numbers.
        targ_vocab: Sized iterable of words; a word's position is its row
            index in the returned matrix.
        wv_dim: Number of trailing columns on each line that form the vector.

    Returns:
        A NumPy array of shape ``(len(targ_vocab), wv_dim)``.
    """
    vocab_size = len(targ_vocab)
    # Random encoding: every dimension is uniformly distributed over [-1, 1).
    emb = np.random.uniform(low=-1, high=1, size=(vocab_size, wv_dim))
    # Word 0 (<PAD>) is encoded as an all-zero vector.
    emb[0] = 0
    word_to_row = {word: row for row, word in enumerate(targ_vocab)}

    # Read the GloVe embedding file.
    with open(file=embed_file, encoding='utf-8') as f:
        for line_no, line in enumerate(f, start=1):
            # Progress indicator: one dot per 100000 lines read.
            if line_no % 100000 == 0:
                print('.', end='', flush=True)

            fields = line.split()
            # The last wv_dim columns are the vector; everything before them
            # is the word string (its pieces are re-joined without spaces).
            word_parts = fields[:-wv_dim]
            vector_parts = fields[-wv_dim:]
            # normalize_text is defined elsewhere in the project; presumably
            # it applies the same normalization used when the vocabulary was
            # built -- confirm against its definition.
            token = normalize_text(''.join(word_parts))

            if token not in word_to_row:
                continue
            # Vocabulary word: replace its random row with the file's vector.
            emb[word_to_row[token]] = [float(v) for v in vector_parts]
    return emb
def build_embedding(embed_file, targ_vocab, wv_dim):
    """Create the embedding matrix for a vocabulary, seeded from a vector file.

    The matrix is initialised with uniform random values in [-1, 1), its
    first row is set to zero, and the rows of words found in ``embed_file``
    are overwritten with the vectors read from that file.

    Args:
        embed_file: Path to a text file (opened with encoding "utf8") whose
            lines hold a word followed by ``wv_dim`` numeric columns.
        targ_vocab: Sized iterable of words; position in it is the row index.
        wv_dim: Width of each embedding vector.

    Returns:
        A NumPy array of shape ``(len(targ_vocab), wv_dim)``.
    """
    shape = (len(targ_vocab), wv_dim)
    emb = np.random.uniform(-1, 1, shape)
    emb[0] = 0  # <PAD> should be all 0 (scalar is broadcast across the row)

    # Map each word to its row; a repeated word keeps its last position.
    index_of = {}
    for position, word in enumerate(targ_vocab):
        index_of[word] = position

    with open(embed_file, encoding="utf8") as f:
        for seen, line in enumerate(f, 1):
            # Emit a progress dot every 100000 lines.
            if seen % 100000 == 0:
                print('.', end='', flush=True)
            elems = line.split()
            # Everything before the trailing wv_dim columns is the word;
            # normalize_text is an external helper (behavior not visible here).
            token = normalize_text(''.join(elems[:-wv_dim]))
            position = index_of.get(token)
            if position is not None:
                emb[position] = list(map(float, elems[-wv_dim:]))
    return emb