import os

import torch
from torch.nn import Embedding

# `load_word_vectors` and `apply_not_known_words` are helpers assumed to be
# provided by the surrounding project.


def load_embedding_model(args, vocab):
    embedding_model = Embedding(vocab.size(), args.input_dim)
    if args.cuda:
        embedding_model = embedding_model.cuda()
    emb_file = os.path.join(
        args.data, args.emb_dir.split("/")[-1] + "_" + args.emb_file + '_emb.pth')
    emb = None
    if os.path.isfile(emb_file):
        cached = torch.load(emb_file)  # load the cache once, not twice
        if cached.size(1) == args.input_dim:
            emb = cached
    if emb is None:
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.emb_dir, args.emb_file))
        print('==> GLOVE vocabulary size: %d' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        not_known = []
        for word in vocab.token_to_idx.keys():
            glove_idx = glove_vocab.get_index(word)
            # compare against None (assuming get_index returns None for OOV
            # words); a bare truthiness test would also reject the word
            # stored at GloVe index 0
            if glove_idx is not None:
                emb[vocab.get_index(word)] = glove_emb[glove_idx]
            else:
                not_known.append(word)
                # random init for unknown words; note normal_(mean, std),
                # i.e. mean -0.05 and std 0.05 as written
                emb[vocab.get_index(word)] = torch.Tensor(
                    emb[vocab.get_index(word)].size()).normal_(-0.05, 0.05)
        if args.calculate_new_words:
            emb = apply_not_known_words(emb, args, not_known, vocab)
        torch.save(emb, emb_file)
    if args.cuda:
        emb = emb.cuda()
    # plug these into embedding matrix inside model
    embedding_model.weight.data.copy_(emb)
    return embedding_model
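# A hedged usage sketch for load_embedding_model above. Everything here is
# illustrative: the paths, the GloVe file stem, and the tiny stand-in vocab
# (which only implements the three members the function touches) are
# assumptions, not values from the original code.
import argparse


class TinyVocab:
    def __init__(self, tokens):
        self.token_to_idx = {t: i for i, t in enumerate(tokens)}

    def size(self):
        return len(self.token_to_idx)

    def get_index(self, token):
        return self.token_to_idx.get(token)


args = argparse.Namespace(
    data='data/', emb_dir='data/glove/', emb_file='glove.840B.300d',
    input_dim=300, cuda=False, calculate_new_words=False)
embedding_model = load_embedding_model(args, TinyVocab(['the', 'cat', 'sat']))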
def prepare_embeddings(vectors, word2index):
    """Wrap pre-computed word vectors in a torch Embedding layer."""
    embedding_model = Embedding(len(word2index), config.tree_config["input_dim"])
    if config.tree_config["cuda"]:
        embedding_model = embedding_model.cuda()
    torch_vectors = torch.tensor(vectors)
    if config.tree_config["cuda"]:
        torch_vectors = torch_vectors.cuda()
    # copy the vectors into the layer's weight without going through autograd
    embedding_model.weight.data.copy_(torch_vectors)
    return embedding_model
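# Minimal usage sketch for prepare_embeddings: the toy vocabulary and random
# vectors are assumptions; config.tree_config["input_dim"] must match the
# second dimension of `vectors`.
import numpy as np

word2index = {'<pad>': 0, '<unk>': 1, 'tree': 2, 'lstm': 3}
vectors = np.random.uniform(
    -0.05, 0.05,
    (len(word2index), config.tree_config["input_dim"])).astype('float32')
emb_layer = prepare_embeddings(vectors, word2index)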
from torch import Tensor


# Variant of load_embedding_model with explicit parameters instead of args.
def load_embedding_model(data: str, emb_dir: str, emb_file: str,
                         input_dim: int, vocab: Vocab, cuda: bool = False):
    new_emb_file = os.path.join(
        data, emb_dir.split("/")[-1] + "_" + emb_file + '_emb.pth')
    emb = None
    if os.path.isfile(new_emb_file):
        cached: Tensor = torch.load(new_emb_file)  # load the cache once
        if cached.size(1) == input_dim:
            emb = cached
    if emb is None:
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(emb_dir, emb_file))
        print('==> GLOVE vocabulary size: %d' % glove_vocab.size())
        emb = torch.zeros(vocab.size(), glove_emb.size(1))
        not_known = []
        for word in vocab.token_to_idx.keys():
            glove_idx = glove_vocab.get_index(word)
            if glove_idx is not None:  # truthiness would mis-handle index 0
                emb[vocab.get_index(word)] = glove_emb[glove_idx]
            else:
                not_known.append(word)
                emb[vocab.get_index(word)] = torch.Tensor(
                    emb[vocab.get_index(word)].size()).normal_(-0.05, 0.05)
        # cache under the derived name; saving to emb_file, as the original
        # did, would have overwritten the raw GloVe file
        torch.save(emb, new_emb_file)
    embedding_model = Embedding(emb.shape[0], input_dim)
    if cuda:
        emb = emb.cuda()
        embedding_model = embedding_model.cuda()
    # plug these into embedding matrix inside model
    embedding_model.weight.data.copy_(emb)
    return embedding_model
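# Usage sketch (paths and the `vocab` instance are assumptions): the first
# call builds the matrix from the raw GloVe file and caches it as a .pth
# next to the data; repeated calls with the same input_dim hit that cache.
model_a = load_embedding_model('data/', 'data/glove/', 'glove.840B.300d',
                               300, vocab, cuda=False)  # builds and caches
model_b = load_embedding_model('data/', 'data/glove/', 'glove.840B.300d',
                               300, vocab, cuda=False)  # served from cache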
import pickle
from collections import Counter

import numpy as np

# Vocab here matches the (legacy) torchtext Vocab signature (assumption).


def main(vocab_dir, embed_path):
    with open(embed_path, 'rb') as fread:
        # word2vec-style binary format: a header line "vocab_size embed_size",
        # then one "<word> <embed_size float32s>" record per word
        vocab_size, embed_size = map(int, fread.readline().strip().split())
        embed = np.zeros((vocab_size + 2, embed_size))  # rows for <unk>, <pad> added
        embed_stoi = {}
        binary_len = np.dtype('float32').itemsize * embed_size
        for i in range(vocab_size):
            # read the word byte by byte up to the separating space
            word = []
            while True:
                ch = fread.read(1)
                if ch == b' ':
                    word = b''.join(word).decode('utf-8')
                    break
                if ch != b'\n':
                    word.append(ch)
            embed_stoi[word] = i + 2
            # np.fromstring is deprecated; np.frombuffer reads the same bytes
            embed[i + 2] = np.frombuffer(fread.read(binary_len), dtype='float32')
    vocab = Vocab(Counter(list(embed_stoi)),
                  specials=['<unk>', '<pad>'], specials_first=True)
    embed_torch = Embedding(vocab_size + 2, embed_size)
    # writing into a parameter that requires grad must bypass autograd
    with torch.no_grad():
        for idx, key in enumerate(vocab.itos):
            if embed_stoi.get(key, 0) != 0:
                embed_torch.weight[idx] = torch.from_numpy(embed[embed_stoi[key]])
            else:
                embed_torch.weight[idx] = torch.zeros(embed_size)
    with open(os.path.join(vocab_dir, 'words'), 'wb') as fwrite:
        pickle.dump(vocab, fwrite)
    torch.save(embed_torch.state_dict(), os.path.join(vocab_dir, 'embed.pt'))
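# Hedged round-trip sketch: reload the two artifacts main() writes. The
# directory and embed_size values below are assumptions and must match the
# run that produced the files.
vocab_dir, embed_size = 'vocab/', 300
with open(os.path.join(vocab_dir, 'words'), 'rb') as f:
    saved_vocab = pickle.load(f)
embed = Embedding(len(saved_vocab.itos), embed_size)
embed.load_state_dict(torch.load(os.path.join(vocab_dir, 'embed.pt')))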
# Fragment from inside the per-batch training loop; `i` is the batch counter,
# and loss, optim, model_optim, emb_optim, lstm, model, emb and epoch are
# defined in the surrounding script.
loss.backward()
optim.step()
model_optim.step()
emb_optim.step()
if i % 100 == 0:
    # append evaluation numbers to a running stats file; a context manager
    # replaces the manual open()/close() pair
    with open('stats.txt', 'a') as stats:
        eval_performance(stats, epoch, i)
if i % 1000 == 0:
    print('Saving')
    with open('lstm_lstm.pickle', 'wb') as f:
        pickle.dump(lstm, f)
    with open('lstm_lstm.model', 'wb') as f:
        pickle.dump(model, f)
    # a binary torch checkpoint, despite the .txt extension
    torch.save(emb.state_dict(), 'emb_post_lstm.txt')
    # with open('emb_post_lstm.pickle', 'wb') as f:
    #     pickle.dump(emb, f)
i += 1

# after the loop: save the final LSTM state
with open('lstm_lstm.pickle', 'wb') as f:
    pickle.dump(lstm, f)
# with open('lstm_sequential.pickle', 'wb') as f:
#     pickle.dump(model, f)
# for a in acts:
#     spin, _, timestamp, words = a
#     loss = None
#     lbl = [pred1, pred2] if spin == 1 else [pred2, pred1]
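# Sketch of the optimizer wiring the fragment above implies: separate
# optimizers so the embedding table can step with its own learning rate.
# Optimizer choice, learning rates, and the zero_grad placement are
# assumptions, not taken from the original script.
optim = torch.optim.Adam(lstm.parameters(), lr=1e-3)
model_optim = torch.optim.Adam(model.parameters(), lr=1e-3)
emb_optim = torch.optim.Adam(emb.parameters(), lr=1e-4)
# before each loss.backward():
#     optim.zero_grad(); model_optim.zero_grad(); emb_optim.zero_grad()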