def inference(model, rv, auxiliary_embs=None):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = topic_encoder.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=margs.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long, topic_encoder.device)
    with torch.no_grad():
        model.eval()
        # inputs = [batch_size, sent_length]
        # auxiliary_embs = [batch_size, sent_length, aux_dim]
        emb_word = model.emb_layer(inputs, auxiliary_embs)
        # emb_word = [batch_size, sent_length, emb_dim]
        emb_sent = emb_word.mean(dim=1, keepdim=True)
        # emb_sent = [batch_size, 1, emb_dim]
        sent_length = emb_word.size(1)
        emb_sent_ex = emb_sent.expand(-1, sent_length, -1).contiguous()
        # emb_sent_ex = [batch_size, sent_length, emb_dim]
        alpha_score = model.attention(emb_word, emb_sent_ex)
        # alpha_score = [batch_size, sent_length, 1]
        alpha_norm = model.norm_attention(alpha_score)
        # alpha_norm = [batch_size, sent_length, 1]
        # alpha_norm.transpose(1, 2) = [batch_size, 1, sent_length]
        # bmm with emb_word = [batch_size, sent_length, emb_dim]
        emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
        # emb_attsent = [batch_size, 1, emb_dim]
        emb_topic = model.encoder(emb_attsent.squeeze(1))
        topic_class = model.norm_layer(emb_topic)
        # emb_topic = topic_class = [batch_size, nn_out_dim]
        label_prob, label_pred = topic_class.data.topk(topic_class.size(1))
    return label_prob, label_pred
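# The core of inference() is the attention-pooling step: the normalized scores
# alpha_norm act as per-token weights in a batched matrix product that collapses
# the token dimension. A minimal, self-contained sketch of just that step
# (toy tensors only; softmax stands in for whatever model.norm_attention does):
import torch

emb_word = torch.randn(1, 4, 6)                           # [batch_size, sent_length, emb_dim]
alpha_norm = torch.softmax(torch.randn(1, 4, 1), dim=1)   # [batch_size, sent_length, 1]

# [1, 1, 4] bmm [1, 4, 6] -> [1, 1, 6]: a weighted average of the word embeddings
emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
print(emb_attsent.shape)  # torch.Size([1, 1, 6])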
def train(self):
    train_data = Txtfile(self.args.train_file, firstline=False, word2idx=self.word2idx,
                         limit=self.args.sent_limit)
    model_filename = os.path.join(self.args.model_dir, self.args.model_file)
    max_epochs = self.args.max_epochs
    epoch_start = time.time()
    for epoch in range(1, max_epochs + 1):
        print("Epoch: %s/%s" % (epoch, max_epochs))
        train_loss = self.train_batch(train_data)
        print("UPDATES: - Train loss: %.4f" % train_loss)
        print("         - Save the model to %s at epoch %d" % (model_filename, epoch))
        # Move the model to CPU before saving to avoid running out of GPU memory
        self.model.to("cpu")
        torch.save(self.model.state_dict(), model_filename)
        self.model.to(self.device)
        epoch_finish = Timer.timeEst(epoch_start, epoch / max_epochs)
        print("\nINFO: - Trained time (remaining time for %d epochs): %s"
              % (max_epochs - epoch, epoch_finish))
        if self.args.decay_rate > 0:
            self.lr_decay(epoch)

    # Export the tuned word embeddings and the encoder/decoder topic embeddings
    word_emb, enc_emb, dec_emb = self.model.get_embs()
    id2topic = {i: "topic_%d" % i for i in range(enc_emb.shape[0])}
    Embeddings.save_embs(id2topic, dec_emb.transpose(),
                         os.path.join(self.args.model_dir, self.args.dtopic_emb_file))
    Embeddings.save_embs(id2topic, enc_emb,
                         os.path.join(self.args.model_dir, self.args.etopic_emb_file))
    Embeddings.save_embs(self.args.vocab.i2w, word_emb,
                         os.path.join(self.args.model_dir, self.args.tuned_word_emb_file))
    return
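# self.lr_decay() is not shown in this snippet. The sketch below is a hypothetical
# inverse-time decay, one common way a decay_rate hyperparameter is used; it assumes
# the trainer keeps self.optimizer and a base learning rate in self.args.lr.
def lr_decay(self, epoch):
    # Shrink the learning rate as training progresses: lr / (1 + decay_rate * epoch)
    new_lr = self.args.lr / (1 + self.args.decay_rate * epoch)
    for param_group in self.optimizer.param_groups:
        param_group["lr"] = new_lr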
def predict(self, rv):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = self.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=self.args.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
    self.model.eval()
    with torch.no_grad():
        label_prob, label_pred = self.model.inference(inputs)
    return label_prob, label_pred
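# Usage sketch: `learner` below is a hypothetical instance of the class that
# defines predict(); the review string is illustrative only.
label_prob, label_pred = learner.predict("The pad thai was great but the service was slow.")
print(label_pred)  # topic indices ranked by predicted probability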
rec_type = "LSTM"
ntoken = len(vocab.w2i)
nlabels = len(vocab.l2i)
emb_size = 50
hidden_size = 64
nlayers = 2
dropout = 0.5
bidirect = False

# embedding_matrix = create_embedding_matrix(vocab, ntoken, emb_size)
# print(embedding_matrix[5])
# embedding = nn.Embedding.from_pretrained(embedding_matrix)
# input = torch.LongTensor([1])
# print(embedding(input))

train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
# train_data = [sent[0] for sent in train_data]
train_batch = vocab.minibatches_with_label(train_data, batch_size=batch_size)
inpdata = []
outdata = []
for doc, label in train_batch:
    doc_pad_ids, doc_lengths = seqPAD.pad_sequences(doc, pad_tok=vocab.w2i[PAD])
    doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, device)
    doc_lengths_tensor = Data2tensor.idx2tensor(doc_lengths, device)
    label_tensor = Data2tensor.idx2tensor(label, device)
    inpdata.append(doc_tensor)
    outdata.append(label_tensor)
    break
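# seqPAD.pad_sequences is used above as a black box. The helper below is a
# minimal sketch of the kind of padding it performs; the function name, exact
# signature, and return values are assumptions made for illustration only.
def pad_to_max(sequences, pad_tok=0):
    # Pad each id sequence with pad_tok up to the length of the longest one,
    # returning the padded batch and the original lengths.
    max_len = max(len(seq) for seq in sequences)
    padded = [seq + [pad_tok] * (max_len - len(seq)) for seq in sequences]
    lengths = [len(seq) for seq in sequences]
    return padded, lengths

padded, lengths = pad_to_max([[4, 7, 2], [9, 3]], pad_tok=0)
# padded  -> [[4, 7, 2], [9, 3, 0]]
# lengths -> [3, 2]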
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile

    filename = "../data/train.txt"
    vocab = Vocab(wl_th=None, cutoff=2)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=4)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=1024)
        w_tensor = Data2tensor.idx2tensor(word_ids)
        y_tensor = Data2tensor.idx2tensor(labels)
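# The if/else fragment above covers the two prediction modes: top-k over a score
# matrix for multi-class output, and a sigmoid threshold for binary output.
# A self-contained illustration of both branches with toy scores:
import torch

# Multi-class: pick the k highest-scoring labels per example.
label_score = torch.randn(2, 5)            # [batch_size, nlabels]
label_prob, label_pred = label_score.topk(2, dim=1)

# Binary: squash a single score per example and threshold at 0.5.
binary_score = torch.randn(2, 1)
binary_prob = torch.sigmoid(binary_score.squeeze())
binary_pred = (binary_prob >= 0.5).long()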
import random

import torch

from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings

Data2tensor.set_randseed(1234)
use_cuda = torch.cuda.is_available()
filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
idf_file = "./idf.txt"
vocab = Vocab(wl_th=None, wcutoff=5)
vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

word2idx = vocab.wd2idx(vocab_words=vocab.w2i, unk_words=True, se_words=False)
train_data = Txtfile(filename, firstline=False, word2idx=word2idx, limit=100000)

batch_size = 8
neg_sampling = 5
no_chunks = batch_size * (neg_sampling + 1)
train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
data = []
label = []
for inp_ids in train_iters:
    padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
    data_tensor = Data2tensor.idx2tensor(padded_inp)
    # Shuffle the chunks so negatives are drawn randomly from the batch
    perm_ids = torch.randperm(no_chunks)
    data_tensor = data_tensor[perm_ids]
    # Regroup into [batch_size, neg_sampling + 1, sent_length]: each row holds
    # one target sentence plus neg_sampling negatives
    data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
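# The last three lines implement the negative-sampling layout: a flat batch of
# no_chunks padded sentences is shuffled and reshaped so each of the batch_size
# rows pairs one sentence with neg_sampling randomly drawn negatives.
# A toy version of that reshaping with made-up word ids:
import torch

batch_size, neg_sampling, sent_length = 2, 3, 4
no_chunks = batch_size * (neg_sampling + 1)

# Pretend we already have no_chunks padded sentences of word ids.
flat_batch = torch.arange(no_chunks * sent_length).view(no_chunks, sent_length)

# Shuffle the sentences, then group them: row i holds one sentence followed by
# neg_sampling sentences that act as its negatives.
shuffled = flat_batch[torch.randperm(no_chunks)]
grouped = shuffled.view(batch_size, neg_sampling + 1, -1)
print(grouped.shape)  # torch.Size([2, 4, 4])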