Code Example #1
File: serve.py  Project: tin-chata/topic-modelling
def inference(model, rv, auxiliary_embs=None):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = topic_encoder.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id],
                                         pad_tok=margs.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long,
                                    topic_encoder.device)

    with torch.no_grad():
        model.eval()
        # inputs = [batch_size, sent_length]
        # auxiliary_embs = [batch_size, sent_length, aux_dim]
        emb_word = model.emb_layer(inputs, auxiliary_embs)
        # emb_word = [batch_size, sent_length, emb_dim]
        emb_sent = emb_word.mean(dim=1, keepdim=True)
        # emb_sent = [batch_size, 1, emb_dim]
        sent_length = emb_word.size(1)
        emb_sent_ex = emb_sent.expand(-1, sent_length, -1).contiguous()
        # emb_sent_ex = [batch_size, sent_length, emb_dim]
        alpha_score = model.attention(emb_word, emb_sent_ex)
        # alpha_score = [batch_size, sent_length, 1]
        alpha_norm = model.norm_attention(alpha_score)
        # alpha_norm = [batch_size, sent_length, 1]
        emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
        # emb_attsent = [batch_size, 1, emb_dim] <------
        # alpha_norm.transpose(1, 2) = [batch_size, 1, sent_length] dot emb_word = [batch_size, sent_length, emb_dim]
        emb_topic = model.encoder(emb_attsent.squeeze(1))
        topic_class = model.norm_layer(emb_topic)
        # emb_topic = topic_class = [batch_size, nn_out_dim]
        label_prob, label_pred = topic_class.data.topk(topic_class.size(1))
        return label_prob, label_pred
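The core of this example is the attention pooling step: the normalized scores and the word embeddings are combined with a single batched matrix product. A minimal shape-level sketch in plain PyTorch, where random tensors stand in for the outputs of the project's emb_layer and norm_attention modules (the concrete sizes are assumptions for illustration):

import torch

batch_size, sent_length, emb_dim = 2, 7, 50

# Stand-ins for emb_word and the already-normalized attention weights.
emb_word = torch.randn(batch_size, sent_length, emb_dim)
alpha_norm = torch.softmax(torch.randn(batch_size, sent_length, 1), dim=1)

# [batch_size, 1, sent_length] @ [batch_size, sent_length, emb_dim]
# -> [batch_size, 1, emb_dim]: one attention-weighted vector per sentence.
emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
print(emb_attsent.shape)  # torch.Size([2, 1, 50])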
Code Example #2
    def train(self):
        train_data = Txtfile(self.args.train_file, firstline=False, word2idx=self.word2idx, limit=self.args.sent_limit)
        model_filename = os.path.join(self.args.model_dir, self.args.model_file)
        max_epochs = self.args.max_epochs
        epoch_start = time.time()
        for epoch in range(1, max_epochs + 1):
            print("Epoch: %s/%s" % (epoch, max_epochs))
            train_loss = self.train_batch(train_data)

            print("UPDATES: - Train loss: %.4f" % train_loss)
            print("         - Save the model to %s at epoch %d" % (model_filename, epoch))
            # Convert model to CPU to avoid out of GPU memory
            self.model.to("cpu")
            torch.save(self.model.state_dict(), model_filename)
            self.model.to(self.device)

            epoch_finish = Timer.timeEst(epoch_start, epoch / max_epochs)
            print("\nINFO: - Trained time (Remained time for %d epochs): %s" % (max_epochs - epoch, epoch_finish))

            if self.args.decay_rate > 0:
                self.lr_decay(epoch)

        word_emb, enc_emb, dec_emb = self.model.get_embs()
        id2topic = {}
        for i in range(enc_emb.shape[0]):
            id2topic[i] = "topic_%d" % i

        Embeddings.save_embs(id2topic, dec_emb.transpose(), os.path.join(self.args.model_dir, self.args.dtopic_emb_file))
        Embeddings.save_embs(id2topic, enc_emb, os.path.join(self.args.model_dir, self.args.etopic_emb_file))
        Embeddings.save_embs(self.args.vocab.i2w, word_emb, os.path.join(self.args.model_dir, self.args.tuned_word_emb_file))
        return
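The model is moved to the CPU before torch.save and back afterwards so the checkpoint holds CPU tensors and serialization does not add GPU memory pressure. A standalone sketch of the same pattern; the tiny nn.Linear model and the checkpoint file name are assumptions, not part of the project:

import torch
import torch.nn as nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = nn.Linear(10, 3).to(device)

# Move to CPU so the saved state_dict contains CPU tensors, then move back.
model.to("cpu")
torch.save(model.state_dict(), "checkpoint.pt")
model.to(device)

# Later: rebuild the module and load the CPU checkpoint onto any device.
restored = nn.Linear(10, 3)
restored.load_state_dict(torch.load("checkpoint.pt", map_location="cpu"))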
Code Example #3
    def predict(self, rv):
        pro_rv = Txtfile.process_sent(rv)
        rv_id = self.word2idx(pro_rv)
        padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=self.args.vocab.w2i[PADt])
        inputs = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
        self.model.eval()
        with torch.no_grad():
            label_prob, label_pred = self.model.inference(inputs)
            return label_prob, label_pred
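Because a single review is encoded, the returned probability and prediction tensors have a leading batch dimension of 1. A small sketch of how a serving layer might post-process them into plain Python values; the random stand-in tensors and k = 5 are assumptions, not values from the project:

import torch

# Stand-ins for the [1, k] tensors returned by predict().
label_prob = torch.rand(1, 5)
label_pred = torch.randint(0, 50, (1, 5))

# Strip the batch dimension and convert to lists for a JSON-friendly response.
response = {
    "topics": label_pred.squeeze(0).tolist(),
    "probs": label_prob.squeeze(0).tolist(),
}
print(response)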
Code Example #4
    rec_type = "LSTM"
    ntoken = len(vocab.w2i)
    nlabels = len(vocab.l2i)
    emb_size = 50
    hidden_size = 64
    nlayers = 2
    dropout = 0.5
    bidirect = False

    #embedding_matrix=create_embedding_matrix(vocab,ntoken,emb_size)
    #print(embedding_matrix[5])
    #embedding = nn.Embedding.from_pretrained(embedding_matrix)
    #input = torch.LongTensor([1])
    #print(embedding(input))
    train_data = Txtfile(data_files[0],
                         firstline=False,
                         source2idx=word2idx,
                         label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches_with_label(train_data,
                                               batch_size=batch_size)
    inpdata = []
    outdata = []
    for doc, label in train_batch:
        doc_pad_ids, doc_lengths = seqPAD.pad_sequences(doc,
                                                        pad_tok=vocab.w2i[PAD])
        doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, device)
        doc_lengths_tensor = Data2tensor.idx2tensor(doc_lengths, device)
        label_tensor = Data2tensor.idx2tensor(label, device)
        inpdata.append(doc_tensor)
        outdata.append(label_tensor)
        break
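seqPAD.pad_sequences pads every document in a minibatch to a common length and also returns the original lengths, which can later be used for packing or masking in the LSTM. A generic sketch of that behavior in plain Python; this pad_sequences is an assumed re-implementation for illustration, not the project's own:

def pad_sequences(sequences, pad_tok=0):
    # Pad each sequence to the length of the longest one in the batch
    # and keep the original lengths for later packing/masking.
    max_len = max(len(seq) for seq in sequences)
    padded = [list(seq) + [pad_tok] * (max_len - len(seq)) for seq in sequences]
    lengths = [len(seq) for seq in sequences]
    return padded, lengths

doc_ids = [[4, 9, 2], [7, 1], [3, 8, 5, 6]]
padded, lengths = pad_sequences(doc_ids, pad_tok=0)
# padded  -> [[4, 9, 2, 0], [7, 1, 0, 0], [3, 8, 5, 6]]
# lengths -> [3, 2, 4]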
Code Example #5
            label_prob, label_pred = label_prob.data.topk(k)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile
    filename = "../data/train.txt"
    vocab = Vocab(wl_th=None, cutoff=2)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=4)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                          pad_tok=0,
                                                          wthres=1024)

    w_tensor = Data2tensor.idx2tensor(word_ids)
    y_tensor = Data2tensor.idx2tensor(labels)
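The decoding branch at the top of this example distinguishes a multi-class head (keep the k most probable labels) from a binary head (sigmoid plus a 0.5 threshold). A minimal sketch of both branches on random scores; the tensor shapes and k are assumptions for illustration:

import torch

label_score = torch.randn(4, 10)  # [batch_size, nlabels]

# Multi-class: softmax probabilities, then the k most likely labels.
k = 3
label_prob = torch.softmax(label_score, dim=-1)
label_prob, label_pred = label_prob.topk(k)

# Binary: one score per example, sigmoid, threshold at 0.5.
binary_score = torch.randn(4, 1)
binary_prob = torch.sigmoid(binary_score.squeeze())
binary_pred = (binary_prob >= 0.5).long()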
Code Example #6
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings
    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"

    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i,
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
    label = []
    for inp_ids in train_iters:
        padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
        data_tensor = Data2tensor.idx2tensor(padded_inp)
        # shuffle chunks
        perm_ids = torch.randperm(no_chunks)
        data_tensor = data_tensor[perm_ids]
        data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
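The last three lines implement the negative-sampling layout: batch_size * (neg_sampling + 1) padded reviews are shuffled and then regrouped so that each batch row holds neg_sampling + 1 reviews. A shape-level sketch with dummy word ids; the sizes follow the example, while the random integer data is an assumption:

import torch

batch_size, neg_sampling, sent_length = 8, 5, 12
no_chunks = batch_size * (neg_sampling + 1)

# Stand-in for the padded word-id matrix produced by seqPAD.pad_sequences.
data_tensor = torch.randint(0, 1000, (no_chunks, sent_length))

# Shuffle the chunks, then group them into neg_sampling + 1 reviews per row.
perm_ids = torch.randperm(no_chunks)
data_tensor = data_tensor[perm_ids].view(batch_size, neg_sampling + 1, -1)
print(data_tensor.shape)  # torch.Size([8, 6, 12])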