Example #1
def inference(model, rv, auxiliary_embs=None):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = topic_encoder.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id],
                                         pad_tok=margs.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long,
                                    topic_encoder.device)

    with torch.no_grad():
        model.eval()
        # inputs = [batch_size, sent_length]
        # auxiliary_embs = [batch_size, sent_length, aux_dim]
        emb_word = model.emb_layer(inputs, auxiliary_embs)
        # emb_word = [batch_size, sent_length, emb_dim]
        emb_sent = emb_word.mean(dim=1, keepdim=True)
        # emb_sent = [batch_size, 1, emb_dim]
        sent_length = emb_word.size(1)
        emb_sent_ex = emb_sent.expand(-1, sent_length, -1).contiguous()
        # emb_sent_ex = [batch_size, sent_length, emb_dim]
        alpha_score = model.attention(emb_word, emb_sent_ex)
        # alpha_score = [batch_size, sent_length, 1]
        alpha_norm = model.norm_attention(alpha_score)
        # alpha_norm = [batch_size, sent_length, 1]
        emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
        # alpha_norm.transpose(1, 2) = [batch_size, 1, sent_length]
        # bmm with emb_word = [batch_size, sent_length, emb_dim]
        # emb_attsent = [batch_size, 1, emb_dim]
        emb_topic = model.encoder(emb_attsent.squeeze(1))
        topic_class = model.norm_layer(emb_topic)
        # emb_topic = topic_class = [batch_size, nn_out_dim]
        label_prob, label_pred = topic_class.data.topk(topic_class.size(1))
        return label_prob, label_pred
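
A minimal, self-contained sketch of the attention pooling step above, using plain PyTorch with made-up sizes; the dot-product scoring and the softmax are stand-ins for model.attention and model.norm_attention, which are assumptions here.

import torch

batch_size, sent_length, emb_dim = 2, 7, 16
emb_word = torch.randn(batch_size, sent_length, emb_dim)

# mean-pooled sentence vector, broadcast back over the sentence
emb_sent = emb_word.mean(dim=1, keepdim=True)                  # [batch, 1, emb_dim]
emb_sent_ex = emb_sent.expand(-1, sent_length, -1)             # [batch, sent_length, emb_dim]

# stand-in attention score: dot product of each word with the sentence vector
alpha_score = (emb_word * emb_sent_ex).sum(-1, keepdim=True)   # [batch, sent_length, 1]
alpha_norm = torch.softmax(alpha_score, dim=1)                 # weights sum to 1 over words

# weighted sum over words:
# [batch, 1, sent_length] bmm [batch, sent_length, emb_dim] -> [batch, 1, emb_dim]
emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)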
Example #2
    def train_batch(self, train_data):
        clip_rate = self.args.clip
        chunk_size = self.args.batch_size * (self.args.neg_samples + 1)
        total_batch = self.args.vocab.nodocs // chunk_size
        prog = Progbar(target=total_batch)

        # set model to train mode
        train_loss = []
        self.model.train()

        for i, inp_ids in enumerate(self.args.vocab.minibatches(train_data, batch_size=chunk_size)):
            padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=self.args.vocab.w2i[PADt])
            data_tensor = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)

            # shuffle data_chunks
            perm_ids = torch.randperm(chunk_size)
            data_tensor = data_tensor[perm_ids]
            data_tensor = data_tensor.view(self.args.batch_size, self.args.neg_samples + 1, -1)
            # data_tensor = [batch_size, neg_samples + 1, word_length]
            inp_tensor = data_tensor[:, 0, :]
            noise_tensor = data_tensor[:, 1:, :]

            self.model.zero_grad()
            emb_sent, trans_sent, emb_noise = self.model(inp_tensor, noise_tensor)

            batch_loss = self.model.batchHingeLoss(emb_sent, trans_sent, emb_noise)
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
            self.optimizer.step()
            prog.update(i + 1, [("Train loss", batch_loss.item())])

        return np.mean(train_loss)
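
A stripped-down sketch of the same update pattern (zero_grad, backward, optional clip_grad_norm_, step) on a toy model; the linear model, MSE loss, and random data are placeholders, not the hinge loss used above.

import torch
import torch.nn as nn

model = nn.Linear(8, 1)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
clip_rate = 5.0

model.train()
for _ in range(3):
    x, y = torch.randn(4, 8), torch.randn(4, 1)

    model.zero_grad()
    loss = nn.functional.mse_loss(model(x), y)
    loss.backward()
    if clip_rate > 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip_rate)
    optimizer.step()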
Example #3
    def predict(self, rv):
        pro_rv = Txtfile.process_sent(rv)
        rv_id = self.word2idx(pro_rv)
        padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=self.args.vocab.w2i[PADt])
        inputs = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
        self.model.eval()
        with torch.no_grad():
            label_prob, label_pred = self.model.inference(inputs)
            return label_prob, label_pred
Example #4
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                          pad_tok=0,
                                                          wthres=1024,
                                                          cthres=32)

    w_tensor = Data2tensor.idx2tensor(word_ids)
    y_tensor = Data2tensor.idx2tensor(labels)

    data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
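
seqPAD.pad_sequences is project-specific; judging by how it is called here, it pads each id list to the longest one in the batch and also returns the original lengths. A hypothetical minimal version (ignoring the wthres/cthres truncation thresholds):

def pad_sequences_sketch(sequences, pad_tok=0):
    # pad every sequence to the length of the longest one
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    padded = [list(seq) + [pad_tok] * (max_len - len(seq)) for seq in sequences]
    return padded, lengths

padded, lengths = pad_sequences_sketch([[3, 5, 7], [2, 4]], pad_tok=0)
# padded  -> [[3, 5, 7], [2, 4, 0]]
# lengths -> [3, 2]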
Example #5
    #embedding = nn.Embedding.from_pretrained(embedding_matrix)
    #input = torch.LongTensor([1])
    #print(embedding(input))
    train_data = Txtfile(data_files[0],
                         firstline=False,
                         source2idx=word2idx,
                         label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches_with_label(train_data,
                                               batch_size=batch_size)
    inpdata = []
    outdata = []
    for doc, label in train_batch:
        doc_pad_ids, doc_lengths = seqPAD.pad_sequences(doc,
                                                        pad_tok=vocab.w2i[PAD])
        doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, device)
        doc_lengths_tensor = Data2tensor.idx2tensor(doc_lengths, device)
        label_tensor = Data2tensor.idx2tensor(label, device)
        inpdata.append(doc_tensor)
        outdata.append(label_tensor)
        break

    # model = RNNModule(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size, nlayers=nlayers,
    #                   dropout=dropout, bidirect=bidirect).to(device)
    # rec_output, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)
    #
    # model = UniLSTMModel(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size, nlayers=nlayers,
    #                      dropout=dropout, bidirect=False, nlabels=nlabels).to(device)
    # decoded_scores, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)
    model = BiLSTMModel(rec_type=rec_type,
                        ntokens=ntoken,
Example #6
        return label_score
    
if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file = '/media/data/NER/conll03/conll03/train.bmes'
    dev_file = '/media/data/NER/conll03/conll03/dev.bmes'
    test_file = '/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    
    
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
    
    w_tensor = Data2tensor.idx2tensor(word_ids)
    c_tensor = Data2tensor.idx2tensor(char_ids)
    y_tensor = Data2tensor.idx2tensor(label_ids)
    
    data_tensor = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, char_ids, word_lengths, volatile_flag=False)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensor
    
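
The *_recover tensors returned by sort_tensors suggest a sort-by-length plus its inverse permutation (the usual preparation for packed RNN inputs). The helper itself is project-specific, so the following plain-PyTorch sketch is only an assumption about what it does.

import torch

word_tensor = torch.tensor([[1, 2, 0], [3, 4, 5], [6, 0, 0]])
seq_lengths = torch.tensor([2, 3, 1])

# sort the batch by descending length
sorted_lengths, perm_idx = seq_lengths.sort(0, descending=True)
sorted_words = word_tensor[perm_idx]

# inverse permutation to restore the original batch order later
_, recover_idx = perm_idx.sort(0, descending=False)
assert torch.equal(sorted_words[recover_idx], word_tensor)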
Example #7
        batch_size, emb_dim = trans_sent.size()
        pred_score = torch.bmm(emb_noise,
                               trans_sent.view(batch_size, emb_dim,
                                               1)).squeeze(-1)
        # pred_score = [batch_size, sampling]
        distance = 1 + pred_score - y_score.view(-1, 1)
        abs_distance = torch.max(distance, torch.zeros_like(distance))
        ranking = abs_distance.sum(-1)
        reg = self.regularized()
        return ranking.mean() + reg


if __name__ == "__main__":
    import random
    from data_utils import Data2tensor, Vocab, seqPAD, Txtfile, PADt, Embeddings
    Data2tensor.set_randseed(1234)
    use_cuda = torch.cuda.is_available()
    filename = "/media/data/restaurants/yelp_dataset/processed/extracted_rev/yelp_data_rev.pro.txt"
    idf_file = "./idf.txt"

    vocab = Vocab(wl_th=None, wcutoff=5)
    vocab.build(filename, idf_file=idf_file, firstline=False, limit=100000)

    word2idx = vocab.wd2idx(vocab_words=vocab.w2i,
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)
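
The hinge loss at the top of this example is a max-margin ranking loss over the negative samples. A stand-alone sketch with random scores (the regularization term is omitted):

import torch

batch_size, neg_samples = 4, 5
y_score = torch.randn(batch_size)                    # score of the true sentence
pred_score = torch.randn(batch_size, neg_samples)    # scores of the noise sentences

# hinge: penalize negatives that come within a margin of 1 of the true score
distance = 1 + pred_score - y_score.view(-1, 1)
abs_distance = torch.clamp(distance, min=0)          # elementwise max(distance, 0)
loss = abs_distance.sum(-1).mean()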
Example #8
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile

    filename = "/media/data/aspectSA/train_v2.csv"
    vocab = Vocab(wl_th=None, cutoff=1)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename, firstline=False, word2idx=word2idx, tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=4)
    data = []
    asp_locs = []
    label_ids = []
    for words, asps, labels in train_iters:
        data.append(words)
        asp_locs.append(asps)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=1024)

    w_tensor = Data2tensor.idx2tensor(word_ids)
    asp_tensor = Data2tensor.idx2tensor(asps)
    idx_arrange = Data2tensor.idx2tensor(list(range(w_tensor.size(0))))
    # w_tensor[idx_arrange, asp_tensor]
    y_tensor = Data2tensor.idx2tensor(labels)

    data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
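
The commented w_tensor[idx_arrange, asp_tensor] line is ordinary advanced indexing: pick one aspect position per row of the batch. A minimal illustration (the positions here are made up):

import torch

w_tensor = torch.tensor([[10, 11, 12, 13],
                         [20, 21, 22, 23],
                         [30, 31, 32, 33]])
asp_tensor = torch.tensor([2, 0, 3])          # one aspect position per review
idx_arrange = torch.arange(w_tensor.size(0))

print(w_tensor[idx_arrange, asp_tensor])      # tensor([12, 20, 33])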
Example #9
"""
Created on 2019-01-07
@author: duytinvo
"""
import os
import time
import torch
import argparse
import numpy as np
import torch.optim as optim
from core_nns import Autoencoder
from other_utils import Progbar, Timer, SaveloadHP
from data_utils import Vocab, Data2tensor, Txtfile, seqPAD, Embeddings, PADt

Data2tensor.set_randseed(1234)


class Autoencoder_model(object):
    def __init__(self, args=None):

        self.args = args
        self.device = torch.device("cuda:0" if self.args.use_cuda else "cpu")

        self.word2idx = self.args.vocab.wd2idx(vocab_words=self.args.vocab.w2i, unk_words=True,
                                               se_words=self.args.start_end)

        word_HPs = [len(self.args.vocab.w2i), self.args.word_dim, self.args.word_pretrained, self.args.word_drop_rate,
                    self.args.word_zero_padding, self.args.grad_flag, self.args.word_nn_out_dim]

        self.model = Autoencoder(HPs=word_HPs, kmean_file=self.args.kmean_file).to(self.device)
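
Data2tensor.set_randseed is project-specific; it presumably wraps the usual seeding calls for reproducibility, roughly along these lines (an assumption, not the actual implementation):

import random
import numpy as np
import torch

def set_randseed_sketch(seed=1234):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)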
Example #10
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata = []
    outdata = []
    for sent in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
        for i in range(0, data_tensor.size(1)-1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
            outdata.append(target)
        break

    rnn_type = "GRU"
    ntoken = len(vocab.w2i)
    ninp = 32
    nhid = 64
    nlayers = 1
    dropout = 0.5
    tie_weights = False
    bidirect = False
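
vocab.bptt_batch used above is presumably the standard truncated-BPTT slicing for language-model training: the input is the chunk starting at position i, and the target is the same chunk shifted by one token. A sketch under that assumption, for a [batch_size, seq_len] tensor:

import torch

def bptt_batch_sketch(data_tensor, i, bptt):
    # input chunk and its next-token targets along the time dimension
    seq_len = min(bptt, data_tensor.size(1) - 1 - i)
    data = data_tensor[:, i:i + seq_len]
    target = data_tensor[:, i + 1:i + 1 + seq_len]
    return data, target

tokens = torch.arange(20).view(2, 10)        # toy batch of 2 sequences of length 10
for i in range(0, tokens.size(1) - 1, 4):
    data, target = bptt_batch_sketch(tokens, i, 4)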