Example #1
def inference(model, rv, auxiliary_embs=None):
    pro_rv = Txtfile.process_sent(rv)
    rv_id = topic_encoder.word2idx(pro_rv)
    padded_inp, _ = seqPAD.pad_sequences([rv_id],
                                         pad_tok=margs.vocab.w2i[PADt])
    inputs = Data2tensor.idx2tensor(padded_inp, torch.long,
                                    topic_encoder.device)

    with torch.no_grad():
        model.eval()
        # inputs = [batch_size, sent_length]
        # auxiliary_embs = [batch_size, sent_length, aux_dim]
        emb_word = model.emb_layer(inputs, auxiliary_embs)
        # emb_word = [batch_size, sent_length, emb_dim]
        emb_sent = emb_word.mean(dim=1, keepdim=True)
        # emb_sent = [batch_size, 1, emb_dim]
        sent_length = emb_word.size(1)
        emb_sent_ex = emb_sent.expand(-1, sent_length, -1).contiguous()
        # emb_sent_ex = [batch_size, sent_length, emb_dim]
        alpha_score = model.attention(emb_word, emb_sent_ex)
        # alpha_score = [batch_size, sent_length, 1]
        alpha_norm = model.norm_attention(alpha_score)
        # alpha_norm = [batch_size, sent_length, 1]
        emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
        # emb_attsent = [batch_size, 1, emb_dim]
        # (alpha_norm.transpose(1, 2) = [batch_size, 1, sent_length] bmm emb_word = [batch_size, sent_length, emb_dim])
        emb_topic = model.encoder(emb_attsent.squeeze(1))
        topic_class = model.norm_layer(emb_topic)
        # emb_topic = topic_class = [batch_size, nn_out_dim]
        label_prob, label_pred = topic_class.data.topk(topic_class.size(1))
        return label_prob, label_pred
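
The attention pooling above reduces to a single batched matrix multiplication; a minimal, self-contained shape check with random stand-in tensors (dimension sizes are illustrative only):

import torch

# random stand-ins for the model's word embeddings and normalized attention weights
batch_size, sent_length, emb_dim = 2, 7, 100
emb_word = torch.randn(batch_size, sent_length, emb_dim)
alpha_norm = torch.softmax(torch.randn(batch_size, sent_length, 1), dim=1)

# [batch_size, 1, sent_length] bmm [batch_size, sent_length, emb_dim]
emb_attsent = torch.bmm(alpha_norm.transpose(1, 2), emb_word)
print(emb_attsent.shape)  # torch.Size([2, 1, 100])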
Example #2
    def train_batch(self, train_data):
        clip_rate = self.args.clip
        chunk_size = self.args.batch_size * (self.args.neg_samples + 1)
        total_batch = self.args.vocab.nodocs // chunk_size
        prog = Progbar(target=total_batch)

        # set the model in training mode
        train_loss = []
        self.model.train()

        for i, inp_ids in enumerate(self.args.vocab.minibatches(train_data, batch_size=chunk_size)):
            padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=self.args.vocab.w2i[PADt])
            data_tensor = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)

            # shuffle data_chunks
            perm_ids = torch.randperm(chunk_size)
            data_tensor = data_tensor[perm_ids]
            data_tensor = data_tensor.view(self.args.batch_size, self.args.neg_samples + 1, -1)
            # data_tensor = [batch_size, 1 + neg_samples, word_length]
            inp_tensor = data_tensor[:, 0, :]
            noise_tensor = data_tensor[:, 1:, :]

            self.model.zero_grad()
            emb_sent, trans_sent, emb_noise = self.model(inp_tensor, noise_tensor)

            batch_loss = self.model.batchHingeLoss(emb_sent, trans_sent, emb_noise)
            train_loss.append(batch_loss.item())

            batch_loss.backward()
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
            self.optimizer.step()
            prog.update(i + 1, [("Train loss", batch_loss.item())])

        return np.mean(train_loss)
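
batchHingeLoss itself is not shown in this snippet; the self-contained sketch below only illustrates the kind of max-margin objective such a trainer typically optimizes (the cosine scoring and the margin of 1.0 are assumptions for illustration):

import torch
import torch.nn.functional as F

batch_size, neg_samples, emb_dim = 4, 5, 100
emb_sent = torch.randn(batch_size, emb_dim)                # encoded input document
trans_sent = torch.randn(batch_size, emb_dim)              # transformed/reconstructed document
emb_noise = torch.randn(batch_size, neg_samples, emb_dim)  # negative (noise) documents

pos_score = F.cosine_similarity(trans_sent, emb_sent, dim=-1)                # [batch_size]
neg_score = F.cosine_similarity(trans_sent.unsqueeze(1), emb_noise, dim=-1)  # [batch_size, neg_samples]
hinge = torch.clamp(1.0 - pos_score.unsqueeze(1) + neg_score, min=0.0)       # margin of 1.0 (assumed)
loss = hinge.sum(dim=1).mean()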
Example #3
    def predict(self, rv):
        pro_rv = Txtfile.process_sent(rv)
        rv_id = self.word2idx(pro_rv)
        padded_inp, _ = seqPAD.pad_sequences([rv_id], pad_tok=self.args.vocab.w2i[PADt])
        inputs = Data2tensor.idx2tensor(padded_inp, torch.long, self.device)
        self.model.eval()
        with torch.no_grad():
            label_prob, label_pred = self.model.inference(inputs)
            return label_prob, label_pred
Example #4
        return label_prob, label_pred


if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, Csvfile
    filename = "/media/data/langID/small_scale/train.csv"
    vocab = Vocab(cl_th=None, cutoff=1, c_lower=False, c_norm=False)
    vocab.build([filename], firstline=False)
    word2idx = vocab.wd2idx(vocab.c2i)
    tag2idx = vocab.tag2idx(vocab.l2i)
    train_data = Csvfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         tag2idx=tag2idx)

    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data = []
    label_ids = []
    for words, labels in train_iters:
        data.append(words)
        label_ids.append(labels)
        word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                          pad_tok=0,
                                                          wthres=1024,
                                                          cthres=32)

    w_tensor = Data2tensor.idx2tensor(word_ids)
    y_tensor = Data2tensor.idx2tensor(labels)

    data_tensors = Data2tensor.sort_tensors(labels, word_ids, sequence_lengths)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
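
These test blocks all lean on seqPAD.pad_sequences; the stand-in below sketches what such a helper typically returns (padded id lists plus the original lengths) and is an assumption for illustration, not the project's implementation:

def pad_sequences_sketch(sequences, pad_tok=0):
    # pad every sequence to the length of the longest one and keep the true lengths
    lengths = [len(seq) for seq in sequences]
    max_len = max(lengths)
    padded = [seq + [pad_tok] * (max_len - len(seq)) for seq in sequences]
    return padded, lengths

padded, lengths = pad_sequences_sketch([[4, 8, 15], [16, 23], [42]])
# padded  -> [[4, 8, 15], [16, 23, 0], [42, 0, 0]]
# lengths -> [3, 2, 1]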
Example #5
    #embedding_matrix=create_embedding_matrix(vocab,ntoken,emb_size)
    #print(embedding_matrix[5])
    #embedding = nn.Embedding.from_pretrained(embedding_matrix)
    #input = torch.LongTensor([1])
    #print(embedding(input))
    train_data = Txtfile(data_files[0],
                         firstline=False,
                         source2idx=word2idx,
                         label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches_with_label(train_data,
                                               batch_size=batch_size)
    inpdata = []
    outdata = []
    for doc, label in train_batch:
        doc_pad_ids, doc_lengths = seqPAD.pad_sequences(doc,
                                                        pad_tok=vocab.w2i[PAD])
        doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, device)
        doc_lengths_tensor = Data2tensor.idx2tensor(doc_lengths, device)
        label_tensor = Data2tensor.idx2tensor(label, device)
        inpdata.append(doc_tensor)
        outdata.append(label_tensor)
        break

    # model = RNNModule(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size, nlayers=nlayers,
    #                   dropout=dropout, bidirect=bidirect).to(device)
    # rec_output, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)
    #
    # model = UniLSTMModel(rec_type=rec_type, ntokens=ntoken, emb_size=emb_size, hidden_size=hidden_size, nlayers=nlayers,
    #                      dropout=dropout, bidirect=False, nlabels=nlabels).to(device)
    # decoded_scores, rec_hidden, rec_output = model(input_tensor, input_lens_tensor)
    model = BiLSTMModel(rec_type=rec_type,
Example #6
        return label_score
    
if __name__ == "__main__":
    from data_utils import Data2tensor, Vocab, seqPAD, CoNLLDataset
    train_file='/media/data/NER/conll03/conll03/train.bmes'
    dev_file='/media/data/NER/conll03/conll03/dev.bmes'
    test_file='/media/data/NER/conll03/conll03/test.bmes'
    vocab = Vocab(cutoff=1, wl_th=None, cl_th=None, w_lower=False, w_norm=False, c_lower=False, c_norm=False)
    vocab.build([train_file, dev_file, test_file])
    
    
    word2idx = vocab.wd2idx(vocab_words=vocab.w2i, vocab_chars=vocab.c2i, allow_unk=True, start_end=True)
    tag2idx = vocab.tag2idx(vocab_tags=vocab.l2i, start_end=True)
    train_data = CoNLLDataset(train_file, word2idx=word2idx, tag2idx=tag2idx)
    train_iters = Vocab.minibatches(train_data, batch_size=10)
    data=[]
    label_ids = []
    for words, labels in train_iters:
        char_ids, word_ids = zip(*words)
        data.append(words)
        word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids, pad_tok=0, wthres=1024, cthres=32)
        char_ids, word_lengths = seqPAD.pad_sequences(char_ids, pad_tok=0, nlevels=2, wthres=1024, cthres=32)
        label_ids, label_lengths = seqPAD.pad_sequences(labels, pad_tok=0, wthres=1024, cthres=32)
    
    w_tensor=Data2tensor.idx2tensor(word_ids)
    c_tensor=Data2tensor.idx2tensor(char_ids)
    y_tensor=Data2tensor.idx2tensor(label_ids)
    
    data_tensor = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, char_ids, word_lengths, volatile_flag=False)
    label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensor
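
The nlevels=2 call pads at two levels, characters within each word and words within each sentence; the hypothetical helper below sketches that assumed behavior and is not seqPAD's actual code:

def pad_chars_sketch(char_ids_batch, pad_tok=0):
    # pad characters to the longest word, then pad each sentence to the longest word count
    max_words = max(len(sent) for sent in char_ids_batch)
    max_chars = max(len(word) for sent in char_ids_batch for word in sent)
    padded, word_lengths = [], []
    for sent in char_ids_batch:
        sent_pad = [word + [pad_tok] * (max_chars - len(word)) for word in sent]
        sent_pad += [[pad_tok] * max_chars] * (max_words - len(sent))
        padded.append(sent_pad)
        word_lengths.append([len(word) for word in sent] + [0] * (max_words - len(sent)))
    return padded, word_lengths

chars, wlens = pad_chars_sketch([[[3, 4], [5]], [[6, 7, 8]]])
# chars -> [[[3, 4, 0], [5, 0, 0]], [[6, 7, 8], [0, 0, 0]]]
# wlens -> [[2, 1], [3, 0]]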
    
Example #7
    word2idx = vocab.wd2idx(vocab.w2i,  # first argument assumed; the snippet is truncated above
                            unk_words=True,
                            se_words=False)

    train_data = Txtfile(filename,
                         firstline=False,
                         word2idx=word2idx,
                         limit=100000)

    batch_size = 8
    neg_sampling = 5
    no_chunks = batch_size * (neg_sampling + 1)
    train_iters = Vocab.minibatches(train_data, batch_size=no_chunks)
    data = []
    label = []
    for inp_ids in train_iters:
        padded_inp, _ = seqPAD.pad_sequences(inp_ids, pad_tok=vocab.w2i[PADt])
        data_tensor = Data2tensor.idx2tensor(padded_inp)
        # shuffle chunks
        perm_ids = torch.randperm(no_chunks)
        data_tensor = data_tensor[perm_ids]
        data_tensor = data_tensor.view(batch_size, neg_sampling + 1, -1)
        inp_tensor = data_tensor[:, 0, :]
        noise_tensor = data_tensor[:, 1:, :]
        break

    emb_size = len(vocab.w2i)
    emb_dim = 100
    pre_embs = None
    emb_drop_rate = 0.5
    emb_zero_padding = False
    grad_flag = True
Example #8
    bptt = 10

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    data_files = ["../dataset/train.txt"]
    vocab = Vocab(wl_th=wl_th, cutoff=cutoff)
    vocab.build(data_files, firstline=False)
    word2idx = vocab.wd2idx(vocab.w2i)
    label2idx = vocab.tag2idx(vocab.l2i)

    train_data = Txtfile(data_files[0], firstline=False, source2idx=word2idx, label2idx=label2idx)
    # train_data = [sent[0] for sent in train_data]
    train_batch = vocab.minibatches(train_data, batch_size=batch_size)
    inpdata=[]
    outdata=[]
    for sent in train_batch:
        word_pad_ids, seq_lens = seqPAD.pad_sequences(sent, pad_tok=vocab.w2i[PAD])
        data_tensor = Data2tensor.idx2tensor(word_pad_ids)
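        # step through the padded batch in bptt-sized windows along the time axis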
        for i in range(0, data_tensor.size(1)-1, bptt):
            data, target = vocab.bptt_batch(data_tensor, i, bptt)
            inpdata.append(data)
            outdata.append(target)
        break

    rnn_type = "GRU"
    ntoken = len(vocab.w2i)
    ninp = 32
    nhid = 64
    nlayers = 1
    dropout = 0.5
    tie_weights = False
    bidirect = False
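
vocab.bptt_batch is not shown in these snippets; the hypothetical helper below sketches the standard language-model slicing it presumably performs, returning an input window and the same window shifted by one token:

import torch

def bptt_batch_sketch(data_tensor, i, bptt):
    # take up to bptt columns starting at i as input, and the next-token shift as target
    seq_len = min(bptt, data_tensor.size(1) - 1 - i)
    data = data_tensor[:, i:i + seq_len]
    target = data_tensor[:, i + 1:i + 1 + seq_len]
    return data, target

tokens = torch.arange(20).view(2, 10)   # [batch_size, word_length]
data, target = bptt_batch_sketch(tokens, 0, 4)
# data[0]   -> tensor([0, 1, 2, 3])
# target[0] -> tensor([1, 2, 3, 4])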