Example #1
    def predict(self, sent, asp="null", k=1):
        """

        :param sent: processed sentence
        :param asp: an aspect mentioned inside sent
        :param k: int
        :return: top k predictions
        """
        wl = self.args.vocab.wl
        ## set model in eval mode
        self.model.eval()

        fake_label = [0]
        words, asp_loc = self.word2idx(sent, asp)
        word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                          pad_tok=0,
                                                          wthres=wl)

        data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc],
                                                word_ids, sequence_lengths,
                                                self.device)
        fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
        arange_tensor = Data2tensor.idx2tensor(
            list(range(word_tensor.size(0))), self.device)

        label_score = self.model(word_tensor, sequence_lengths, aspect_tensor,
                                 arange_tensor)
        label_prob, label_pred = self.model.inference(label_score, k)
        return label_prob, label_pred
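
A minimal usage sketch for the method above (not from the source): it assumes a trained classifier object `clf` exposing this `predict`, and that `sent` is already preprocessed the way `word2idx` expects.

    sent = "the battery life is great but the screen is dim"
    # top-3 sentiment predictions for the "battery" aspect; tensors of probabilities and label indices
    label_prob, label_pred = clf.predict(sent, asp="battery", k=3)
    print(label_prob, label_pred)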
Example #2
    def evaluate_batch(self, eva_data):
        with torch.no_grad():
            wl = self.args.vocab.wl
            batch_size = self.args.batch_size
            ## set model in eval mode
            self.model.eval()
            start = time.time()
            y_true = Data2tensor.idx2tensor([], self.device)
            y_pred = Data2tensor.idx2tensor([], self.device)
            for i, (words, label_ids) in enumerate(
                    self.args.vocab.minibatches(eva_data,
                                                batch_size=batch_size)):
                word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                                  pad_tok=0,
                                                                  wthres=wl)

                data_tensors = Data2tensor.sort_tensors(
                    label_ids, word_ids, sequence_lengths, self.device)
                label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

                y_true = torch.cat([y_true, label_tensor])
                label_score = self.model(word_tensor, sequence_lengths)
                label_prob, label_pred = self.model.inference(label_score, k=1)

                y_pred = torch.cat([y_pred, label_pred])
            #measures = Classifier.class_metrics(y_true, y_pred.squeeze())
            measures = Classifier.class_metrics(
                y_true.data.cpu().numpy(),
                y_pred.squeeze().data.cpu().numpy())

            end = time.time() - start
            speed = len(y_true) / end
        return measures, speed
Example #3
 def predict(self, doc="", topk=5):
     """
     Inputs:
         doc: a document
         topk: number of recommended tokens
     Outputs:
         A list form of predicted labels and their probabilities
             e.g., [('5_star', 0.2020701915025711),
                  ('3_star', 0.2010505348443985),
                  ('2_star', 0.2006799429655075),
                  ('1_star', 0.1990940123796463),
                  ('4_star', 0.1971053034067154)]
     """
     doc_ids = self.word2idx(doc.split())
     #######################
     # YOUR CODE STARTS HERE
     pred_lb, pred_probs = None, None
     #convert to tensor
     doc_tensor = Data2tensor.idx2tensor(doc_ids)
     doc_lengths_tensor = Data2tensor.idx2tensor(len(doc_tensor))
     #call the model
     output, _, _ = self.model(doc_tensor.unsqueeze(0),
                               doc_lengths_tensor.unsqueeze(0))
     # get the probabilities and the predicted label indices
     pred_probs, pred_lb = self.model.inference(output, topk)
     # flatten to Python lists so each element can be used as an index into args.vocab.i2l
     # (a tensor value cannot be used directly as a position in the label list)
     pred_probs = pred_probs.flatten().tolist()
     pred_lb = pred_lb.flatten().tolist()
     #get label information for the predicted output
     pred_lb = [self.args.vocab.i2l[x] for x in pred_lb]
     # YOUR CODE ENDS HERE
     #######################
     return list(zip(pred_lb, pred_probs))
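
A short, hypothetical call of the document-level `predict` above; `sentiment_model` and the review text are illustrative only, and the output format follows the docstring.

    review = "the room was clean and the staff were friendly"
    # returns a list of (label, probability) pairs, e.g. [('5_star', 0.41), ('4_star', 0.33), ...]
    for label, prob in sentiment_model.predict(review, topk=3):
        print(label, prob)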
Example #4
    def train_batch(self, train_data):
        wl = self.args.vocab.wl
        clip_rate = self.args.clip
        
        batch_size = self.args.batch_size
        num_train = len(train_data)
        total_batch = num_train//batch_size+1
        prog = Progbar(target=total_batch)
        ## set model in train mode
        self.model.train()
        train_loss = []
        for i, (words, label_ids) in enumerate(self.args.vocab.minibatches(train_data, batch_size=batch_size)):
            word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=wl)

            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, self.device)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors

            self.model.zero_grad()
            label_score = self.model(word_tensor, sequence_lengths)
            # print("inside training batch, ", label_score.size(), label_tensor.size(), label_score, label_tensor)
            batch_loss = self.model.NLL_loss(label_score, label_tensor)
            train_loss.append(batch_loss.item())
            
            batch_loss.backward()
            
            if clip_rate>0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
                
            self.optimizer.step()
            
            prog.update(i + 1, [("Train loss", batch_loss.item())])
        return np.mean(train_loss)
Example #5
    def evaluate_batch(self, eval_data):
        start_time = time.time()
        eval_batch = self.args.vocab.minibatches(
            eval_data, batch_size=self.args.batch_size)
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0.
        total_word = 0
        with torch.no_grad():
            for seq_batch in eval_batch:
                word_pad_ids, seq_lens = seqPAD.pad_sequences(
                    seq_batch, pad_tok=self.args.vocab.w2i[PAD])
                seq_tensor = Data2tensor.idx2tensor(word_pad_ids, self.device)
                hidden = self.model.init_hidden(seq_tensor.size(0))
                for i in range(0, seq_tensor.size(1) - 1, self.args.bptt):
                    data, target = self.bptt_batch(seq_tensor, i)
                    mask_target = target > 0
                    output, hidden = self.model(data, hidden)
                    batch_loss = self.model.NLL_loss(output, target)
                    total_loss += batch_loss.item()
                    hidden = self.repackage_hidden(hidden)
                    total_word = total_word + mask_target.sum().item()

        cur_loss = total_loss / total_word
        elapsed = time.time() - start_time
        print('-' * 89)
        print('| EVALUATION | words {:5d} | lr {:02.2f} | words/s {:5.2f} | '
              'loss {:5.2f} | ppl {:8.2f}'.format(total_word, self.args.lr,
                                                  total_word / elapsed,
                                                  cur_loss,
                                                  math.exp(cur_loss)))
        print('-' * 89)
        return cur_loss, total_word, elapsed
Example #6
    def recommend(self, context="", topk=5):
        """
        Inputs:
            context: the text form of given context
            topk: number of recommended tokens
        Outputs:
            A list form of recommended words and their probabilities
                e.g., [('i', 0.044447630643844604),
                     ('it', 0.027285737916827202),
                     ("don't", 0.026111900806427002),
                     ('will', 0.023868300020694733),
                     ('had', 0.02248169668018818)]
        """
        rec_wds, rec_probs = [], []
        #######################
        # YOUR CODE STARTS HERE
        # get the index-to-word list from the vocabulary
        data = self.args.vocab.i2w
        # split the given context into tokens and map each token to its vocabulary index
        idx = []
        for word in context.split():
            for j in range(len(data)):
                # if the word is present in the vocabulary, record its index position
                if word == data[j]:
                    idx.append(j)

        # convert the index list to a tensor
        idx = Data2tensor.idx2tensor(idx)
        batch_size = 1
        hidden = self.model.init_hidden(batch_size)
        output, hidden = self.model.forward(idx.reshape(1, -1), hidden)
        # get the top-k candidate words and their probabilities at the last position
        p, l = self.model.inference(output, topk)
        prob = list(p[0][-1])
        label = list(l[0][-1])
        # map each predicted index back to the word stored at that position in the vocabulary
        rec_wds += [data[k.item()] for k in label]
        # .item() turns each 0-dim probability tensor into a plain Python number
        rec_probs += [k.item() for k in prob]
        # YOUR CODE ENDS HERE
        #######################
        return list(zip(rec_wds, rec_probs))
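
A rough usage sketch for `recommend` (the `lm` object and the context string are assumptions, not from the source): given a partial sentence, it returns the top-k candidate next words with their probabilities.

    context = "i think the food"
    for word, prob in lm.recommend(context, topk=5):
        print("{}\t{:.4f}".format(word, prob))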
Example #7
    def evaluate_batch(self, eval_data):
        start_time = time.time()
        eval_batch = self.args.vocab.minibatches_with_label(
            eval_data, batch_size=self.args.batch_size)
        # Turn on evaluation mode which disables dropout.
        self.model.eval()
        total_loss = 0.
        total_docs = 0
        y_true, y_pred = [], []
        with torch.no_grad():
            for doc_batch, lb_batch in eval_batch:
                doc_pad_ids, doc_lengths = seqPAD.pad_sequences(
                    doc_batch, pad_tok=self.args.vocab.w2i[PAD])
                #######################
                # YOUR CODE STARTS HERE
                doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, self.device)
                doc_lengths_tensor = Data2tensor.idx2tensor(
                    doc_lengths, self.device)
                lb_tensor = Data2tensor.idx2tensor(lb_batch, self.device)
                total_docs += doc_tensor.size(0)
                output, _, _ = self.model(doc_tensor, doc_lengths_tensor)
                loss = self.model.NLL_loss(output, lb_tensor)
                label_prob, label_pred = self.model.inference(output, k=1)
                #print("shape label_tensor",lb_tensor.shape)
                #print("shape label_pred",label_pred.squeeze(1).shape)
                y_true.extend(lb_tensor)
                y_pred.extend(label_pred.squeeze(1))
                total_loss += loss.item()
                # YOUR CODE ENDS HERE
                #######################

        precision, recall, f1, acc = Sentimentmodel.cal_metrics(y_true, y_pred)
        cur_loss = total_loss / total_docs
        elapsed = time.time() - start_time
        metrics = {
            "precision": precision * 100,
            "recall": recall * 100,
            "f1": f1 * 100,
            "acc": acc * 100,
            "loss": cur_loss
        }
        return metrics, total_docs, elapsed
Example #8
    def train_batch(self, train_data):
        total_loss = 0.
        total_docs = 0
        start_time = time.time()
        train_batch = self.args.vocab.minibatches_with_label(
            train_data, batch_size=self.args.batch_size)
        # Turn on training mode which enables dropout.
        self.model.train()
        for batch, (doc_batch, lb_batch) in enumerate(train_batch):
            doc_pad_ids, doc_lengths = seqPAD.pad_sequences(
                doc_batch, pad_tok=self.args.vocab.w2i[PAD])
            doc_tensor = Data2tensor.idx2tensor(doc_pad_ids, self.device)
            doc_lengths_tensor = Data2tensor.idx2tensor(
                doc_lengths, self.device)
            lb_tensor = Data2tensor.idx2tensor(lb_batch, self.device)
            # doc_tensor = [batch_size, max_doc_length]
            total_docs += doc_tensor.size(0)

            self.model.zero_grad()
            output, _, _ = self.model(doc_tensor, doc_lengths_tensor)
            loss = self.model.NLL_loss(output, lb_tensor)
            avg_loss = loss / doc_tensor.size(0)
            avg_loss.backward()

            # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
            torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                           self.args.clip)

            # update parameters in all sub-graphs
            self.model_optimizer.step()
            # for p in self.model.parameters():
            #     p.data.add_(p.grad.data, alpha=-self.args.lr)

            total_loss += loss.item()

        cur_loss = total_loss / total_docs
        elapsed = time.time() - start_time
        # print('-' * 89)
        # print('| TRAINING | epoch {:3d} | documents {:5d} | lr {:02.2f} | documents/s {:5.2f} | '
        #       'loss {:5.2f}'.format(epoch, total_docs, self.args.lr, total_docs / elapsed, cur_loss))
        # print('-' * 89)
        return cur_loss, total_docs, elapsed
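
The train/evaluate methods above are normally driven by an outer epoch loop. The sketch below is only an assumption about how such a loop could look for this class; `trainer`, `train_data`, `dev_data`, `args.max_epochs`, and the checkpoint path are hypothetical names.

    best_f1 = 0.0
    for epoch in range(1, args.max_epochs + 1):
        train_loss, n_train_docs, train_time = trainer.train_batch(train_data)
        metrics, n_dev_docs, eval_time = trainer.evaluate_batch(dev_data)
        print("epoch %d | train loss %.4f | dev f1 %.2f" % (epoch, train_loss, metrics["f1"]))
        if metrics["f1"] > best_f1:
            # keep the best model on the dev set (path is illustrative)
            best_f1 = metrics["f1"]
            torch.save(trainer.model.state_dict(), "best_model.pt")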
Example #9
    def train_batch(self, train_data):
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl
        clip_rate = self.args.clip

        batch_size = self.args.batch_size
        num_train = len(train_data)
        total_batch = num_train // batch_size + 1
        prog = Progbar(target=total_batch)
        ## set model in train mode
        self.model.train()
        train_loss = []
        for i, (words, label_ids) in enumerate(
                self.args.vocab.minibatches(train_data,
                                            batch_size=batch_size)):
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids,
                                                              pad_tok=0,
                                                              wthres=wl,
                                                              cthres=cl)
            char_ids, word_lengths = seqPAD.pad_sequences(char_ids,
                                                          pad_tok=0,
                                                          nlevels=2,
                                                          wthres=wl,
                                                          cthres=cl)
            label_ids, _ = seqPAD.pad_sequences(label_ids,
                                                pad_tok=0,
                                                wthres=wl,
                                                cthres=cl)

            data_tensors = Data2tensor.sort_tensors(label_ids, word_ids,
                                                    sequence_lengths, char_ids,
                                                    word_lengths)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
            mask_tensor = word_tensor > 0

            label_score = self.model(word_tensor, sequence_lengths,
                                     char_tensor, word_lengths,
                                     char_seq_recover)

            batch_loss = self.model.NLL_loss(label_score, mask_tensor,
                                             label_tensor)

            train_loss.append(batch_loss.item())
            self.model.zero_grad()
            batch_loss.backward()
            if clip_rate > 0:
                torch.nn.utils.clip_grad_norm_(self.model.parameters(), clip_rate)
            self.optimizer.step()

            prog.update(i + 1, [("Train loss", batch_loss.item())])
        return np.mean(train_loss)
Example #10
def predict_null(classifier, sent, asp, i2l):
    from utils.data_utils import Data2tensor, seqPAD
    wl = classifier.args.vocab.wl
    ## set model in eval mode
    classifier.model.eval()
    fake_label = [0]
    words, asp_loc = classifier.word2idx(sent, asp)
    word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                      pad_tok=0,
                                                      wthres=wl)
    data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc], word_ids,
                                            sequence_lengths,
                                            classifier.device)
    fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
    arange_tensor = Data2tensor.idx2tensor(list(range(word_tensor.size(0))),
                                           classifier.device)
    word_h_n = classifier.model.rnn.get_all_hiddens(word_tensor,
                                                    sequence_lengths).mean(1)
    label_score = classifier.model.hidden2tag(word_h_n)
    label_score = classifier.model.dropfinal(label_score)
    label_prob, label_pred = classifier.model.inference(label_score, len(i2l))
    return label_prob, label_pred
Example #11
 def predict(self, sent, k=1):
     cl = self.args.vocab.cl
     ## set model in eval mode
     self.model.eval()
     
     fake_label = [0]        
     words = self.word2idx(sent)
     word_ids, sequence_lengths = seqPAD.pad_sequences([words], pad_tok=0, wthres=cl)
 
     data_tensors = Data2tensor.sort_tensors(fake_label, word_ids, sequence_lengths, volatile_flag=True)
     fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
     label_score = self.model(word_tensor, sequence_lengths)
     label_prob, label_pred = self.model.inference(label_score, k)
     return label_prob, label_pred 
Example #12
    def evaluate_batch(self, eva_data):
        with torch.no_grad():
            wl = self.args.vocab.wl
            batch_size = self.args.batch_size
            ## set model in eval mode
            self.model.eval()
            start = time.time()
            y_true = Data2tensor.idx2tensor([], self.device)
            y_pred = Data2tensor.idx2tensor([], self.device)
            for i, (words, asp_locs, label_ids) in enumerate(
                    self.args.vocab.minibatches(eva_data,
                                                batch_size=batch_size)):
                word_ids, sequence_lengths = seqPAD.pad_sequences(words,
                                                                  pad_tok=0,
                                                                  wthres=wl)

                data_tensors = Data2tensor.sort_tensors(
                    label_ids, asp_locs, word_ids, sequence_lengths,
                    self.device)
                label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
                arange_tensor = Data2tensor.idx2tensor(
                    list(range(word_tensor.size(0))), self.device)

                y_true = torch.cat([y_true, label_tensor])
                label_score = self.model(word_tensor, sequence_lengths,
                                         aspect_tensor, arange_tensor)
                label_prob, label_pred = self.model.inference(label_score, k=1)

                y_pred = torch.cat([y_pred, label_pred])
#            acc = metrics.accuracy_score(y_true, y_pred)
#            print(y_pred.size())
#            print(y_true.size())
            measures = Classifier.class_metrics(y_true, y_pred.squeeze())
            end = time.time() - start
            speed = len(y_true) / end
#        print("Gradient flag: ", label_score.requires_grad)
        return measures, speed
Example #13
    def train_batch(self, train_data, epoch=0):
        total_loss = 0.
        total_word = 0
        total_seq = 0
        start_time = time.time()
        train_batch = self.args.vocab.minibatches(
            train_data, batch_size=self.args.batch_size)
        # Turn on training mode which enables dropout.
        self.model.train()
        for batch, seq_batch in enumerate(train_batch):
            word_pad_ids, seq_lens = seqPAD.pad_sequences(
                seq_batch, pad_tok=self.args.vocab.w2i[PAD])
            seq_tensor = Data2tensor.idx2tensor(word_pad_ids, self.device)
            # seq_tensor = [batch_size, seq_len]
            total_seq += seq_tensor.size(0)
            hidden = self.model.init_hidden(seq_tensor.size(0))
            for i in range(0, seq_tensor.size(1) - 1, self.args.bptt):
                # data = [batch_size, bptt]
                # target = [batch_size, bptt]
                data, target = self.bptt_batch(seq_tensor, i)
                mask_target = target > 0
                # Starting each batch, we detach the hidden state from how it was previously produced.
                # If we didn't, the model would try backpropagating all the way to start of the dataset.
                hidden = self.repackage_hidden(hidden)
                self.model.zero_grad()
                output, hidden = self.model(data, hidden)
                loss = self.model.NLL_loss(output, target)
                loss.backward()

                # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.args.clip)
                for p in self.model.parameters():
                    p.data.add_(p.grad.data, alpha=-self.args.lr)

                total_loss += loss.item()
                total_word = total_word + mask_target.sum().item()

            cur_loss = total_loss / total_word
            elapsed = time.time() - start_time
            print('-' * 89)
            print(
                '| TRAINING | epoch {:3d} | batch {:5d} | sequences {:5d} | words {:5d} | lr {:02.2f} | '
                'words/s {:5.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                    epoch, batch + 1, total_seq, total_word, self.args.lr,
                    total_word / elapsed, cur_loss, math.exp(cur_loss)))
            print('-' * 89)
Example #14
def wd_pred(model, vocab, sentence):
    """ Predict next word
    """
    with torch.no_grad():
        words = sentence.split(' ')
        for i, word in enumerate(words):
            # transform word to tensor
            word_idx = vocab.w2i[word]
            word_tensor = Data2tensor.idx2tensor([[word_idx]])
            if i == 0:
                hidden = model.init_hidden(word_tensor.size(0))
            output, hidden = model(word_tensor, hidden)

    label_prob, label_pred = model.inference(output)
    word_idx = label_pred.data[0][0].data.numpy()[0]

    return vocab.i2w[word_idx]
Example #15
def rev_gen(model, vocab, start_word=SOS):
    """ Generate a review starts with 'start_word', ends with '</s>'
    """
    print('Generating sample review .....................')
    with torch.no_grad():
        word_idx = vocab.w2i[start_word]
        all_words = []
        all_words.append(start_word)
        # initialise the hidden state once (batch size 1) and carry it across generated words
        hidden = model.init_hidden(1)
        while word_idx != vocab.w2i[EOS]:
            word_tensor = Data2tensor.idx2tensor([[word_idx]])
            output, hidden = model(word_tensor, hidden)
            label_prob, label_pred = model.inference(output)
            word_idx = label_pred.data[0][0].data.numpy()[0]
            all_words.append(vocab.i2w[word_idx])

        return ' '.join(all_words)
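
A brief usage sketch for the two helpers above, assuming a trained language model and its vocabulary are already loaded (the loading step is omitted and the prefix text is illustrative):

    # predict the single most likely next word after a prefix
    print(wd_pred(model, vocab, "the food was"))
    # generate a full review starting from the SOS token until </s> is produced
    print(rev_gen(model, vocab))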
Example #16
def scoring(sent, args, classifier):
    cl = args.vocab.cl
    ## set model in eval mode
    classifier.model.eval()

    fake_label = [0]
    words = classifier.word2idx(sent)
    word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                      pad_tok=0,
                                                      wthres=cl)

    data_tensors = Data2tensor.sort_tensors(fake_label,
                                            word_ids,
                                            sequence_lengths,
                                            volatile_flag=True)
    fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
    label_score = classifier.model(word_tensor, sequence_lengths)
    #    label_prob, label_pred = classifier.model.inference(label_score)
    return label_score
Example #17
 def predict_null(self, sent, asp):
     wl = self.classifier.args.vocab.wl
     ## set model in eval mode
     self.classifier.model.eval()
     fake_label = [0]
     words, asp_loc = self.classifier.word2idx(sent, asp)
     word_ids, sequence_lengths = seqPAD.pad_sequences([words],
                                                       pad_tok=0,
                                                       wthres=wl)
     data_tensors = Data2tensor.sort_tensors(fake_label, [asp_loc],
                                             word_ids, sequence_lengths,
                                             self.classifier.device)
     fake_label_tensor, aspect_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors
     word_h_n = self.classifier.model.rnn.get_all_hiddens(
         word_tensor, sequence_lengths).mean(1)
     label_score = self.classifier.model.hidden2tag(word_h_n)
     label_score = self.classifier.model.dropfinal(label_score)
     label_prob, label_pred = self.classifier.model.inference(
         label_score, len(self.i2l))
     return label_prob, label_pred
Example #18
    def predict(self, sent):
        numtags = len(self.args.vocab.l2i)
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl
        ## set model in eval mode
        self.model.eval()

        words = self.word2idx(sent)
        char_ids, word_ids = zip(*words)
        fake_label = [[0] * len(word_ids)]

        word_ids, sequence_lengths = seqPAD.pad_sequences([word_ids],
                                                          pad_tok=0,
                                                          wthres=wl,
                                                          cthres=cl)
        char_ids, word_lengths = seqPAD.pad_sequences([char_ids],
                                                      pad_tok=0,
                                                      nlevels=2,
                                                      wthres=wl,
                                                      cthres=cl)

        data_tensors = Data2tensor.sort_tensors(fake_label,
                                                word_ids,
                                                sequence_lengths,
                                                char_ids,
                                                word_lengths,
                                                volatile_flag=True)
        fake_label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
        label_score = self.model(word_tensor, sequence_lengths, char_tensor,
                                 word_lengths, char_seq_recover)

        if numtags > 2:
            label_prob, label_pred = label_score.data.max(1)
        else:
            label_prob = torch.sigmoid(label_score.squeeze())
            label_pred = (label_prob >= 0.5).data.long()
        return label_prob, label_pred
Example #19
 def evaluate_batch(self, eva_data):
     cl = self.args.vocab.cl    
     
     batch_size = self.args.batch_size  
     ## set model in eval mode
     self.model.eval()
     num_label = 0
     num_correct = 0
     for i, (words, label_ids) in enumerate(self.args.vocab.minibatches(eva_data, batch_size=batch_size)):
         word_ids, sequence_lengths = seqPAD.pad_sequences(words, pad_tok=0, wthres=cl)
         data_tensors = Data2tensor.sort_tensors(label_ids, word_ids, sequence_lengths, volatile_flag=True)
         label_tensor, word_tensor, sequence_lengths, word_seq_recover = data_tensors 
         
         label_score = self.model(word_tensor, sequence_lengths)
         label_prob, label_pred = self.model.inference(label_score, k=1)
             
         assert len(label_pred) == len(label_tensor)
         correct_pred = (label_pred.squeeze() == label_tensor.data).sum().item()
         assert correct_pred <= batch_size
         num_label += len(label_tensor)
         num_correct += correct_pred
     acc = num_correct / num_label
     return acc
Example #20
    def evaluate_batch(self, eva_data):
        wl = self.args.vocab.wl
        cl = self.args.vocab.cl

        batch_size = self.args.batch_size
        ## set model in eval mode
        self.model.eval()
        correct_preds = 0.
        total_preds = 0.
        total_correct = 0.
        accs = []
        pred_results = []
        gold_results = []
        for i, (words, label_ids) in enumerate(
                self.args.vocab.minibatches(eva_data, batch_size=batch_size)):
            char_ids, word_ids = zip(*words)
            word_ids, sequence_lengths = seqPAD.pad_sequences(word_ids,
                                                              pad_tok=0,
                                                              wthres=wl,
                                                              cthres=cl)
            char_ids, word_lengths = seqPAD.pad_sequences(char_ids,
                                                          pad_tok=0,
                                                          nlevels=2,
                                                          wthres=wl,
                                                          cthres=cl)
            label_ids, _ = seqPAD.pad_sequences(label_ids,
                                                pad_tok=0,
                                                wthres=wl,
                                                cthres=cl)

            data_tensors = Data2tensor.sort_tensors(label_ids,
                                                    word_ids,
                                                    sequence_lengths,
                                                    char_ids,
                                                    word_lengths,
                                                    volatile_flag=True)
            label_tensor, word_tensor, sequence_lengths, word_seq_recover, char_tensor, word_lengths, char_seq_recover = data_tensors
            mask_tensor = word_tensor > 0

            label_score = self.model(word_tensor, sequence_lengths,
                                     char_tensor, word_lengths,
                                     char_seq_recover)

            label_prob, label_pred = self.model.inference(
                label_score, mask_tensor)

            pred_label, gold_label = recover_label(label_pred, label_tensor,
                                                   mask_tensor,
                                                   self.args.vocab.l2i,
                                                   word_seq_recover)
            pred_results += pred_label
            gold_results += gold_label
        acc, p, r, f = get_ner_fmeasure(gold_results, pred_results)

        #            label_pred = label_pred.cpu().data.numpy()
        #            label_tensor = label_tensor.cpu().data.numpy()
        #            sequence_lengths = sequence_lengths.cpu().data.numpy()
        #
        #            for lab, lab_pred, length in zip(label_tensor, label_pred, sequence_lengths):
        #                lab      = lab[:length]
        #                lab_pred = lab_pred[:length]
        #                accs    += [a==b for (a, b) in zip(lab, lab_pred)]
        #
        #                lab_chunks      = set(NERchunks.get_chunks(lab, self.args.vocab.l2i))
        #                lab_pred_chunks = set(NERchunks.get_chunks(lab_pred, self.args.vocab.l2i))
        #
        #                correct_preds += len(lab_chunks & lab_pred_chunks)
        #                total_preds   += len(lab_pred_chunks)
        #                total_correct += len(lab_chunks)
        #
        #        p   = correct_preds / total_preds if correct_preds > 0 else 0
        #        r   = correct_preds / total_correct if correct_preds > 0 else 0
        #        f  = 2 * p * r / (p + r) if correct_preds > 0 else 0
        #        acc = np.mean(accs)

        return acc, f
Example #21
# coding: utf-8
import argparse
import time
import math
import torch
import os
import torch.onnx
import torch.optim as optim
from sklearn import metrics
from utils.data_utils import Vocab, Txtfile, Data2tensor, SaveloadHP, seqPAD, PAD
from utils.core_nns_emb import UniLSTMModel, BiLSTMModel

# Set the random seed manually for reproducibility.
Data2tensor.set_randseed(1234)


class Sentimentmodel(object):
    def __init__(self, args):
        self.args = args
        self.device = torch.device("cuda" if self.args.use_cuda else "cpu")
        self.word2idx = self.args.vocab.wd2idx(self.args.vocab.w2i,
                                               allow_unk=self.args.allow_unk,
                                               start_end=self.args.se_words)
        self.label2idx = self.args.vocab.tag2idx(self.args.vocab.l2i)
        self.ntokens = len(self.args.vocab.w2i)
        self.nlabels = len(self.args.vocab.l2i)
        if args.bidirect:
            self.model = BiLSTMModel(args.model,
                                     self.ntokens,
                                     args.emb_size,
                                     args.hidden_size,