class predict_w(predict):
    """prediction class for word level model (LSTM-CRF)

    args: 
        if_cuda: if use cuda to speed up 
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label 
        batch_size: size of batch in decoding
        caseless: caseless or not
    """
    def __init__(self,
                 if_cuda,
                 f_map,
                 l_map,
                 pad_word,
                 pad_label,
                 start_label,
                 batch_size=1):
        predict.__init__(self, if_cuda, l_map, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map

    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
        """
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

        # mask marks each token plus one trailing position with 1, padding with 0
        masks = torch.ByteTensor(
            list(
                map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1),
                    features)))
        # pad every sequence to the same length with the padding word index
        word_features = torch.LongTensor(
            list(
                map(lambda t: t + [self.pad_word] * (f_len - len(t)),
                    features)))

        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
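
A minimal usage sketch for the class above; the trained model and the mapping dictionaries (ner_model, f_map, l_map) are assumptions standing in for objects built elsewhere in the repository, and '<eof>' is assumed to be the word-padding token:

# hypothetical setup: f_map / l_map come from a training checkpoint,
# ner_model is a trained LSTM-CRF loaded elsewhere
predictor = predict_w(if_cuda=False,
                      f_map=f_map,
                      l_map=l_map,
                      pad_word=f_map['<eof>'],   # assumed padding token
                      pad_label=l_map['<pad>'],
                      start_label=l_map['<start>'])
# apply_model expects a list of word lists and returns decoded label indices
decoded = predictor.apply_model(ner_model, [['EU', 'rejects', 'German', 'call']])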
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args:
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """

    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader, elmo_embeddings):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
            elmo_embeddings: precomputed ELMo embeddings, indexed by sentence id
        """
        ner_model.eval()
        self.reset()

        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, origin_w_f, index in itertools.chain.from_iterable(dataset_loader):
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)

            index = index.data.tolist()
            # gather the ELMo embedding matrix of each sentence in the batch
            elmo_emb = [elmo_embeddings[i] for i in index]
            # pad to the longest sentence plus one extra position
            max_len = max(len(e) for e in elmo_emb) + 1
            padded_elmo = []
            for w in elmo_emb:
                if len(w) < max_len:
                    # append zero vectors (ELMo dimension 1024) up to max_len
                    w = np.concatenate((w, [[0.] * 1024] * (max_len - len(w))), axis=0)
                padded_elmo.append(w)
            # final shape: (seq_len, batch_size, 1024)
            elmo_emb = torch.FloatTensor(padded_elmo).permute(1, 0, 2)

            scores = ner_model(f_f, f_p, b_f, b_p, w_f, elmo_emb.cuda())
            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(decoded, tg)

        return self.calc_s()
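
A hedged sketch of how this evaluator might be driven; the packer, loader, and embedding-cache names are assumptions based on the signatures above:

evaluator = eval_wc(packer, l_map, score_type='f')
# elmo_cache: maps sentence index -> np.ndarray of shape (sentence_len, 1024)
result = evaluator.calc_score(ner_model, test_loader, elmo_cache)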
Example #3
class predict_w(predict):
    """prediction class for word level model (LSTM-CRF)

    args: 
        if_cuda: if use cuda to speed up 
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label 
        label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
        batch_size: size of batch in decoding
        caseless: caseless or not
    """
   
    def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label, label_seq=True, batch_size=50, caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map
        self.caseless = caseless
        
    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
        """
        if self.caseless:
            features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

        masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features)))
        word_features = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features)))

        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
Example #4
class predict_w(predict):
    """prediction class for word level model (LSTM-CRF)

    args: 
        if_cuda: if use cuda to speed up 
        f_map: dictionary for words
        l_map: dictionary for labels
        pad_word: word padding
        pad_label: label padding
        start_label: start label 
        label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
        batch_size: size of batch in decoding
        caseless: caseless or not
    """
   
    def __init__(self, if_cuda, f_map, l_map, pad_word, pad_label, start_label, label_seq=True, batch_size=50, caseless=True, keep_iobes=False):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size, keep_iobes=keep_iobes)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.f_map = f_map
        self.l_map = l_map
        self.caseless = caseless
        
    def apply_model(self, ner_model, features):
        """
        apply_model function for LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
        """
        if self.caseless:
            features = list(map(lambda t: list(map(lambda x: x.lower(), t)), features))
        features = encode_safe(features, self.f_map, self.f_map['<unk>'])
        f_len = max(map(lambda t: len(t) + 1, features))

        masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (f_len - len(t) - 1), features)))
        word_features = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (f_len - len(t)), features)))

        if self.if_cuda:
            fea_v = autograd.Variable(word_features.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            fea_v = autograd.Variable(word_features.transpose(0, 1))
            mask_v = masks.transpose(0, 1).contiguous()

        scores, _ = ner_model(fea_v)
        decoded = self.decoder.decode(scores.data, mask_v)
        return decoded
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args: 
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """
    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader):
        """
        calculate score for pre-selected metrics

        args: 
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
        """
        ner_model.eval()
        self.reset()

        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(
                dataset_loader):
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f)
            # this model variant returns several score tensors; decode the last one
            score = scores[-1]
            decoded = self.decoder.decode(score.data, mask_v.data)
            self.eval_b(decoded, tg)

        return self.calc_s()
Example #6
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args:
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """

    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
        """
        ner_model.eval()
        self.reset()

        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(dataset_loader):
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f)
            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(decoded, tg)

        return self.calc_s()
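
All of these evaluation loops flatten a list of batched sub-loaders with the same idiom; a minimal standalone illustration of the chaining:

import itertools

loader = [[('batch1',), ('batch2',)], [('batch3',)]]
for batch in itertools.chain.from_iterable(loader):
    print(batch)  # ('batch1',), then ('batch2',), then ('batch3',)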
Example #7
class predict_wc(predict):
    """prediction class for LM-LSTM-CRF

    args: 
        if_cuda: if use cuda to speed up 
        f_map: dictionary for words
        c_map: dictionary for chars
        l_map: dictionary for labels
        pad_word: word padding
        pad_char: word padding
        pad_label: label padding
        start_label: start label 
        label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
        batch_size: size of batch in decoding
        caseless: caseless or not
    """
   
    def __init__(self, if_cuda, f_map, c_map, l_map, pad_word, pad_char, pad_label, start_label, label_seq=True, batch_size=50, caseless=True):
        predict.__init__(self, if_cuda, l_map, label_seq, batch_size)
        self.decoder = CRFDecode_vb(len(l_map), start_label, pad_label)
        self.pad_word = pad_word
        self.pad_char = pad_char
        self.f_map = f_map
        self.c_map = c_map
        self.l_map = l_map
        self.caseless = caseless
        
    def apply_model(self, ner_model, features):
        """
        apply_model function for LM-LSTM-CRF

        args:
            ner_model: sequence labeling model
            features (list): list of word lists
        """
        char_features = encode2char_safe(features, self.c_map)

        if self.caseless:
            word_features = encode_safe(list(map(lambda t: list(map(lambda x: x.lower(), t)), features)), self.f_map, self.f_map['<unk>'])
        else:
            word_features = encode_safe(features, self.f_map, self.f_map['<unk>'])

        # per-word char lengths, each +1 to account for the word boundary
        fea_len = [list(map(lambda t: len(t) + 1, f)) for f in char_features]
        forw_features = concatChar(char_features, self.c_map)

        word_len = max(map(lambda t: len(t) + 1, word_features))
        char_len = max(map(lambda t: len(t[0]) + word_len - len(t[1]), zip(forw_features, word_features)))
        forw_t = list(map(lambda t: t + [self.pad_char] * (char_len - len(t)), forw_features))
        back_t = torch.LongTensor(list(map(lambda t: t[::-1], forw_t)))
        forw_t = torch.LongTensor(forw_t)
        # cumulative char offsets marking the end position of each word
        forw_p = torch.LongTensor(list(map(lambda t: list(itertools.accumulate(t + [1] * (word_len - len(t)))), fea_len)))
        back_p = torch.LongTensor(list(map(lambda t: [char_len - 1] + [char_len - 1 - tup for tup in t[:-1]], forw_p)))

        masks = torch.ByteTensor(list(map(lambda t: [1] * (len(t) + 1) + [0] * (word_len - len(t) - 1), word_features)))
        word_t = torch.LongTensor(list(map(lambda t: t + [self.pad_word] * (word_len - len(t)), word_features)))

        if self.if_cuda:
            f_f = autograd.Variable(forw_t.transpose(0, 1)).cuda()
            f_p = autograd.Variable(forw_p.transpose(0, 1)).cuda()
            b_f = autograd.Variable(back_t.transpose(0, 1)).cuda()
            b_p = autograd.Variable(back_p.transpose(0, 1)).cuda()
            w_f = autograd.Variable(word_t.transpose(0, 1)).cuda()
            mask_v = masks.transpose(0, 1).cuda()
        else:
            f_f = autograd.Variable(forw_t.transpose(0, 1))
            f_p = autograd.Variable(forw_p.transpose(0, 1))
            b_f = autograd.Variable(back_t.transpose(0, 1))
            b_p = autograd.Variable(back_p.transpose(0, 1))
            w_f = autograd.Variable(word_t.transpose(0, 1))
            mask_v = masks.transpose(0, 1)

        scores = ner_model(f_f, f_p, b_f, b_p, w_f)
        decoded = self.decoder.decode(scores.data, mask_v)

        return decoded
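
To make the char-position bookkeeping above concrete, a small standalone illustration of how itertools.accumulate turns per-word char lengths into forward end-positions (toy lengths, not tied to a real vocabulary):

import itertools

# chars per word plus one boundary position: "cat" -> 4, "on" -> 3, "mat" -> 4
fea_len = [4, 3, 4]
forw_p = list(itertools.accumulate(fea_len))
print(forw_p)  # [4, 7, 11] -- the char index marking the end of each word
# back_p mirrors these offsets from the end of the reversed, padded char stream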
Example #8
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args:
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """
    def __init__(self, packer, c_map, l_map, removed_label=[]):
        eval_batch.__init__(self, packer, c_map, l_map, removed_label)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        self.eval_b = self.calc_f1_batch
        self.calc_s = self.f1_score

    def calc_score(self,
                   ner_model,
                   dataset_loader,
                   out,
                   f_map,
                   emb,
                   word_to_id,
                   gpu,
                   r_c_map,
                   l_map,
                   knowledge_dict,
                   no_dict=False):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
            f_map, emb, word_to_id, gpu: used to reconstruct the word-level input embeddings
            r_c_map, l_map, knowledge_dict: used to generate the dictionary prior over labels
            no_dict: if True, decode without the dictionary prior
        """
        ner_model.eval()
        self.reset()

        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(
                dataset_loader):
            mask_v = mask_v.bool()
            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            w_f_word = utils.reconstruct_word_input(w_f, f_map, emb,
                                                    word_to_id, gpu)
            prior_prob = utils.generate_prior_prob(r_c_map, l_map, f_f,
                                                   knowledge_dict)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f, w_f_word, prior_prob)
            decoded = self.decoder.decode(scores.data, mask_v.data, prior_prob,
                                          no_dict)
            self.eval_b(decoded, tg)

        return self.calc_s()

    def check_output(self,
                     ner_model,
                     dataset_loader,
                     out,
                     f_map,
                     emb,
                     word_to_id,
                     gpu,
                     knowledge_dict,
                     no_dict=False):
        ner_model.eval()
        self.reset()
        f = open('model_output.txt', 'w')
        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v in itertools.chain.from_iterable(
                dataset_loader):
            mask_v = mask_v.bool()

            f_f, f_p, b_f, b_p, w_f, _, mask_v = self.packer.repack_vb(
                f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v)
            w_f_word = utils.reconstruct_word_input(w_f, f_map, emb,
                                                    word_to_id, gpu)
            prior_prob = utils.generate_prior_prob(self.r_c_map, self.l_map,
                                                   f_f, knowledge_dict)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f, w_f_word, prior_prob)
            decoded = self.decoder.decode(scores.data, mask_v.data, prior_prob,
                                          no_dict)

            self.eval_b(decoded, tg)
            batch_decoded = torch.unbind(decoded, 1)
            batch_targets = torch.unbind(tg, 0)
            batch_f_f = torch.unbind(f_f, 1)

            for decoded, target, character in zip(batch_decoded, batch_targets,
                                                  batch_f_f):

                gold = self.packer.convert_for_eval(target)
                # remove padding
                length = utils.find_length_from_labels(gold, self.l_map)
                gold = gold[:length].numpy()
                best_path = decoded[:length].numpy()
                character_filtered = []
                for c in character.cpu().numpy():
                    if c != 42:  # 42 is assumed to be the padding char index here
                        character_filtered.append(c)
                char = character_filtered[:length]
                for i in range(len(gold)):

                    f.write(self.r_c_map[char[i]] + ' ' +
                            self.r_l_map[gold[i]] + ' ' +
                            self.r_l_map[best_path[i]])
                    f.write('\n')

                f.write('\n')
        f.close()
        return self.calc_s()
Example #9
class eval_w(eval_batch):
    """evaluation class for word level model (LSTM-CRF)

    args:
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """
    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self, ner_model, dataset_loader, illegal_idx, is_bichar):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LSTM-CRF model
            dataset_loader: loader class for test set
            illegal_idx: label indices disallowed during decoding
            is_bichar: whether bichar features are used
        """
        ner_model.eval()
        self.reset()
        for i in range(len(dataset_loader[0])):
            fea_v, tg_v, mask_v, bi_fea_v = self.packer.repack_vb(
                np.asarray(dataset_loader[0][i]),
                np.asarray(dataset_loader[1][i]),
                np.asarray(dataset_loader[2][i]),
                np.asarray(dataset_loader[4][i]))
            ner_model.zero_grad()
            scores, hidden = ner_model(fea_v, bi_fea_v, dataset_loader[3][i],
                                       illegal_idx, self.l_map, is_bichar)
            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(
                decoded,
                torch.LongTensor(np.asarray(
                    dataset_loader[1][i])).unsqueeze(0))
        return self.calc_s()

    def calc_predict(self, ner_model, dataset_loader, test_features, file_out,
                     file_out_2, f_map):
        """
        calculate score for pre-selected metrics

        args:
            ner_model: LSTM-CRF model
            dataset_loader: loader class for test set
        """
        ner_model.eval()
        self.reset()
        idx2label = {v: k for k, v in self.l_map.items()}
        idx2word = {v: k for k, v in f_map.items()}
        for i in range(len(dataset_loader[0])):
            fea_v, tg_v, mask_v = self.packer.repack_vb(
                np.asarray(dataset_loader[0][i]),
                np.asarray(dataset_loader[1][i]),
                np.asarray(dataset_loader[2][i]))
            ner_model.zero_grad()
            scores, hidden = ner_model(fea_v, dataset_loader[3][i])
            decoded = self.decoder.decode(scores.data, mask_v.data)
            gold = [d % len(self.l_map) for d in dataset_loader[1][i]]
            # words = [idx2word[w] for w in dataset_loader[0][i]]
            length = utils.find_length_from_labels(gold, self.l_map)
            gold = gold[:length]
            words = test_features[i][:length]
            best_path = decoded.squeeze(1).tolist()[:length]
            gold = [idx2label[g] for g in gold]
            best_path = [idx2label[g] for g in best_path]
            for j in range(length):
                file_out.write("%s %s\n" % (words[j], best_path[j]))
            file_out.write("\n")

            # join consecutive B-/I- labels into word_POS segments
            sent = ''
            pos = None
            word = ''
            for j in range(length):
                if best_path[j].startswith('B'):
                    if pos is not None:
                        sent += word + '_' + pos + ' '
                        word = ''
                        pos = None
                    word += words[j]
                    pos = best_path[j].split('-')[1]
                else:
                    assert pos is not None
                    word += words[j]
            if len(word) > 0:
                sent += word + '_' + pos + ' '
            file_out_2.write("%s\n" % (sent))
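
A worked illustration of the chunk-joining loop above with hypothetical inputs; the concatenation without spaces suggests character-level input, as in Chinese word segmentation plus POS tagging:

# character-level input (hypothetical)
words = ['上', '海', '很', '大']
best_path = ['B-NR', 'I-NR', 'B-VA', 'I-VA']
# the loop above would emit: "上海_NR 很大_VA "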
Example #11
class eval_wc(eval_batch):
    """evaluation class for LM-LSTM-CRF

    args: 
        packer: provide method to convert target into original space [TODO: need to improve]
        l_map: dictionary for labels
        score_type: use f1score with using 'f'

    """
    def __init__(self, packer, l_map, score_type):
        eval_batch.__init__(self, packer, l_map)

        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'],
                                    l_map['<pad>'])

        if 'f' in score_type:
            self.eval_b = self.calc_f1_batch
            self.calc_s = self.f1_score
        else:
            self.eval_b = self.calc_acc_batch
            self.calc_s = self.acc_score

    def calc_score(self,
                   ner_model,
                   dataset_loader,
                   crf_no,
                   crit_ner,
                   verbose=0):
        """
        calculate score for pre-selected metrics

        args: 
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
        """
        ner_model.eval()
        self.reset()

        epoch_loss = 0
        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, corpus_mask_v, reorder in itertools.chain.from_iterable(
                dataset_loader):
            f_f, f_p, b_f, b_p, w_f, tg_v, mask_v, corpus_mask_v = self.packer.repack_vb(
                f_f,
                f_p,
                b_f,
                b_p,
                w_f,
                tg,
                mask_v,
                len_v,
                corpus_mask_v,
                volatile=True)
            scores = ner_model(f_f, f_p, b_f, b_p, w_f, crf_no, corpus_mask_v)

            loss = crit_ner(scores, tg_v, mask_v, corpus_mask_v)
            epoch_loss += utils.to_scalar(loss)

            decoded = self.decoder.decode(scores.data, mask_v.data)
            self.eval_b(decoded, tg)

        # report validation loss: total and mean per batch
        print("validation loss: {}, {}".format(
            epoch_loss,
            epoch_loss / sum(map(lambda t: len(t), dataset_loader))))
        if verbose > 0:
            print("pred", self.pred_cnter)
            print("gold", self.gold_cnter)

        return self.calc_s()
Example #12
class Predictor:
    """Base class for prediction, provide method to calculate f1 score and accuracy 

    args: 
        if_cuda: if use cuda to speed up 
        l_map: dictionary for labels 
        label_seq: type of decode function, set `True` to couple label with text, or set 'False' to insert label into test
        batch_size: size of batch in decoding
    """

    def __init__(self, l_map, packer, label_seq=True, batch_size=50):
        self.l_map = l_map
        self.decoder = CRFDecode_vb(len(l_map), l_map['<start>'], l_map['<pad>'])
        self.packer = packer
        self.r_l_map = revlut(l_map)
        self.batch_size = batch_size
        if label_seq:
            self.decode_str = self.decode_l
        else:
            self.decode_str = self.decode_s

    def decode_l(self, feature, label):
        """
        decode a sentence coupled with label

        args:
            feature (list): words list
            label (list): label list
        """
        return '\n'.join(map(lambda t: t[0] + ' '+ self.r_l_map[t[1]], zip(feature, label)))

    def decode_s(self, feature, label):
        """
        decode a sentence with labels rendered as inline <TYPE> ... </TYPE> chunks

        args:
            feature (list): words list
            label (list): label list
        """
        chunks = ""
        current = None

        for f, y in zip(feature, label):
            label = self.r_l_map[y]

            if label.startswith('B-'):

                if current is not None:
                    chunks += "</"+current+"> "
                current = label[2:]
                chunks += "<"+current+"> " + f + " "

            elif label.startswith('S-'):

                if current is not None:
                    chunks += " </"+current+"> "
                current = label[2:]
                chunks += "<"+current+"> " + f + " </"+current+"> "
                current = None

            elif label.startswith('I-'):

                if current is not None:
                    base = label[2:]
                    if base == current:
                        chunks += f+" "
                    else:
                        chunks += "</"+current+"> <"+base+"> " + f + " "
                        current = base
                else:
                    current = label[2:]
                    chunks += "<"+current+"> " + f + " "

            elif label.startswith('E-'):

                if current is not None:
                    base = label[2:]
                    if base == current:
                        chunks += f + " </"+base+"> "
                        current = None
                    else:
                        chunks += "</"+current+"> <"+base+"> " + f + " </"+base+"> "
                        current = None

                else:
                    current = label[2:]
                    chunks += "<"+current+"> " + f + " </"+current+"> "
                    current = None

            else:
                if current is not None:
                    chunks += "</"+current+"> "
                chunks += f+" "
                current = None

        if current is not None:
            chunks += "</"+current+"> "

        return chunks
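
A worked illustration of decode_s on a BIOES-tagged sentence; string tags are shown for readability, while the method actually receives label indices and maps them through r_l_map:

feature = ['John', 'lives', 'in', 'New', 'York']
labels = ['S-PER', 'O', 'O', 'B-LOC', 'E-LOC']
# decode_s would render: "<PER> John </PER> lives in <LOC> New York </LOC> "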

    def output_batch(self, ner_model, documents, fout, crf_no):
        """
        decode the whole corpus in the specific format by calling apply_model, which subclasses implement for their model

        args:
            ner_model: sequence labeling model
            documents (list): list of documents, each a list of sentences (word lists)
            fout: output file
            crf_no: index of the CRF layer to decode with
        """
        ner_model.eval()

        d_len = len(documents)
        for d_ind in tqdm( range(0, d_len), mininterval=1,
                desc=' - Process', leave=False, file=sys.stdout):
            fout.write('-DOCSTART- -DOCSTART- -DOCSTART-\n\n')
            features = documents[d_ind]
            f_len = len(features)
            for ind in range(0, f_len, self.batch_size):
                eind = min(f_len, ind + self.batch_size)
                labels = self.apply_model(ner_model, features[ind: eind], crf_no)
                labels = torch.unbind(labels, 1)

                for ind2 in range(ind, eind):
                    f = features[ind2]
                    l = labels[ind2 - ind][0: len(f) ]
                    fout.write(self.decode_str(features[ind2], l) + '\n\n')

    def predict(self, ner_model, dataset_loader, crf_no, pred_method, merge_batch=False, totag=False):
        """
        calculate score for pre-selected metrics

        args: 
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
        """
        ner_model.eval()
        corpus_labels = []
        num_sample = sum(map(lambda t: len(t), dataset_loader)) 
        for f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, corpus_mask_v, reorder in tqdm(
            itertools.chain.from_iterable(dataset_loader), mininterval=2,
            desc=' - Total it %d' % (num_sample), leave=False, file=sys.stdout):
            f_f, f_p, b_f, b_p, w_f, _, mask_v, corpus_mask_v = self.packer.repack_vb(f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, corpus_mask_v, volatile=True)
            labels, scores = self.predict_batch(ner_model, crf_no, f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, corpus_mask_v, pred_method)
            
            labels = torch.unbind(labels, 1)
            _, length = torch.unbind(len_v, 1)
            length = length.tolist()

            unpad_labels = []
            for pad_labels, sent_len, rerdr in zip(labels, length, reorder):
                # subtract the <start> tag
                unpad_labels.append((rerdr, pad_labels[:sent_len - 1]))
            if totag:
                unpad_labels = [(rerdr, [self.r_l_map[idx_label] for idx_label in sent_idx_label]) for rerdr, sent_idx_label in unpad_labels]
            corpus_labels.append(unpad_labels)
        if merge_batch:
            corpus_labels = functools.reduce(lambda x, y: x + y, corpus_labels)
        return corpus_labels

    def predict_batch(self, ner_model, crf_no, f_f, f_p, b_f, b_p, w_f, tg, mask_v, len_v, corpus_mask_v, pred_method):
        """
        calculate score for pre-selected metrics

        args: 
            ner_model: LM-LSTM-CRF model
            dataset_loader: loader class for test set
        """
        if ner_model.training:
            ner_model.eval()
        scores = ner_model(f_f, f_p, b_f, b_p, w_f, crf_no, corpus_mask_v)
        
        assert pred_method in ["M", "U"]
        
        if pred_method == "M":
            # no matter take sigmoid or not, setting undesired scores to -inf
            neg_inf_scores = autograd.Variable(torch.FloatTensor(np.full(scores.shape, -1e9))).cuda()
            selected_scores = utils.switch(neg_inf_scores.contiguous(), scores.contiguous(), corpus_mask_v).view(scores.shape)
            decoded = self.decoder.decode(selected_scores.data, mask_v.data)
            return decoded, scores
        
        if pred_method == "U":
            decoded = self.decoder.decode(scores.data, mask_v.data)
            for i in range(decoded.shape[0]):
                for j in range(decoded.shape[1]):
                    idx_annotated = np.where(corpus_mask_v[i,j,0].cpu().data)[0]
                    if not decoded[i,j] in idx_annotated:
                        decoded[i,j] = self.l_map['O']
            return decoded, scores
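
A hedged, model-independent sketch of what the two prediction modes do conceptually, using toy per-label scores in plain NumPy (the real code operates on CRF score tensors):

import numpy as np

scores = np.array([2.0, 5.0, 1.0])           # per-label scores
corpus_mask = np.array([True, False, True])  # which labels this corpus annotates
# "M": push un-annotated labels to -inf before decoding
masked = np.where(corpus_mask, scores, -1e9)
print(int(masked.argmax()))  # 0 -- label 1 is excluded despite the highest raw score
# "U": decode on the raw scores first, then re-map any prediction
# outside the annotated set to the 'O' label afterwards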