Example #1
def predict(config=None, model=None, sent=None):
    """
    Input: raw sentences saved in config.input_file or sent
    Output: results of trigger identification saved in config.tri_id_result_file
            format: sentence ||| tag (BIO)
    """
    # load config
    if not config:
        config = Config()

    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_id_label_file)
    tagset_size = len(label_dic)
    # load trained model
    if not model:
        model = BertLstmCrf(config.bert_path,
                            tagset_size,
                            config.bert_embedding,
                            config.rnn_hidden,
                            config.rnn_layer,
                            dropout_ratio=config.dropout_ratio,
                            dropout1=config.dropout1,
                            use_cuda=config.use_cuda)
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.cuda()
    # put the model in evaluation mode so dropout is disabled during inference
    model.eval()
    # begin predicting
    if (not config.input_file) and sent:
        # preprocess sent
        sent = sent.lower()
        tokens = sent.split()
        tokens = tokens[0:min(config.max_length - 2, len(tokens))]
        tokens_f = ['[CLS]'] + tokens + ['[SEP]']
        input_ids = torch.LongTensor([[
            int(vocab[i]) if i in vocab else int(vocab['[UNK]'])
            for i in tokens_f
        ]])
        input_masks = torch.LongTensor([[1] * len(input_ids[0])])
        if config.use_cuda and torch.cuda.is_available():
            input_ids, input_masks = input_ids.cuda(), input_masks.cuda()
        # predict tags
        with torch.no_grad():
            feats = model(input_ids, input_masks)
            # the CRF layer expects a byte/bool mask, matching the batch branch below
            path_score, best_path = model.crf(feats, input_masks.byte())
        pred_label = best_path[0].cpu().numpy().tolist()
        pred_label = [list(label_dic.keys())[int(x)] for x in pred_label[1:-1]]
        return pred_label
    else:
        with open(config.input_file, 'r', encoding='utf-8') as f:
            sents = f.readlines()
        data = []
        for line in sents:
            line = line.lower()
            tokens = line.split()
            tokens = tokens[0:min(config.max_length - 2, len(tokens))]
            tokens_f = ['[CLS]'] + tokens + ['[SEP]']
            input_ids = [
                int(vocab[i]) if i in vocab else int(vocab['[UNK]'])
                for i in tokens_f
            ]
            input_masks = [1] * len(input_ids)
            while len(input_ids) < config.max_length:
                input_ids.append(0)
                input_masks.append(0)
            data.append((input_ids, input_masks))
        ids = torch.LongTensor([temp[0] for temp in data])
        masks = torch.LongTensor([temp[1] for temp in data])
        dataset = TensorDataset(ids, masks)
        loader = DataLoader(dataset,
                            shuffle=False,
                            batch_size=config.batch_size)
        sents = []
        pred = []
        for i, batch in tqdm.tqdm(enumerate(loader)):
            inputs, masks = batch
            masks = masks.bool()

            # save sentences
            for idx in range(inputs.shape[0]):
                sents.append(inputs[idx][masks[idx]].cpu().numpy().tolist())

            # predict labels
            if config.use_cuda:
                inputs, masks = inputs.cuda(), masks.cuda()
            with torch.no_grad():
                feats = model(inputs, masks)
                path_score, best_path = model.crf(feats, masks.byte())

            # save labels
            for idx in range(inputs.shape[0]):
                pred.append(best_path[idx][masks[idx]].cpu().numpy().tolist())
        # save result
        save_results(sents, pred, config)
        return pred
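
A minimal usage sketch for predict(), assuming the default Config() already points at a trained checkpoint through config.load_path; the example sentence and the idea of clearing config.input_file are illustrative, not part of the original code:

# Single-sentence mode: config.input_file must be unset so the sentence branch runs.
config = Config()
config.input_file = None  # assumption made for this illustration
tags = predict(config=config, sent='He was arrested by the police in Baghdad .')
print(tags)  # one BIO tag per whitespace token

# File mode: tags every raw sentence in config.input_file and saves
# "sentence ||| tag" lines to config.tri_id_result_file.
predict()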
Example #2
          accuracy)
    print('eval  epoch: {}|  loss: {}'.format(epoch, eval_loss / length))
    model.train()
    return eval_loss


if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--do-train',
                        action='store_true',
                        help='Whether to retrain the model.')
    parser.add_argument('--do-eval',
                        action='store_true',
                        help='Whether to perform evaluation.')
    parser.add_argument('--with-golden-trigger',
                        action='store_true',
                        help='Whether to evaluate with golden triggers.')
    args = parser.parse_args()
    if args.do_train:
        train()
    if args.do_eval:
        if not args.with_golden_trigger:
            predict()
        else:
            config = Config()
            config.tri_id_result_file = './data/tri_id_test.txt'
            config.gold_trigger_file = './data/golden_test.txt'
            predict(config)
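
With the switches defined as store_true flags, the script can be driven from the command line roughly as follows (the file name run_tri_id.py is only a placeholder for whatever this module is called):

python run_tri_id.py --do-train
python run_tri_id.py --do-eval
python run_tri_id.py --do-eval --with-golden-trigger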
Example #3
def train(config=None):
    """Train Model"""
    # load config
    if not config:
        config = Config()
    print('settings:\n', config)
    # load corpus
    print('loading corpus.')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_id_label_file)
    tagset_size = len(label_dic)
    # load train and dev dataset
    train_data = read_corpus_tr_id(config.tri_id_train_file,
                                   max_length=config.max_length,
                                   label_dic=label_dic,
                                   vocab=vocab)
    train_ids = torch.LongTensor([temp[0] for temp in train_data])
    train_masks = torch.LongTensor([temp[1] for temp in train_data])
    train_tags = torch.LongTensor([temp[2] for temp in train_data])
    train_dataset = TensorDataset(train_ids, train_masks, train_tags)
    train_loader = DataLoader(train_dataset,
                              shuffle=True,
                              batch_size=config.batch_size)

    dev_data = read_corpus_tr_id(config.tri_id_dev_file,
                                 max_length=config.max_length,
                                 label_dic=label_dic,
                                 vocab=vocab)
    dev_ids = torch.LongTensor([temp[0] for temp in dev_data])
    dev_masks = torch.LongTensor([temp[1] for temp in dev_data])
    dev_tags = torch.LongTensor([temp[2] for temp in dev_data])
    dev_dataset = TensorDataset(dev_ids, dev_masks, dev_tags)
    dev_loader = DataLoader(dev_dataset,
                            shuffle=True,
                            batch_size=config.batch_size)
    # init model
    model = BertLstmCrf(config.bert_path,
                        tagset_size,
                        config.bert_embedding,
                        config.rnn_hidden,
                        config.rnn_layer,
                        dropout_ratio=config.dropout_ratio,
                        dropout1=config.dropout1,
                        use_cuda=config.use_cuda)
    if config.load_model:
        assert config.load_path is not None
        model = load_model(model, name=config.load_path)
    if config.use_cuda:
        model.cuda()
    # train model
    print('begin training.')
    model.train()
    optimizer = getattr(optim, config.optim)
    optimizer = optimizer(model.parameters(),
                          lr=config.lr,
                          weight_decay=config.weight_decay)
    eval_loss = float('inf')  # best dev loss observed so far
    for epoch in tqdm.tqdm(range(config.base_epoch)):
        for i, batch in tqdm.tqdm(enumerate(train_loader)):
            model.zero_grad()
            inputs, masks, tags = batch
            masks = masks.bool()
            if config.use_cuda:
                inputs, masks, tags = inputs.cuda(), masks.cuda(), tags.cuda()
            feats = model(inputs, masks)
            loss = model.loss(feats, masks, tags)
            loss.backward()
            optimizer.step()
        # save best model
        dev_loss_temp = evaluate(model, dev_loader, epoch, config)
        if dev_loss_temp < eval_loss:
            print('dev loss: ', eval_loss, ' -> ', dev_loss_temp)
            eval_loss = dev_loss_temp
            save_model(model, epoch)
    return model
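
Because predict() accepts an already constructed model, the network returned by train() can be handed over directly without reloading the checkpoint. A brief sketch, where clearing config.input_file to reach the single-sentence branch is an assumption and the sentence itself is illustrative:

config = Config()
model = train(config)
config.input_file = None  # assumption: force the single-sentence branch of predict()
tags = predict(config=config, model=model,
               sent='The company announced a merger on Monday .')
print(tags)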