def test():
    """Evaluate the trained trigger-classification model on the test split."""
    # Configuration drives every path / hyper-parameter below.
    config = Config()
    print('settings:\n', config)

    # Corpus resources: wordpiece vocab and the label inventory.
    print('loading corpus')
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_cls_label_file)

    # Build the test DataLoader: each corpus sample is a 4-tuple
    # (ids, mask, type_mask, tag); stack each column into a LongTensor.
    corpus = read_corpus_tri_cls(config.tri_cls_test_file,
                                 max_length=config.max_length,
                                 vocab=vocab)
    columns = [
        torch.LongTensor([sample[col] for sample in corpus])
        for col in range(4)
    ]
    test_loader = DataLoader(TensorDataset(*columns),
                             shuffle=False,
                             batch_size=config.batch_size)

    # Restore the trained classifier and move it to GPU when available.
    model = load_model(BertQA(config.bert_path, 2),
                       name=config.load_tri_cls_path)
    if config.use_cuda and torch.cuda.is_available():
        model.cuda()

    # Run evaluation once (epoch index 0 is only used for reporting).
    evaluate(model, test_loader, 0, config)
config = Config() # data preprocess ace_preprocess() # load model label_dic = load_vocab(config.tri_id_label_file) tagset_size = len(label_dic) model_id = BertLstmCrf(config.bert_path, tagset_size, config.bert_embedding, config.rnn_hidden, config.rnn_layer, dropout_ratio=config.dropout_ratio, dropout1=config.dropout1, use_cuda=config.use_cuda) model_id = load_model(model_id, name=config.load_path) model_cls = BertQA(config.bert_path, 2) model_cls = load_model(model_cls, name=config.load_tri_cls_path) # predict if not config.input_file: while True: # get input sent = input() if sent == 'exit': break # trigger identification pred_label = tri_id_pre(config, model_id, sent) for label in pred_label: print(label, end=' ') print() # trigger classification input_cls = sent + '|||' + ' '.join(pred_label)
def init_all(config, gpu_list, checkpoint, mode, *args, **params):
    """Initialize datasets, model, and (for training) optimizer state.

    Builds formatters and datasets for the given mode, constructs the model
    and optimizer, optionally restores them from `checkpoint`, and returns
    everything in a single result dict.
    """
    out = {}
    is_train = mode == "train"

    logger.info("Begin to initialize dataset and formatter...")
    if is_train:
        init_formatter(config, ["train", "valid"], *args, **params)
        out["train_dataset"], out["valid_dataset"] = init_dataset(
            config, *args, **params)
    else:
        init_formatter(config, ["test"], *args, **params)
        out["test_dataset"] = init_test_dataset(config, *args, **params)

    logger.info("Begin to initialize models...")
    model = BertQA(config, gpu_list, *args, **params)
    optimizer = init_optimizer(model, config, *args, **params)
    trained_epoch, global_step = -1, 0

    if gpu_list:
        model = model.cuda()
        # Multi-GPU setup is optional on the model; fall back to single GPU.
        try:
            model.init_multi_gpu(gpu_list, config, *args, **params)
        except Exception:
            logger.warning(
                "No init_multi_gpu implemented in the model, use single gpu instead."
            )

    try:
        state = torch.load(checkpoint)
        model.load_state_dict(state["model"])
        if is_train:
            trained_epoch = state["trained_epoch"]
            # Only restore optimizer state when the optimizer type matches.
            if config.get("train", "optimizer") == state["optimizer_name"]:
                optimizer.load_state_dict(state["optimizer"])
            else:
                logger.warning(
                    "Optimizer changed, do not load parameters of optimizer.")
            if "global_step" in state:
                global_step = state["global_step"]
    except Exception as e:
        message = "Cannot load checkpoint file with error %s" % str(e)
        if mode == "test":
            # Testing without weights is meaningless — fail hard.
            logger.error(message)
            raise e
        # Training can proceed from scratch; just warn.
        logger.warning(message)

    out["model"] = model
    if is_train:
        out["optimizer"] = optimizer
        out["trained_epoch"] = trained_epoch
        out["output_function"] = basic_output_function
        out["global_step"] = global_step

    logger.info("Initialize done.")
    return out
def _build_tri_cls_loader(path, config, vocab, shuffle):
    """Read a trigger-classification corpus file and wrap it in a DataLoader.

    Each sample is assumed to be a 4-tuple (input_ids, input_mask, type_mask,
    tag) — TODO confirm against read_corpus_tri_cls.
    """
    data = read_corpus_tri_cls(path, max_length=config.max_length, vocab=vocab)
    ids = torch.LongTensor([sample[0] for sample in data])
    masks = torch.LongTensor([sample[1] for sample in data])
    types = torch.LongTensor([sample[2] for sample in data])
    tags = torch.LongTensor([sample[3] for sample in data])
    dataset = TensorDataset(ids, masks, types, tags)
    return DataLoader(dataset, shuffle=shuffle, batch_size=config.batch_size)


def train():
    """Train the trigger-classification model.

    Loads the train/dev/test corpora, optionally restores a pre-trained
    checkpoint, runs `config.base_epoch` epochs, and saves the model whenever
    the dev loss improves. Each epoch also reports test-set metrics.
    """
    # load config
    config = Config()
    print('settings:\n', config)

    # load corpus
    print('loading corpus')
    vocab = load_vocab(config.vocab)

    # load train and dev and test dataset (shared construction factored
    # into _build_tri_cls_loader — the original repeated it three times)
    train_loader = _build_tri_cls_loader(config.tri_cls_train_file, config,
                                         vocab, shuffle=True)
    dev_loader = _build_tri_cls_loader(config.tri_cls_dev_file, config,
                                       vocab, shuffle=True)
    test_loader = _build_tri_cls_loader(config.tri_cls_test_file, config,
                                        vocab, shuffle=False)

    # init model
    model = BertQA(config.bert_path, 2)
    if config.load_model:
        # BUG FIX: the original asserted config.load_path, but the model is
        # actually loaded from config.load_tri_cls_path — check the path used.
        assert config.load_tri_cls_path is not None
        model = load_model(model, name=config.load_tri_cls_path)
    if config.use_cuda and torch.cuda.is_available():
        model.cuda()

    # train model
    print('begin training')
    model.train()
    optimizer_cls = getattr(optim, config.optim)
    optimizer = optimizer_cls(model.parameters(),
                              lr=config.lr,
                              weight_decay=config.weight_decay)

    # Best dev loss so far; start at +inf so the first dev result is always
    # saved (the original sentinel 10000 could miss a first loss above it).
    eval_loss = float('inf')
    for epoch in tqdm.tqdm(range(config.base_epoch)):
        bar = tqdm.tqdm(enumerate(train_loader))
        for _, batch in bar:
            model.zero_grad()
            inputs, masks, type_masks, label = batch
            inputs, masks, type_masks, label = Variable(inputs), Variable(
                masks), Variable(type_masks), Variable(label)
            masks = masks.bool()
            if config.use_cuda and torch.cuda.is_available():
                inputs, masks, type_masks, label = inputs.cuda(), masks.cuda(
                ), type_masks.cuda(), label.cuda()
            feats = model(inputs, masks, type_masks)
            loss = model.loss(feats, label)
            loss.backward()
            optimizer.step()
            # idiomatic tqdm call (was tqdm.tqdm.set_description(bar, ...))
            bar.set_description("loss: %f" % loss.item())

        # save best model (by dev loss)
        dev_loss_temp = evaluate(model, dev_loader, epoch, config)
        if dev_loss_temp < eval_loss:
            print('dev loss: ', eval_loss, ' -> ', dev_loss_temp)
            eval_loss = dev_loss_temp
            save_model(model, epoch, name='tri-cls--epoch:{}'.format(epoch))
        evaluate(model, test_loader, epoch, config)
def predict(config=None, model=None, sent=None):
    """
    Input: results of trigger identification saved in config.tri_id_result_file or single sentence
    Output: results of trigger classification
        format: [(event type, trigger begin pos, trigger end pos) * num of triggers] * num of sentences
    """
    # load config
    if not config:
        config = Config()
    # load corpus
    vocab = load_vocab(config.vocab)
    label_dic = load_vocab(config.tri_cls_label_file)
    # load trained model
    if not model:
        model = BertQA(config.bert_path, 2)
        model = load_model(model, name=config.load_tri_cls_path)
        if config.use_cuda:
            model.cuda()
    if (not config.input_file) and sent:
        # single-sentence mode: the caller passed a raw sentence
        test_datas = read_corpus_tr_id(config.tri_id_result_file,
                                       max_length=config.max_length,
                                       label_dic=load_vocab(
                                           config.tri_id_label_file),
                                       vocab=vocab,
                                       content=[sent])
    else:
        # load trigger identification result
        test_datas = read_corpus_tr_id(config.tri_id_result_file,
                                       max_length=config.max_length,
                                       label_dic=load_vocab(
                                           config.tri_id_label_file),
                                       vocab=vocab)
    sent_saves = []
    for i, test_data in tqdm.tqdm(enumerate(test_datas)):
        sent_save = []  # save sentence's triggers
        # assumes test_data[3] is the token list and test_data[4] the trigger
        # dicts with 'begin_pos'/'end_pos' — TODO confirm in read_corpus_tr_id
        sent = test_data[3]
        triggers = test_data[4]
        # predict event type for each trigger: score the trigger against
        # every known event type and keep the best-scoring one
        for trigger in triggers:
            inputs, masks, type_masks = [], [], []
            # data preprocess, [CLS] trigger, [unused0] begin_pos end_pos [unused1], event type [SEP] sentence [SEP]
            for event_type in label_dic.keys():
                tokens_a = []
                for w in sent[trigger['begin_pos']:trigger['end_pos'] + 1]:
                    tokens_a.append(w.lower())
                # positional markers wrapped in BERT's reserved [unused*] tokens
                tokens_a.extend([
                    ',', '[unused0]',
                    str(trigger['begin_pos']),
                    str(trigger['end_pos']), '[unused1]', ','
                ])
                # split the event-type name on ':' / '-' into word pieces
                for w in re.split('([:-])', event_type):
                    tokens_a.append(w.lower())
                tokens_b = sent
                # truncate the sentence so [CLS] + a + [SEP] + b + [SEP] fits
                if len(tokens_a) + len(tokens_b) > config.max_length - 3:
                    tokens_b = tokens_b[0:(config.max_length - 3 -
                                           len(tokens_a))]
                tokens_f = ['[CLS]'] + tokens_a + ['[SEP]'] + tokens_b + ['[SEP]']
                input_ids = [
                    int(vocab[i]) if i in vocab else int(vocab['[UNK]'])
                    for i in tokens_f
                ]
                input_mask = [1] * len(input_ids)
                # segment mask: 0 for [CLS]+tokens_a+[SEP], 1 for the rest
                type_mask = [0] * (2 + len(tokens_a)) + [1] * (
                    config.max_length - 2 - len(tokens_a))
                # pad ids and attention mask up to max_length
                while len(input_ids) < config.max_length:
                    input_ids.append(0)
                    input_mask.append(0)
                inputs.append(input_ids)
                masks.append(input_mask)
                type_masks.append(type_mask)
            inputs, masks, type_masks = Variable(torch.LongTensor(inputs)), \
                Variable(torch.LongTensor(masks)), \
                Variable(torch.LongTensor(type_masks))
            masks = masks.bool()
            if config.use_cuda and torch.cuda.is_available():
                inputs, masks, type_masks = inputs.cuda(), masks.cuda(
                ), type_masks.cuda()
            # predict event type — the per-type batch is run in two slices of
            # config.batch_size each (assumes #event types <= 2*batch_size —
            # TODO confirm)
            with torch.no_grad():
                feats_1 = model(inputs[:config.batch_size],
                                masks[:config.batch_size],
                                type_masks[:config.batch_size])
                feats_2 = model(inputs[config.batch_size:],
                                masks[config.batch_size:],
                                type_masks[config.batch_size:])
                feats = torch.cat([feats_1, feats_2])
            # NOTE(review): softmax is called without an explicit dim, which
            # is deprecated in torch — confirm the intended axis (likely the
            # per-row class axis) before pinning it.
            tag_score = torch.nn.functional.softmax(feats)
            # argmax over dim 0 (the event-type axis), column 1: pick the
            # event type with the highest positive-class score
            pred_label = torch.argmax(tag_score,
                                      dim=0).cpu().numpy().tolist()[1]
            pred_label = list(label_dic.keys())[int(pred_label)]
            # save event type for trigger
            sent_save.append(
                (pred_label, trigger['begin_pos'], trigger['end_pos']))
        sent_saves.append(sent_save)
    # save result: one line per sentence, "type begin end, " per trigger
    with open(config.tri_cls_result_file, 'w', encoding='utf-8') as f:
        for sent_save in sent_saves:
            for trigger in sent_save:
                event_type = trigger[0]
                begin_pos = trigger[1]
                end_pos = trigger[2]
                f.write(event_type + ' ' + str(begin_pos) + ' ' +
                        str(end_pos) + ', ')
            f.write('\n')
    # evaluate against gold triggers when a gold file is configured
    if config.gold_trigger_file:
        evaluate_cls(config.gold_trigger_file)
    return sent_saves