def evaluate_model(config: Config, model: TransformersCRF, data_loader: DataLoader, name: str,
                   insts: List, print_each_type_metric: bool = False):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_size = data_loader.batch_size
    with torch.no_grad():
        for batch_id, batch in tqdm(enumerate(data_loader, 0), desc="--evaluating batch", total=len(data_loader)):
            one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
            batch_max_scores, batch_max_ids = model.decode(words=batch.input_ids.to(config.device),
                                                           word_seq_lens=batch.word_seq_len.to(config.device),
                                                           orig_to_tok_index=batch.orig_to_tok_index.to(config.device),
                                                           input_mask=batch.attention_mask.to(config.device))
            batch_p, batch_predict, batch_total = evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                                                       batch.label_ids, batch.word_seq_len,
                                                                       config.idx2labels)
            # Accumulate per-type counts of correct, predicted, and gold entities.
            p_dict += batch_p
            total_predict_dict += batch_predict
            total_entity_dict += batch_total
    f1Scores = []
    if print_each_type_metric or config.print_detail_f1 or (config.earlystop_atr == "macro"):
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")
            f1Scores.append(fscore_key)
        if len(f1Scores) > 0:
            print(f"[{name} set Total] Macro F1: {sum(f1Scores) / len(f1Scores):.2f}")

    total_p = sum(p_dict.values())
    total_predict = sum(total_predict_dict.values())
    total_entity = sum(total_entity_dict.values())
    precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    print(colored(f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, Micro F1: {fscore:.2f}", 'blue'), flush=True)
    if config.earlystop_atr == "macro" and len(f1Scores) > 0:
        fscore = sum(f1Scores) / len(f1Scores)
    return [precision, recall, fscore]
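# `get_metric` is defined elsewhere in the repo. As a hedged illustration only (an assumption
# about its behavior, not the repo's actual implementation), a standard micro-averaged
# precision/recall/F1 over true-positive, gold-entity, and predicted-entity counts, scaled to
# percentages as the printed values above suggest, would look like this:
def _micro_prf_sketch(num_correct: int, num_gold: int, num_predicted: int):
    precision = num_correct / num_predicted * 100 if num_predicted > 0 else 0.0
    recall = num_correct / num_gold * 100 if num_gold > 0 else 0.0
    fscore = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0.0
    return precision, recall, fscore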
class TransformersNERPredictor:

    def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):
        """
        model_archived_file: ends with "tar.gz" OR directly use the model folder path
        """
        device = torch.device(cuda_device)
        if model_archived_file.endswith("tar.gz"):
            # Load the config and the model weights directly from the archive.
            tar = tarfile.open(model_archived_file)
            self.conf = pickle.load(tar.extractfile(tar.getnames()[1]))  ## config file
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(torch.load(tar.extractfile(tar.getnames()[2]), map_location=device))  ## model file
        else:
            # Otherwise treat the argument as an (unpacked) model folder.
            folder_name = model_archived_file
            assert os.path.isdir(folder_name)
            with open(folder_name + "/config.conf", 'rb') as f:
                self.conf = pickle.load(f)
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(torch.load(f"{folder_name}/lstm_crf.m", map_location=device))
        self.conf.device = device
        self.model.to(device)
        self.model.eval()

        print(colored(f"[Data Info] Tokenizing the instances using '{self.conf.embedder_type}' tokenizer", "blue"))
        self.tokenizer = context_models[self.conf.embedder_type]["tokenizer"].from_pretrained(self.conf.embedder_type)

    def predict(self, sents: List[List[str]], batch_size=-1):
        batch_size = len(sents) if batch_size == -1 else batch_size
        dataset = TransformersNERDataset(file=None, sents=sents, tokenizer=self.tokenizer,
                                         label2idx=self.conf.label2idx, is_train=False)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=1,
                            collate_fn=dataset.collate_fn)
        all_predictions = []
        for batch_id, batch in tqdm(enumerate(loader, 0), desc="--evaluating batch", total=len(loader)):
            one_batch_insts = dataset.insts[batch_id * batch_size:(batch_id + 1) * batch_size]
            batch_max_scores, batch_max_ids = self.model.decode(words=batch.input_ids.to(self.conf.device),
                                                                word_seq_lens=batch.word_seq_len.to(self.conf.device),
                                                                orig_to_tok_index=batch.orig_to_tok_index.to(self.conf.device),
                                                                input_mask=batch.attention_mask.to(self.conf.device))
            for idx in range(len(batch_max_ids)):
                length = batch.word_seq_len[idx]
                prediction = batch_max_ids[idx][:length].tolist()
                prediction = prediction[::-1]  # the decoded id sequence is reversed; flip it back to word order
                prediction = [self.conf.idx2labels[l] for l in prediction]
                one_batch_insts[idx].prediction = prediction
                all_predictions.append(prediction)
        return all_predictions
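# Minimal usage sketch for the predictor above. The archive path and the example sentence are
# hypothetical placeholders; only the constructor and predict() signatures defined in this class
# are assumed. Wrapped in a helper so nothing runs at import time.
def _demo_predict():
    predictor = TransformersNERPredictor("model_files/english_model.tar.gz", cuda_device="cpu")
    sents = [["John", "lives", "in", "New", "York", "."]]
    predictions = predictor.predict(sents)  # one list of label strings per input sentence
    print(predictions)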
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)
    if opt.mode == "train":
        conf = Config(opt)
        set_seed(opt, conf.seed)
        print(colored(f"[Data Info] Tokenizing the instances using '{conf.embedder_type}' tokenizer", "blue"))
        tokenizer = context_models[conf.embedder_type]["tokenizer"].from_pretrained(conf.embedder_type, add_prefix_space=True)
        print(colored(f"[Data Info] Reading dataset from: \n{conf.train_file}\n{conf.dev_file}\n{conf.test_file}", "blue"))
        train_dataset = TransformersNERDataset(conf.train_file, tokenizer, number=conf.train_num, is_train=True)
        conf.label2idx = train_dataset.label2idx
        conf.idx2labels = train_dataset.idx2labels

        dev_dataset = TransformersNERDataset(conf.dev_file, tokenizer, number=conf.dev_num,
                                             label2idx=train_dataset.label2idx, is_train=False)
        test_dataset = TransformersNERDataset(conf.test_file, tokenizer, number=conf.test_num,
                                              label2idx=train_dataset.label2idx, is_train=False)
        num_workers = 8
        conf.label_size = len(train_dataset.label2idx)
        train_dataloader = DataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True,
                                      num_workers=num_workers, collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=conf.batch_size, shuffle=False,
                                    num_workers=num_workers, collate_fn=dev_dataset.collate_fn)
        test_dataloader = DataLoader(test_dataset, batch_size=conf.batch_size, shuffle=False,
                                     num_workers=num_workers, collate_fn=test_dataset.collate_fn)

        train_model(conf, conf.num_epochs, train_dataloader, dev_dataloader, test_dataloader)
    else:
        folder_name = f"model_files/{opt.model_folder}"
        device = torch.device(opt.device)
        assert os.path.isdir(folder_name)
        with open(folder_name + "/config.conf", 'rb') as f:
            # Reuse `label2idx` from the saved config, but take the test file and test number from the CLI options.
            saved_config = pickle.load(f)
        print(colored(f"[Data Info] Tokenizing the instances using '{saved_config.embedder_type}' tokenizer", "blue"))
        tokenizer = context_models[saved_config.embedder_type]["tokenizer"].from_pretrained(saved_config.embedder_type, add_prefix_space=True)
        test_dataset = TransformersNERDataset(opt.test_file, tokenizer, number=opt.test_num,
                                              label2idx=saved_config.label2idx, is_train=False)
        test_dataloader = DataLoader(test_dataset, batch_size=opt.batch_size, shuffle=False,
                                     num_workers=1, collate_fn=test_dataset.collate_fn)
        model = TransformersCRF(saved_config)
        model.load_state_dict(torch.load(f"{folder_name}/lstm_crf.m", map_location=device))
        model.eval()
        evaluate_model(config=saved_config, model=model, data_loader=test_dataloader, name="test mode",
                       insts=test_dataset.insts, print_each_type_metric=False)
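# Illustrative command lines (hypothetical: parse_arguments() is not shown here, so the script
# name and flag spellings below are assumptions inferred from the `opt` attributes used above):
#
#   python trainer.py --mode train --model_folder english_model --device cuda:0
#   python trainer.py --mode test --model_folder english_model --test_file data/test.txt --device cpu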
def train_model(config: Config, epoch: int, train_loader: DataLoader, dev_loader: DataLoader, test_loader: DataLoader):
    ### Data Processing Info
    train_num = len(train_loader)
    print(f"[Data Info] number of training instances: {train_num}")

    print(colored(f"[Model Info]: Working with transformers package from huggingface with {config.embedder_type}", 'red'))
    print(colored("[Optimizer Info]: You should be aware that you are using the optimizer from huggingface.", 'red'))
    print(colored("[Optimizer Info]: Change the optimizer in transformers_util.py if you want to make some modifications.", 'red'))
    model = TransformersCRF(config)
    optimizer, scheduler = get_huggingface_optimizer_and_scheduler(config, model,
                                                                   num_training_steps=len(train_loader) * epoch,
                                                                   weight_decay=0.0, eps=1e-8, warmup_step=0)
    print(colored("[Optimizer Info] Modify the optimizer info as you need.", 'red'))
    print(optimizer)
    model.to(config.device)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(f"The folder model_files/{model_folder} exists. "
                              f"Please either delete it or create a new one to avoid overwriting it.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: model_files/%s.tar.gz" % (model_folder))
    os.makedirs(f"model_files/{model_folder}", exist_ok=True)  ## create the model folder; do not raise an error if it exists
    os.makedirs(res_folder, exist_ok=True)

    no_incre_dev = 0
    print(colored(f"[Train Info] Start training; training stops early if dev performance does not improve for {config.max_no_incre} epochs", 'red'))
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        model.train()
        for iter, batch in tqdm(enumerate(train_loader, 1), desc="--training batch", total=len(train_loader)):
            optimizer.zero_grad()
            loss = model(words=batch.input_ids.to(config.device),
                         word_seq_lens=batch.word_seq_len.to(config.device),
                         orig_to_tok_index=batch.orig_to_tok_index.to(config.device),
                         input_mask=batch.attention_mask.to(config.device),
                         labels=batch.label_ids.to(config.device))
            epoch_loss += loss.item()
            loss.backward()
            if config.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            model.zero_grad()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" % (i, epoch_loss, end_time - start_time), flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_loader, "dev", dev_loader.dataset.insts)
        test_metrics = evaluate_model(config, model, test_loader, "test", test_loader.dataset.insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.
            with open(config_path, 'wb') as f:
                pickle.dump(config, f)
            write_results(res_path, test_loader.dataset.insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print("early stop because there are %d epochs not increasing f1 on dev" % no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}.tar.gz", "w:gz") as tar:
        tar.add(f"model_files/{model_folder}", arcname=os.path.basename(model_folder))
    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_loader, "test", test_loader.dataset.insts)
    write_results(res_path, test_loader.dataset.insts)
def evaluate_model(config: Config, model: TransformersCRF, data_loader: DataLoader, name: str,
                   insts: List, print_each_type_metric: bool = False):
    ## evaluation
    f1_metrics = F1Measure()
    batch_size = data_loader.batch_size
    with torch.no_grad():
        with tqdm(enumerate(data_loader, 0), desc="--evaluating batch", total=len(data_loader)) as teval:
            for batch_id, batch in teval:
                one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
                batch_max_scores, batch_max_ids = model.decode(words=batch.input_ids.to(config.device),
                                                               word_seq_lens=batch.word_seq_len.to(config.device),
                                                               orig_to_tok_index=batch.orig_to_tok_index.to(config.device),
                                                               input_mask=batch.attention_mask.to(config.device))
                batch_p, batch_predict, batch_total = evaluate_batch_insts(one_batch_insts, batch_max_ids,
                                                                           batch.label_ids, batch.word_seq_len,
                                                                           config.idx2labels)
                # Accumulate the per-batch counts and show the running metrics in the progress bar.
                f1_metrics.update(batch_p, batch_predict, batch_total)
                teval.set_postfix(**f1_metrics.get_metric(print_each_type_metric=False)[0])

    final_metrics, final_metrics_key = f1_metrics.get_metric(print_each_type_metric)
    if final_metrics_key is not None:
        for key in final_metrics_key:
            precision_key = final_metrics_key[key]["Prec."]
            recall_key = final_metrics_key[key]["Recl."]
            fscore_key = final_metrics_key[key]["F1"]
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")

    precision, recall, fscore = final_metrics["Prec"], final_metrics["Recl"], final_metrics["F1"]
    print(colored(f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}", 'blue'), flush=True)
    return [precision, recall, fscore]