Example #1
    def __init__(self, model_archived_file: str, cuda_device: str = "cpu"):
        """
        model_archived_file: ends with "tar.gz"
        OR
        directly use the model folder patth
        """
        device = torch.device(cuda_device)
        if model_archived_file.endswith("tar.gz"):
            tar = tarfile.open(model_archived_file)
            self.conf = pickle.load(tar.extractfile(
                tar.getnames()[1]))  ## config file
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(
                torch.load(tar.extractfile(tar.getnames()[2]),
                           map_location=device))  ## model file
        else:
            folder_name = model_archived_file
            assert os.path.isdir(folder_name)
            f = open(folder_name + "/config.conf", 'rb')
            self.conf = pickle.load(f)
            f.close()
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(
                torch.load(f"{folder_name}/lstm_crf.m", map_location=device))
        self.conf.device = device
        self.model.to(device)
        self.model.eval()

        print(
            colored(
                f"[Data Info] Tokenizing the instances using '{self.conf.embedder_type}' tokenizer",
                "blue"))
        self.tokenizer = context_models[
            self.conf.embedder_type]["tokenizer"].from_pretrained(
                self.conf.embedder_type)
Example #2
def evaluate_model(config: Config, model: TransformersCRF, data_loader: DataLoader, name: str, insts: List, print_each_type_metric: bool = False):
    ## evaluation
    p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    batch_size = data_loader.batch_size
    with torch.no_grad():
        for batch_id, batch in tqdm(enumerate(data_loader, 0), desc="--evaluating batch", total=len(data_loader)):
            one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) * batch_size]
            batch_max_scores, batch_max_ids = model.decode(words=batch.input_ids.to(config.device),
                    word_seq_lens=batch.word_seq_len.to(config.device),
                    orig_to_tok_index=batch.orig_to_tok_index.to(config.device),
                    input_mask=batch.attention_mask.to(config.device))
            batch_p, batch_predict, batch_total = evaluate_batch_insts(one_batch_insts, batch_max_ids, batch.label_ids, batch.word_seq_len, config.idx2labels)
            p_dict += batch_p
            total_predict_dict += batch_predict
            total_entity_dict += batch_total
    f1Scores = []
    if print_each_type_metric or config.print_detail_f1 or (config.earlystop_atr == "macro"):
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")
            f1Scores.append(fscore_key)
        if len(f1Scores) > 0:
            print(f"[{name} set Total] Macro F1: {sum(f1Scores) / len(f1Scores):.2f}")

    total_p = sum(p_dict.values())
    total_predict = sum(total_predict_dict.values())
    total_entity = sum(total_entity_dict.values())
    precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    print(colored(f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, Micro F1: {fscore:.2f}", 'blue'), flush=True)

    if config.earlystop_atr == "macro" and len(f1Scores) > 0:
        fscore = sum(f1Scores) / len(f1Scores)

    return [precision, recall, fscore]
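
A minimal, hypothetical call site for evaluate_model, assuming a trained model, a config, and a test dataset constructed as in the other examples (all variable names here are illustrative, not from the repository):

# Hypothetical usage sketch; `conf`, `model` and `test_dataset` are assumed to
# exist, e.g. loaded/constructed as in the examples below.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False,
                         collate_fn=test_dataset.collate_fn)
model.eval()
precision, recall, fscore = evaluate_model(config=conf, model=model, data_loader=test_loader,
                                           name="test", insts=test_dataset.insts,
                                           print_each_type_metric=True)
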
Example #3
class TransformersNERPredictor:

    def __init__(self, model_archived_file:str,
                 cuda_device: str = "cpu"):
        """
        model_archived_file: ends with "tar.gz"
        OR
        directly use the model folder patth
        """
        device = torch.device(cuda_device)
        if model_archived_file.endswith("tar.gz"):
            tar = tarfile.open(model_archived_file)
            self.conf = pickle.load(tar.extractfile(tar.getnames()[1])) ## config file
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(torch.load(tar.extractfile(tar.getnames()[2]), map_location=device)) ## model file
        else:
            folder_name = model_archived_file
            assert os.path.isdir(folder_name)
            f = open(folder_name + "/config.conf", 'rb')
            self.conf = pickle.load(f)
            f.close()
            self.model = TransformersCRF(self.conf)
            self.model.load_state_dict(torch.load(f"{folder_name}/lstm_crf.m", map_location=device))
        self.conf.device = device
        self.model.to(device)
        self.model.eval()

        print(colored(f"[Data Info] Tokenizing the instances using '{self.conf.embedder_type}' tokenizer", "blue"))
        self.tokenizer = context_models[self.conf.embedder_type]["tokenizer"].from_pretrained(self.conf.embedder_type)

    def predict(self, sents: List[List[str]], batch_size = -1):
        batch_size = len(sents) if batch_size == -1 else batch_size

        dataset = TransformersNERDataset(file=None, sents=sents, tokenizer=self.tokenizer, label2idx=self.conf.label2idx, is_train=False)
        loader = DataLoader(dataset, batch_size=batch_size, shuffle=False, num_workers=1, collate_fn=dataset.collate_fn)

        all_predictions = []
        for batch_id, batch in tqdm(enumerate(loader, 0), desc="--evaluating batch", total=len(loader)):
            one_batch_insts = dataset.insts[batch_id * batch_size:(batch_id + 1) * batch_size]
            batch_max_scores, batch_max_ids = self.model.decode(words=batch.input_ids.to(self.conf.device),
                    word_seq_lens=batch.word_seq_len.to(self.conf.device),
                    orig_to_tok_index=batch.orig_to_tok_index.to(self.conf.device),
                    input_mask=batch.attention_mask.to(self.conf.device))

            for idx in range(len(batch_max_ids)):
                length = batch.word_seq_len[idx]
                prediction = batch_max_ids[idx][:length].tolist()
                prediction = prediction[::-1]  ## the decoder returns label ids in reverse order; restore the original token order
                prediction = [self.conf.idx2labels[l] for l in prediction]
                one_batch_insts[idx].prediction = prediction
                all_predictions.append(prediction)
        return all_predictions
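
A minimal usage sketch for the predictor; the archive path and the example sentence are placeholders, not files shipped with the repository:

# Hypothetical usage; replace the archive path with your own trained model.
predictor = TransformersNERPredictor("model_files/english_model.tar.gz", cuda_device="cpu")
sents = [["Barack", "Obama", "visited", "Berlin", "."]]
predictions = predictor.predict(sents)
print(predictions)  ## one list of label strings per input sentence
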
Example #4
def main():
    parser = argparse.ArgumentParser(description="LSTM CRF implementation")
    opt = parse_arguments(parser)

    if opt.mode == "train":
        conf = Config(opt)
        set_seed(opt, conf.seed)
        print(colored(f"[Data Info] Tokenizing the instances using '{conf.embedder_type}' tokenizer", "blue"))
        tokenizer = context_models[conf.embedder_type]["tokenizer"].from_pretrained(conf.embedder_type, add_prefix_space=True)
        print(colored(f"[Data Info] Reading dataset from: \n{conf.train_file}\n{conf.dev_file}\n{conf.test_file}", "blue"))
        train_dataset = TransformersNERDataset(conf.train_file, tokenizer, number=conf.train_num, is_train=True)
        conf.label2idx = train_dataset.label2idx
        conf.idx2labels = train_dataset.idx2labels

        dev_dataset = TransformersNERDataset(conf.dev_file, tokenizer, number=conf.dev_num, label2idx=train_dataset.label2idx, is_train=False)
        test_dataset = TransformersNERDataset(conf.test_file, tokenizer, number=conf.test_num, label2idx=train_dataset.label2idx, is_train=False)
        num_workers = 8
        conf.label_size = len(train_dataset.label2idx)
        train_dataloader = DataLoader(train_dataset, batch_size=conf.batch_size, shuffle=True, num_workers=num_workers,
                                      collate_fn=train_dataset.collate_fn)
        dev_dataloader = DataLoader(dev_dataset, batch_size=conf.batch_size, shuffle=False, num_workers=num_workers,
                                      collate_fn=dev_dataset.collate_fn)
        test_dataloader = DataLoader(test_dataset, batch_size=conf.batch_size, shuffle=False, num_workers=num_workers,
                                      collate_fn=test_dataset.collate_fn)

        train_model(conf, conf.num_epochs, train_dataloader, dev_dataloader, test_dataloader)
    else:
        folder_name = f"model_files/{opt.model_folder}"
        device = torch.device(opt.device)
        assert os.path.isdir(folder_name)
        f = open(folder_name + "/config.conf", 'rb')
        saved_config = pickle.load(f)  # reuse `label2idx` from the saved config; the test file and test number come from the command line
        f.close()
        print(colored(f"[Data Info] Tokenizing the instances using '{saved_config.embedder_type}' tokenizer", "blue"))
        tokenizer = context_models[saved_config.embedder_type]["tokenizer"].from_pretrained(saved_config.embedder_type, add_prefix_space=True)
        test_dataset = TransformersNERDataset(opt.test_file, tokenizer, number=opt.test_num,
                                              label2idx=saved_config.label2idx, is_train=False)
        test_dataloader = DataLoader(test_dataset, batch_size=opt.batch_size, shuffle=False, num_workers=1,
                                     collate_fn=test_dataset.collate_fn)
        model = TransformersCRF(saved_config)
        model.load_state_dict(torch.load(f"{folder_name}/lstm_crf.m", map_location=device))
        model.eval()
        evaluate_model(config=saved_config, model=model, data_loader=test_dataloader, name="test mode", insts=test_dataset.insts,
                       print_each_type_metric=False)
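
parse_arguments is referenced above but not shown; the sketch below is an assumption that covers only the flags main() actually reads, with illustrative defaults (the repository's real definition likely adds further training options consumed by Config(opt)):

# Hypothetical sketch of parse_arguments(); argument names mirror the attributes
# read in main() above, and the defaults are assumptions.
def parse_arguments(parser: argparse.ArgumentParser):
    parser.add_argument('--mode', type=str, default="train", choices=["train", "test"])
    parser.add_argument('--device', type=str, default="cpu")
    parser.add_argument('--model_folder', type=str, default="english_model")
    parser.add_argument('--test_file', type=str, default=None)
    parser.add_argument('--test_num', type=int, default=-1)
    parser.add_argument('--batch_size', type=int, default=32)
    return parser.parse_args()
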
Example #5
def train_model(config: Config, epoch: int, train_loader: DataLoader, dev_loader: DataLoader, test_loader: DataLoader):
    ### Data Processing Info
    train_num = len(train_loader)
    print(f"[Data Info] number of training batches: {train_num}")

    print(
        colored(f"[Model Info]: Working with transformers package from huggingface with {config.embedder_type}", 'red'))
    print(colored("[Optimizer Info]: You are using the optimizer from huggingface.", 'red'))
    print(colored("[Optimizer Info]: Change the optimizer in transformers_util.py if you want to make some modifications.", 'red'))
    model = TransformersCRF(config)
    optimizer, scheduler = get_huggingface_optimizer_and_scheduler(config, model, num_training_steps=len(train_loader) * epoch,
                                                                   weight_decay=0.0, eps=1e-8, warmup_step=0)
    print(colored("[Optimizer Info] Modify the optimizer info as you need.", 'red'))
    print(optimizer)

    model.to(config.device)

    best_dev = [-1, 0]
    best_test = [-1, 0]

    model_folder = config.model_folder
    res_folder = "results"
    if os.path.exists("model_files/" + model_folder):
        raise FileExistsError(
            f"The folder model_files/{model_folder} exists. Please either delete it or choose a new folder name "
            f"to avoid overwriting it.")
    model_path = f"model_files/{model_folder}/lstm_crf.m"
    config_path = f"model_files/{model_folder}/config.conf"
    res_path = f"{res_folder}/{model_folder}.results"
    print("[Info] The model will be saved to: %s.tar.gz" % (model_folder))
    os.makedirs(f"model_files/{model_folder}", exist_ok=True)  ## create the model folder; do not raise an error if it already exists
    os.makedirs(res_folder, exist_ok=True)
    no_incre_dev = 0
    print(colored(f"[Train Info] Start training, you have set to stop if performace not increase for {config.max_no_incre} epochs",'red'))
    for i in tqdm(range(1, epoch + 1), desc="Epoch"):
        epoch_loss = 0
        start_time = time.time()
        model.zero_grad()
        model.train()
        for iter, batch in tqdm(enumerate(train_loader, 1), desc="--training batch", total=len(train_loader)):
            optimizer.zero_grad()
            loss = model(words=batch.input_ids.to(config.device), word_seq_lens=batch.word_seq_len.to(config.device),
                    orig_to_tok_index=batch.orig_to_tok_index.to(config.device), input_mask=batch.attention_mask.to(config.device),
                    labels=batch.label_ids.to(config.device))
            epoch_loss += loss.item()
            loss.backward()
            if config.max_grad_norm > 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), config.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            scheduler.step()
            model.zero_grad()
        end_time = time.time()
        print("Epoch %d: %.5f, Time is %.2fs" % (i, epoch_loss, end_time - start_time), flush=True)

        model.eval()
        dev_metrics = evaluate_model(config, model, dev_loader, "dev", dev_loader.dataset.insts)
        test_metrics = evaluate_model(config, model, test_loader, "test", test_loader.dataset.insts)
        if dev_metrics[2] > best_dev[0]:
            print("saving the best model...")
            no_incre_dev = 0
            best_dev[0] = dev_metrics[2]
            best_dev[1] = i
            best_test[0] = test_metrics[2]
            best_test[1] = i
            torch.save(model.state_dict(), model_path)
            # Save the corresponding config as well.
            f = open(config_path, 'wb')
            pickle.dump(config, f)
            f.close()
            write_results(res_path, test_loader.dataset.insts)
        else:
            no_incre_dev += 1
        model.zero_grad()
        if no_incre_dev >= config.max_no_incre:
            print("early stop because there are %d epochs not increasing f1 on dev"%no_incre_dev)
            break

    print("Archiving the best Model...")
    with tarfile.open(f"model_files/{model_folder}.tar.gz", "w:gz") as tar:
        tar.add(f"model_files/{model_folder}", arcname=os.path.basename(model_folder))

    print("Finished archiving the models")

    print("The best dev: %.2f" % (best_dev[0]))
    print("The corresponding test: %.2f" % (best_test[0]))
    print("Final testing.")
    model.load_state_dict(torch.load(model_path))
    model.eval()
    evaluate_model(config, model, test_loader, "test", test_loader.dataset.insts)
    write_results(res_path, test_loader.dataset.insts)
Example #6
def evaluate_model(config: Config,
                   model: TransformersCRF,
                   data_loader: DataLoader,
                   name: str,
                   insts: List,
                   print_each_type_metric: bool = False):
    ## evaluation
    #p_dict, total_predict_dict, total_entity_dict = Counter(), Counter(), Counter()
    f1_metrics = F1Measure()
    batch_size = data_loader.batch_size
    with torch.no_grad():
        with tqdm(enumerate(data_loader, 0),
                  desc="--evaluating batch",
                  total=len(data_loader)) as teval:
            for batch_id, batch in teval:
                one_batch_insts = insts[batch_id * batch_size:(batch_id + 1) *
                                        batch_size]
                batch_max_scores, batch_max_ids = model.decode(
                    words=batch.input_ids.to(config.device),
                    word_seq_lens=batch.word_seq_len.to(config.device),
                    orig_to_tok_index=batch.orig_to_tok_index.to(
                        config.device),
                    input_mask=batch.attention_mask.to(config.device))
                batch_p, batch_predict, batch_total = evaluate_batch_insts(
                    one_batch_insts, batch_max_ids, batch.label_ids,
                    batch.word_seq_len, config.idx2labels)
                #p_dict += batch_p
                #total_predict_dict += batch_predict
                #total_entity_dict += batch_total
                f1_metrics.update(batch_p, batch_predict, batch_total)
                teval.set_postfix(**f1_metrics.get_metric(
                    print_each_type_metric=False)[0])
    final_metrics, final_metrics_key = f1_metrics.get_metric(
        print_each_type_metric)
    '''
    if print_each_type_metric:
        for key in total_entity_dict:
            precision_key, recall_key, fscore_key = get_metric(p_dict[key], total_entity_dict[key], total_predict_dict[key])
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")
    '''
    if final_metrics_key is not None:
        for key in final_metrics_key:
            precision_key = final_metrics_key[key]["Prec."]
            recall_key = final_metrics_key[key]["Recl."]
            fscore_key = final_metrics_key[key]["F1"]
            print(f"[{key}] Prec.: {precision_key:.2f}, Rec.: {recall_key:.2f}, F1: {fscore_key:.2f}")

    #total_p = sum(list(p_dict.values()))
    #total_predict = sum(list(total_predict_dict.values()))
    #total_entity = sum(list(total_entity_dict.values()))
    #precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
    precision, recall, fscore = final_metrics["Prec"], final_metrics["Recl"], final_metrics["F1"]
    print(colored(f"[{name} set Total] Prec.: {precision:.2f}, Rec.: {recall:.2f}, F1: {fscore:.2f}", 'blue'),
          flush=True)

    return [precision, recall, fscore]
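
F1Measure is used above but not shown in these examples; the sketch below is an assumption reconstructed purely from how it is called (Counter-based accumulation plus the get_metric helper from Example #2), not the repository's actual implementation:

# Hypothetical F1Measure accumulator, inferred from the calls above.
from collections import Counter

class F1Measure:
    def __init__(self):
        self.p_dict, self.predict_dict, self.entity_dict = Counter(), Counter(), Counter()

    def update(self, batch_p: Counter, batch_predict: Counter, batch_total: Counter):
        ## accumulate per-entity-type counts of correct, predicted and gold spans
        self.p_dict += batch_p
        self.predict_dict += batch_predict
        self.entity_dict += batch_total

    def get_metric(self, print_each_type_metric: bool = False):
        ## overall (micro) precision/recall/F1; get_metric is the helper used in Example #2
        total_p = sum(self.p_dict.values())
        total_predict = sum(self.predict_dict.values())
        total_entity = sum(self.entity_dict.values())
        precision, recall, fscore = get_metric(total_p, total_entity, total_predict)
        overall = {"Prec": precision, "Recl": recall, "F1": fscore}
        per_type = None
        if print_each_type_metric:
            per_type = {}
            for key in self.entity_dict:
                p, r, f = get_metric(self.p_dict[key], self.entity_dict[key], self.predict_dict[key])
                per_type[key] = {"Prec.": p, "Recl.": r, "F1": f}
        return overall, per_type
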