Esempio n. 1
0
class NERPredictor:
    """Tag sentences with a fine-tuned BERT token-classification checkpoint.

    The constructor restores tokenizer, configuration and weights from
    ``model_dir``; :meth:`classify_text` then runs inference on raw sentences.
    """

    def __init__(self,
                 model_dir,
                 batch_size,
                 epoch,
                 max_seq_length=128,
                 local_rank=-1,
                 no_cuda=False):
        """Restore the model saved for ``epoch`` from ``model_dir``.

        Args:
            model_dir: Directory holding ``model_config.json``, the BERT
                config file (``CONFIG_NAME``), the tokenizer files and the
                ``pytorch_model_ep<epoch>.bin`` checkpoints.
            batch_size: Batch size handed to the data loader at inference.
            epoch: Epoch number selecting which checkpoint file is loaded.
            max_seq_length: Maximum sequence length used when converting
                examples to features.
            local_rank: Forwarded unchanged to ``NerProcessor.make_data_loader``.
            no_cuda: Forwarded to ``get_device``; when true, checkpoint
                storages are also kept on the CPU while loading.
        """
        self._batch_size = batch_size
        self._local_rank = local_rank
        self._max_seq_length = max_seq_length

        self._device, self._n_gpu = get_device(no_cuda=no_cuda)

        # Use a context manager so the config file handle is closed promptly.
        config_path = os.path.join(model_dir, "model_config.json")
        with open(config_path, "r") as config_file:
            self._model_config = json.load(config_file)

        # label -> id as stored in the config, and the inverse id -> label.
        self._label_to_id = self._model_config['label_map']
        self._label_map = {v: k for k, v in self._label_to_id.items()}

        self._bert_tokenizer = BertTokenizer.from_pretrained(
            model_dir, do_lower_case=self._model_config['do_lower'])

        output_config_file = os.path.join(model_dir, CONFIG_NAME)
        output_model_file = os.path.join(
            model_dir, "pytorch_model_ep{}.bin".format(epoch))

        config = BertConfig(output_config_file)
        self._model = BertForTokenClassification(
            config, num_labels=len(self._label_map))

        # With no_cuda every storage stays on the CPU; otherwise ``None``
        # lets torch.load apply its default device placement.
        map_location = (lambda storage, loc: storage) if no_cuda else None
        self._model.load_state_dict(
            torch.load(output_model_file, map_location=map_location))
        self._model.to(self._device)
        self._model.eval()

    def classify_text(self, sentences):
        """Predict labels for ``sentences``.

        Args:
            sentences: Input accepted by ``NerProcessor.create_examples``.

        Returns:
            A list with one ``(tokens, predictions)`` tuple per distinct
            consecutive feature guid. ``tokens`` are the feature tokens
            without their first and last element (presumably the [CLS] and
            [SEP] markers - confirm against convert_examples_to_features) and
            ``predictions`` is the concatenation of what ``model_predict``
            returned for the features sharing that guid.
        """
        examples = NerProcessor.create_examples(sentences, 'test')

        features = [
            fe for ex in examples for fe in convert_examples_to_features(
                ex, self._label_to_id, self._max_seq_length,
                self._bert_tokenizer)
        ]

        data_loader = NerProcessor.make_data_loader(None,
                                                    self._batch_size,
                                                    self._local_rank,
                                                    self._label_to_id,
                                                    self._max_seq_length,
                                                    self._bert_tokenizer,
                                                    features=features,
                                                    sequential=True)

        prediction_tmp = model_predict(data_loader, self._device,
                                       self._label_map, self._model)

        assert len(prediction_tmp) == len(features)

        prediction = []
        prev_guid = None
        for fe, pr in zip(features, prediction_tmp):
            # Longer sentences might have been processed in several steps,
            # therefore we have to glue them together on the basis of the guid.
            # The ``not prediction`` guard keeps the very first feature from
            # indexing an empty list should its guid happen to be None.
            if not prediction or prev_guid != fe.guid:
                prediction.append((fe.tokens[1:-1], pr))
            else:
                prediction[-1] = (prediction[-1][0] + fe.tokens[1:-1],
                                  prediction[-1][1] + pr)

            prev_guid = fe.guid

        # Best-effort diagnostic only: report a length mismatch but still
        # return the result. An explicit check (rather than assert/except)
        # keeps the diagnostic alive when Python runs with -O.
        if len(sentences) != len(prediction):
            print('Sentences:\n')
            print(sentences)
            print('\n\nPrediciton:\n')
            print(prediction)

        return prediction
def _ner_features_to_dataset(features):
    """Pack input ids, mask, segment ids and label ids into a TensorDataset."""
    return TensorDataset(
        torch.tensor([f.input_ids for f in features], dtype=torch.long),
        torch.tensor([f.input_mask for f in features], dtype=torch.long),
        torch.tensor([f.segment_ids for f in features], dtype=torch.long),
        torch.tensor([f.label_id for f in features], dtype=torch.long))


def _decode_eval_batch(input_mask, label_ids, pred_ids, label_map):
    """Convert one batch of id arrays into gold/predicted label sequences."""
    gold_batch, pred_batch = [], []
    for i, mask in enumerate(input_mask):
        gold, pred = [], []
        for j, m in enumerate(mask):
            if j == 0:
                # Position 0 is always skipped.
                continue
            if m:
                # Positions whose gold label is "X" are not scored.
                if label_map[label_ids[i][j]] != "X":
                    gold.append(label_map[label_ids[i][j]])
                    # NOTE(review): label_map has no entry for id 0, so a
                    # predicted id of 0 raises KeyError here - confirm the
                    # intended fallback.
                    pred.append(label_map[pred_ids[i][j]])
            else:
                # First padded position: drop the last collected label
                # (presumably the [SEP] marker - confirm against
                # convert_examples_to_features). Guarded so an all-padding
                # row does not raise IndexError.
                if gold:
                    gold.pop()
                    pred.pop()
                break
        # NOTE(review): a row without any padding never reaches the branch
        # above, so its final label is kept - verify this is intended.
        gold_batch.append(gold)
        pred_batch.append(pred)
    return gold_batch, pred_batch


def train_and_evaluate(OUTPUT_DIR, do_train=True, do_eval=True):
    """Train and evaluate a BERT NER model.

    Relies on the module-level names ``label_list``, ``num_labels`` and
    ``MAX_SEQ_LENGTH``.

    Args:
        OUTPUT_DIR: Directory receiving the weights (``WEIGHTS_NAME``), the
            BERT config (``CONFIG_NAME``), ``model_config.json`` and
            ``eval_results.txt``. Created if it does not exist.
        do_train: Fine-tune ``bert-base-uncased`` on ``AGE/train.txt`` and
            save the result. When false, the model is instead restored from
            ``OUTPUT_DIR``.
        do_eval: Evaluate on ``AGE/valid.txt`` and write the classification
            report to ``eval_results.txt``.

    Raises:
        ValueError: If ``do_train`` is set and ``OUTPUT_DIR`` already exists
            and is not empty.
    """
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_TRAIN_EPOCHS = 5.0

    # Fraction of the total training steps passed to BertAdam as warm-up.
    WARMUP_PROPORTION = 0.1

    if os.path.exists(OUTPUT_DIR) and os.listdir(OUTPUT_DIR) and do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                OUTPUT_DIR))
    if not os.path.exists(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased",
                                              do_lower_case=True)

    if do_train:
        train_examples, num_train_examples = create_datasets("AGE/train.txt")

        num_train_steps = int(
            math.ceil(num_train_examples / BATCH_SIZE * NUM_TRAIN_EPOCHS))

        model = BertForTokenClassification.from_pretrained(
            "bert-base-uncased", num_labels=num_labels)
        model.to(device)

        # Parameters whose name matches an entry of no_decay get no weight
        # decay; all others use 0.01.
        param_optimizer = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [{
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.01
        }, {
            'params': [
                p for n, p in param_optimizer
                if any(nd in n for nd in no_decay)
            ],
            'weight_decay': 0.0
        }]

        optimizer = BertAdam(optimizer_grouped_parameters,
                             lr=LEARNING_RATE,
                             warmup=WARMUP_PROPORTION,
                             t_total=num_train_steps)

        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      MAX_SEQ_LENGTH,
                                                      tokenizer)

        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", num_train_examples)
        logger.info("  Batch size = %d", BATCH_SIZE)
        logger.info("  Num steps = %d", num_train_steps)

        train_data = _ner_features_to_dataset(train_features)
        train_dataloader = DataLoader(train_data,
                                      sampler=RandomSampler(train_data),
                                      batch_size=BATCH_SIZE)

        model.train()
        for _ in trange(int(NUM_TRAIN_EPOCHS), desc="Epoch"):
            tr_loss = 0
            for batch in tqdm(train_dataloader, desc="Iteration"):
                input_ids, input_mask, segment_ids, label_ids = tuple(
                    t.to(device) for t in batch)
                # Called with labels, the model returns the loss.
                loss = model(input_ids, segment_ids, input_mask, label_ids)
                loss.backward()

                tr_loss += loss.item()
                optimizer.step()
                optimizer.zero_grad()
            # Summed training loss of the epoch that just finished.
            print(tr_loss)

        # Save a trained model and the associated configuration.
        # Only save the model itself, not a wrapper exposing ``.module``.
        model_to_save = model.module if hasattr(model, 'module') else model
        output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
        torch.save(model_to_save.state_dict(), output_model_file)
        output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
        with open(output_config_file, 'w') as f:
            f.write(model_to_save.config.to_json_string())

        # Label ids start at 1, hence ``num_labels`` is len(label_list) + 1.
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": "bert-base-uncased",
            "do_lower": True,
            "max_seq_length": MAX_SEQ_LENGTH,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        # Close the handle explicitly so the JSON is flushed to disk.
        model_config_path = os.path.join(OUTPUT_DIR, "model_config.json")
        with open(model_config_path, "w") as f:
            json.dump(model_config, f)

    else:
        output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
        output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
        config = BertConfig(output_config_file)
        model = BertForTokenClassification(config, num_labels=num_labels)
        # map_location lets a checkpoint saved on GPU load on a CPU-only host.
        model.load_state_dict(
            torch.load(output_model_file, map_location=device))

    model.to(device)

    if do_eval:

        EVAL_BATCH_SIZE = 32

        eval_examples, num_eval_examples = create_datasets("AGE/valid.txt")
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     MAX_SEQ_LENGTH, tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", num_eval_examples)
        logger.info("  Batch size = %d", EVAL_BATCH_SIZE)

        eval_data = _ner_features_to_dataset(eval_features)
        # Run prediction for the full data in its original order.
        eval_dataloader = DataLoader(eval_data,
                                     sampler=SequentialSampler(eval_data),
                                     batch_size=EVAL_BATCH_SIZE)
        model.eval()

        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            label_ids = label_ids.to(device)

            with torch.no_grad():
                logits = model(input_ids, segment_ids, input_mask)

            pred_ids = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            gold_batch, pred_batch = _decode_eval_batch(
                input_mask.to('cpu').numpy(),
                label_ids.to('cpu').numpy(),
                pred_ids.detach().cpu().numpy(),
                label_map)
            y_true.extend(gold_batch)
            y_pred.extend(pred_batch)

        report = classification_report(y_true, y_pred)
        output_eval_file = os.path.join(OUTPUT_DIR, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
def predict(OUTPUT_DIR, in_sentences):
    """Tag sentences with a previously trained BERT model.

    Relies on the module-level names ``label_list`` and ``MAX_SEQ_LENGTH``.

    Args:
        OUTPUT_DIR: Directory containing ``model_config.json``, the BERT
            config (``CONFIG_NAME``) and the weights (``WEIGHTS_NAME``).
        in_sentences: List of sentences on which tagging has to be performed.

    Returns:
        A list of ``(sentence, predicted_labels)`` tuples in input order.
        Positions predicted as "X" are omitted from ``predicted_labels``.
    """
    PRED_BATCH_SIZE = 64

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Use a context manager so the config file handle is closed promptly.
    model_config_path = os.path.join(OUTPUT_DIR, "model_config.json")
    with open(model_config_path) as config_file:
        model_config = json.load(config_file)

    output_config_file = os.path.join(OUTPUT_DIR, CONFIG_NAME)
    output_model_file = os.path.join(OUTPUT_DIR, WEIGHTS_NAME)
    config = BertConfig(output_config_file)
    model = BertForTokenClassification(config,
                                       num_labels=model_config["num_labels"])
    # map_location lets a checkpoint saved on GPU load on a CPU-only host.
    model.load_state_dict(torch.load(output_model_file, map_location=device))
    model.to(device)
    tokenizer = BertTokenizer.from_pretrained(
        model_config["bert_model"], do_lower_case=model_config["do_lower"])

    # Every whitespace-separated word gets the placeholder label "O".
    in_examples = [
        InputExample(guid="",
                     text_a=x,
                     text_b=None,
                     label=["O"] * len(x.split(" "))) for x in in_sentences
    ]
    in_features = convert_examples_to_features(in_examples, label_list,
                                               MAX_SEQ_LENGTH, tokenizer)

    all_input_ids = torch.tensor([f.input_ids for f in in_features],
                                 dtype=torch.long)
    all_input_mask = torch.tensor([f.input_mask for f in in_features],
                                  dtype=torch.long)
    all_segment_ids = torch.tensor([f.segment_ids for f in in_features],
                                   dtype=torch.long)

    pred_data = TensorDataset(all_input_ids, all_input_mask, all_segment_ids)
    # Run prediction for the full data, keeping the input order.
    pred_dataloader = DataLoader(pred_data,
                                 sampler=SequentialSampler(pred_data),
                                 batch_size=PRED_BATCH_SIZE,
                                 drop_last=False)
    model.eval()

    preds = []

    # Keys are strings because the map was round-tripped through JSON.
    label_map = model_config["label_map"]

    for input_ids, input_mask, segment_ids in tqdm(pred_dataloader,
                                                   desc="Predicting"):
        input_ids = input_ids.to(device)
        input_mask = input_mask.to(device)
        segment_ids = segment_ids.to(device)

        with torch.no_grad():
            logits = model(input_ids, segment_ids, input_mask)

        pred_ids = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
        pred_ids = pred_ids.detach().cpu().numpy()
        # Decode on the host: iterating a device tensor element by element
        # would force a transfer for every position.
        mask_rows = input_mask.to('cpu').numpy()
        for i, mask in enumerate(mask_rows):
            sentence_labels = []
            for j, m in enumerate(mask):
                if j == 0:
                    # Position 0 is always skipped.
                    continue
                if m:
                    label = label_map[str(pred_ids[i][j])]
                    if label != "X":
                        sentence_labels.append(label)
                else:
                    # First padded position: drop the last collected label
                    # (presumably the one predicted for [SEP] - confirm).
                    # Guarded so an empty list does not raise IndexError.
                    if sentence_labels:
                        sentence_labels.pop()
                    break
            preds.append(sentence_labels)
    return list(zip(in_sentences, preds))
Esempio n. 4
0
def model_eval(batch_size,
               label_map,
               processor,
               device,
               num_train_epochs=1,
               output_dir=None,
               model=None,
               local_rank=-1,
               no_cuda=False,
               dry_run=False):
    """Evaluate one model, or one checkpoint per epoch, on the dev data.

    Args:
        batch_size: Batch size passed to ``processor.get_dev_examples``.
        label_map: Label mapping; its length sets ``num_labels`` when a
            checkpoint is restored and it is forwarded to
            ``model_predict_compare``.
        processor: Object providing ``get_dev_examples(batch_size,
            local_rank)`` and, if ``output_dir`` is given,
            ``get_evaluation_file()``.
        device: Device the model is moved to and evaluated on.
        num_train_epochs: Number of epochs (1-based) to iterate over.
        output_dir: If given, ``pytorch_model_ep<N>.bin`` is loaded from it
            for every epoch and the accumulated results are pickled there
            after each epoch. Iteration stops at the first missing file.
        model: Model to evaluate when ``output_dir`` is ``None``.
        local_rank: Passed to ``processor.get_dev_examples``.
        no_cuda: When true, checkpoint storages are kept on the CPU while
            loading.
        dry_run: Stop after the first epoch; also forwarded to
            ``model_predict_compare``.

    Returns:
        A DataFrame indexed by ``(index, epoch)`` holding the ``evals`` and
        ``stats`` values for ALL, LOC, PER and ORG. It is empty when no epoch
        could be evaluated.

    Raises:
        ValueError: If neither ``output_dir`` nor ``model`` is supplied.
    """
    # Slot names expected in the result of ``conll_eval``.
    slot_names = ('LOC', 'PER', 'ORG')

    output_eval_file = None
    if output_dir is not None:
        output_eval_file = os.path.join(output_dir,
                                        processor.get_evaluation_file())
        logger.info('Write evaluation results to: {}'.format(output_eval_file))

    dataloader = processor.get_dev_examples(batch_size, local_rank)

    logger.info("***** Running evaluation *****")
    logger.info("  Num examples = %d", len(dataloader))
    logger.info("  Batch size = %d", batch_size)

    results = []

    output_config_file = None
    if output_dir is not None:
        output_config_file = os.path.join(output_dir, CONFIG_NAME)

    # With no_cuda every storage stays on the CPU; otherwise ``None`` lets
    # torch.load apply its default device placement.
    map_location = (lambda storage, loc: storage) if no_cuda else None

    for ep in trange(1, int(num_train_epochs) + 1, desc="Epoch"):

        if dry_run and ep > 1:
            logger.info("Dry run. Stop.")
            break

        if output_config_file is not None:
            # Load a trained model and config that you have fine-tuned.
            output_model_file = os.path.join(
                output_dir, "pytorch_model_ep{}.bin".format(ep))

            if not os.path.exists(output_model_file):
                logger.info(
                    "Stopping at epoch {} since model file is missing.".format(
                        ep))
                break

            config = BertConfig(output_config_file)
            model = BertForTokenClassification(config,
                                               num_labels=len(label_map))
            model.load_state_dict(
                torch.load(output_model_file, map_location=map_location))
            model.to(device)

        if model is None:
            raise ValueError('Model required for evaluation.')

        model.eval()

        y_pred, y_true = model_predict_compare(dataloader, device, label_map,
                                               model, dry_run)

        # One line per token: two dummy columns, the gold tag, the prediction.
        lines = [
            'empty ' + 'XXX ' + v + ' ' + p for yt, yp in zip(y_true, y_pred)
            for v, p in zip(yt, yp)
        ]

        res = conll_eval(lines)

        # (column name, result section) pairs: overall first, then each slot.
        sections = [('ALL', res['overall'])]
        sections.extend((slot, res['slots'][slot]) for slot in slot_names)

        evals = pd.concat([
            pd.DataFrame.from_dict(section['evals'],
                                   orient='index',
                                   columns=[name])
            for name, section in sections
        ],
                          axis=1).T

        stats = pd.concat([
            pd.DataFrame.from_dict(section['stats'],
                                   orient='index',
                                   columns=[name])
            for name, section in sections
        ],
                          axis=1,
                          sort=True).T

        evals['epoch'] = ep
        stats['epoch'] = ep

        results.append(
            pd.concat([
                evals.reset_index().set_index(['index', 'epoch']),
                stats.reset_index().set_index(['index', 'epoch'])
            ],
                      axis=1))

        # Persist after every epoch so partial results survive interruption.
        if output_eval_file is not None:
            pd.concat(results).to_pickle(output_eval_file)

    if results:
        results = pd.concat(results)
    else:
        # pd.concat raises ValueError on an empty list; this happens when the
        # very first checkpoint is missing. Return an empty frame instead.
        logger.warning("No epoch was evaluated; returning empty results.")
        results = pd.DataFrame()
    print(results)

    return results