def test_named_entity_recognition():
    """Smoke-test the NER workflow end to end on toy data.

    Builds two small data frames, constructs a CPU-only ``bert-base-cased``
    ``NERModel`` that does not save anything, then trains, evaluates and
    predicts with it. The function makes no assertions; it only checks that
    none of the calls raise.
    """
    frame_columns = ["sentence_id", "words", "labels"]

    # Toy training rows: [sentence_id, word, label].
    train_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "started", "O"],
        [1, "with", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "can", "O"],
        [1, "now", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    train_df = pd.DataFrame(train_rows, columns=frame_columns)

    # Toy evaluation rows, same layout as the training rows.
    eval_rows = [
        [0, "Simple", "B-MISC"],
        [0, "Transformers", "I-MISC"],
        [0, "was", "O"],
        [1, "built", "O"],
        [1, "for", "O"],
        [0, "text", "O"],
        [0, "classification", "B-MISC"],
        [1, "Simple", "B-MISC"],
        [1, "Transformers", "I-MISC"],
        [1, "then", "O"],
        [1, "expanded", "O"],
        [1, "to", "O"],
        [1, "perform", "O"],
        [1, "NER", "B-MISC"],
    ]
    eval_df = pd.DataFrame(eval_rows, columns=frame_columns)

    # Model options used for this smoke test.
    ner_args = {
        "no_save": True,
        "overwrite_output_dir": True,
        "reprocess_input_data": False,
    }
    model = NERModel("bert", "bert-base-cased", args=ner_args, use_cuda=False)

    # Fit on the toy training frame.
    model.train_model(train_df)

    # Score the toy evaluation frame.
    result, model_outputs, predictions = model.eval_model(eval_df)

    # Tag a free-text sentence; this rebinds ``predictions``.
    predictions, raw_outputs = model.predict(["Some arbitary sentence"])
Beispiel #2
0
def training():
    """Run one wandb-tracked training job for a RoBERTa NER model.

    ``model_args``, ``train_df`` and ``trial_df`` are read from the enclosing
    module scope. ``wandb.config`` is read after ``wandb.init()`` has been
    called and handed to the model as its sweep configuration.
    """
    wandb.init()

    model_kwargs = dict(
        use_cuda=True,
        args=model_args,
        sweep_config=wandb.config,
    )
    ner_model = NERModel("roberta", "roberta-base", **model_kwargs)
    ner_model.train_model(train_df, eval_data=trial_df)

    wandb.join()
def main(trainingdataset, testdataset, outputdir):
    """Train a multilingual BERT NER model with evaluation during training.

    Args:
        trainingdataset: Training data handed to ``train_model``.
        testdataset: Evaluation data handed to ``train_model`` as ``eval_df``.
        outputdir: Value used for the model's ``output_dir`` option.
    """
    print()
    torch.cuda.empty_cache()

    ner_labels = [
        "O", "B-MISC", "I-MISC", "B-PER", "I-PER", "B-ORG", "I-ORG",
        "B-LOC", "I-LOC"
    ]

    # Notes carried over from the original author (03.04.2020):
    # - evaluate_during_training runs evaluations while training (every
    #   evaluate_during_training_steps steps), so evaluation data must be
    #   passed to train_model().
    # - classification_report adds a per-label report to eval_results.txt.
    # - save_eval_checkpoints / save_model_every_epoch are switched off so
    #   the model is not saved at every checkpoint or epoch.
    ner_args = {
        'save_model_every_epoch': False,
        'save_steps': 10000,
        'output_dir': outputdir,
        'evaluate_during_training': True,
        'overwrite_output_dir': True,
        'classification_report': True,
        'save_eval_checkpoints': False
    }

    # use_cuda is left at the model's default.
    model = NERModel('bert',
                     'bert-base-multilingual-cased',
                     labels=ner_labels,
                     args=ner_args)

    # Evaluation data has to accompany the training data because
    # evaluate_during_training is enabled above.
    model.train_model(trainingdataset, eval_df=testdataset)
Beispiel #4
0
class BioAnalysis:
    """Run CRF, RoBERTa NER and RoBERTa NEL experiments over one dataset.

    The constructor loads the train/dev/test files through ``file_opener`` and
    pre-computes the inputs each model family consumes. The remaining methods
    train, load, evaluate and chain those models, printing their scores.
    """

    def __init__(self,
                 train_file="./data/train.tsv",
                 dev_file="./data/dev.tsv",
                 test_file="./data/test.tsv"):
        """Load the three data splits and prepare the per-model inputs.

        Args:
            train_file: Path passed to ``file_opener`` for the training split.
            dev_file: Path passed to ``file_opener`` for the dev split.
            test_file: Path passed to ``file_opener`` for the test split.
        """
        self.train_data = file_opener(train_file)
        self.dev_data = file_opener(dev_file)
        self.test_data = file_opener(test_file)
        # NOTE(review): entry 192 of the test split is dropped unconditionally;
        # the reason is not visible in this file -- confirm it is still needed.
        self.test_data.pop(192)
        self.crf = sklearn_crfsuite.CRF(algorithm='lbfgs',
                                        c1=0.09684573395986483,
                                        c2=0.0800864058815976,
                                        max_iterations=100,
                                        all_possible_transitions=True)
        # Filled in by crf_predict().
        self.dev_predicted = None
        self.test_predicted = None
        self.dev_labels = None
        self.test_labels = None
        # Label inventory handed to the RoBERTa NER model.
        self.labels = [
            'B-Abiotic_Entity', 'B-Aggregate_Biotic_Abiotic_Entity',
            'B-Biotic_Entity', 'B-Eventuality', 'B-Location', 'B-Quality',
            'B-Time', 'B-Unit', 'B-Value', 'I-Abiotic_Entity',
            'I-Aggregate_Biotic_Abiotic_Entity', 'I-Biotic_Entity',
            'I-Eventuality', 'I-Location', 'I-Quality', 'I-Time', 'I-Unit',
            'I-Value', 'O'
        ]

        # NEL state; get_roberta_nel_data() returns a (data, spans) pair.
        self.roberta_nel_model = None
        self.roberta_nel_dev_eval = None
        self.roberta_nel_test_eval = None
        self.roberta_nel_dev_links = None
        self.roberta_nel_test_links = None
        self.roberta_nel_train_data, _ = get_roberta_nel_data(self.train_data)
        self.roberta_nel_dev_data, self.roberta_nel_dev_spans = get_roberta_nel_data(
            self.dev_data)
        self.roberta_nel_test_data, self.roberta_nel_test_spans = get_roberta_nel_data(
            self.test_data)

        # NER state.
        self.roberta_ner_model = None
        self.roberta_ner_dev_eval = None
        self.roberta_ner_test_eval = None
        self.roberta_ner_train_data = get_roberta_ner_data(self.train_data)
        self.roberta_ner_dev_data = get_roberta_ner_data(self.dev_data)
        self.roberta_ner_test_data = get_roberta_ner_data(self.test_data)

    def crf_fit(self):
        """Fit the CRF on the features/labels derived from the training split."""
        self.crf.fit(*get_features_labels(self.train_data))

    def crf_predict(self):
        """Predict dev and test labels with the CRF and store gold labels too."""
        dev_feat, self.dev_labels = get_features_labels(self.dev_data)
        test_feat, self.test_labels = get_features_labels(self.test_data)
        self.dev_predicted = self.crf.predict(dev_feat)
        self.test_predicted = self.crf.predict(test_feat)

    def crf_evaluate(self, verbose=False, labels=False):
        """Print weighted CRF scores for the stored dev and test predictions.

        ``crf_predict`` must have been called first, since it is what fills
        the gold-label and prediction attributes read here.

        Args:
            verbose: When true, precision and recall are printed before F1.
            labels: Labels to score. When falsy, every class reported by the
                CRF except ``"O"`` is scored.
        """
        if labels:
            lab = labels
        else:
            # Build a fresh list: the object returned by ``classes_`` is never
            # mutated, and a model that never saw an "O" tag no longer makes
            # ``list.remove`` raise ValueError.
            lab = [name for name in self.crf.classes_ if name != "O"]
        kwargs = {"average": "weighted", "labels": lab}
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            self._print_crf_scores("Dev Results\n===========",
                                   self.dev_labels, self.dev_predicted,
                                   kwargs, verbose)
            self._print_crf_scores("\nTest Results\n============",
                                   self.test_labels, self.test_predicted,
                                   kwargs, verbose)

    @staticmethod
    def _print_crf_scores(header, y_true, y_pred, score_kwargs, verbose):
        """Print ``header`` followed by one split's CRF scores."""
        print(header)
        if verbose:
            print("Precision:",
                  metrics.flat_precision_score(y_true, y_pred, **score_kwargs))
            print("Recall:",
                  metrics.flat_recall_score(y_true, y_pred, **score_kwargs))
        print("F1:", metrics.flat_f1_score(y_true, y_pred, **score_kwargs))

    def roberta_nel_train(self):
        """Train a fresh RoBERTa classification model on the NEL training data."""
        train_df = pd.DataFrame(self.roberta_nel_train_data)
        self.roberta_nel_model = ClassificationModel(
            "roberta",
            "roberta-base",
            args={
                "num_train_epochs": 3,
                "overwrite_output_dir": True,
                "output_dir": "nel_outputs/"
            })
        self.roberta_nel_model.train_model(train_df)

    def roberta_nel_eval(self):
        """Evaluate the NEL model on dev and test and print the ``acc`` metric.

        ``f1_score`` is passed to ``eval_model`` under the keyword ``acc``, so
        the value printed as F1 is read back from the ``"acc"`` result key.
        """
        dev_df = pd.DataFrame(self.roberta_nel_dev_data)
        test_df = pd.DataFrame(self.roberta_nel_test_data)
        self.roberta_nel_dev_eval = self.roberta_nel_model.eval_model(
            dev_df, acc=f1_score)
        self.roberta_nel_test_eval = self.roberta_nel_model.eval_model(
            test_df, acc=f1_score)
        print("Dev NEL Results\n===========")
        print("F1:", self.roberta_nel_dev_eval[0]["acc"])
        print("\nTest NEL Results\n============")
        print("F1:", self.roberta_nel_test_eval[0]["acc"])

    def roberta_nel_load_model(self):
        """Load the NEL model from the ``nel_outputs/`` directory."""
        self.roberta_nel_model = ClassificationModel(
            "roberta", "nel_outputs/", args={"num_train_epochs": 3})

    def roberta_ner_train(self):
        """Train a fresh RoBERTa NER model on the NER training data."""
        train_df = pd.DataFrame(self.roberta_ner_train_data,
                                columns=['sentence_id', 'words', 'labels'])
        self.roberta_ner_model = NERModel("roberta",
                                          "roberta-base",
                                          labels=self.labels,
                                          args={
                                              "num_train_epochs": 3,
                                              "overwrite_output_dir": True,
                                              "output_dir": "ner_outputs/"
                                          })
        self.roberta_ner_model.train_model(train_df)

    def roberta_ner_eval(self):
        """Evaluate the NER model on dev and test, store and print the results."""
        dev_df = pd.DataFrame(self.roberta_ner_dev_data,
                              columns=['sentence_id', 'words', 'labels'])
        test_df = pd.DataFrame(self.roberta_ner_test_data,
                               columns=['sentence_id', 'words', 'labels'])
        self.roberta_ner_dev_eval = self.roberta_ner_model.eval_model(
            dev_df, "./ner_outputs/")
        self.roberta_ner_test_eval = self.roberta_ner_model.eval_model(
            test_df, "./ner_outputs/")
        self._print_ner_scores("Dev NER Results\n===========",
                               self.roberta_ner_dev_eval)
        self._print_ner_scores("\nTest NER Results\n============",
                               self.roberta_ner_test_eval)

    @staticmethod
    def _print_ner_scores(header, evaluation):
        """Print ``header`` and the scores held in ``evaluation[0]``."""
        scores = evaluation[0]
        print(header)
        print("Precision:", scores["precision"])
        print("Recall:", scores["recall"])
        print("F1:", scores["f1_score"])

    def roberta_ner_load_model(self):
        """Load the NER model from the ``ner_outputs/`` directory."""
        self.roberta_ner_model = NERModel("roberta",
                                          "ner_outputs/",
                                          labels=self.labels,
                                          args={"num_train_epochs": 3})

    @staticmethod
    def _relabel_with_predictions(phrases, predictions):
        """Return a deep copy of ``phrases`` with element 2 of every token
        record replaced by ``predictions[sentence][token]``.

        Token records are converted to lists so they can be modified. An
        ``IndexError`` is raised if ``predictions`` has fewer entries than
        ``phrases`` at any position.
        """
        relabelled = deepcopy(phrases)
        for ii, sentence in enumerate(relabelled):
            for jj, record in enumerate(sentence):
                record = list(record)
                record[2] = predictions[ii][jj]
                sentence[jj] = record
        return relabelled

    def roberta_ner_nel_pipeline(self):
        """Chain NER into NEL and report the combined linking F1.

        Steps: load the saved NER model (training one if loading raises
        ``OSError``), evaluate it, overwrite the labels of the dev/test data
        with element ``[2]`` of the NER evaluation results, derive NEL inputs
        from those relabelled phrases, load or train the NEL model, predict,
        and compare against the NEL data derived from the original labels.
        The predicted links are stored on the instance.
        """
        try:
            self.roberta_ner_load_model()
        except OSError:
            self.roberta_ner_train()
        self.roberta_ner_eval()

        # NEL inputs built from the NER output rather than the file labels.
        roberta_dev_phrases, roberta_dev_spans = get_roberta_nel_data(
            self._relabel_with_predictions(self.dev_data,
                                           self.roberta_ner_dev_eval[2]))
        roberta_test_phrases, roberta_test_spans = get_roberta_nel_data(
            self._relabel_with_predictions(self.test_data,
                                           self.roberta_ner_test_eval[2]))

        try:
            self.roberta_nel_load_model()
        except OSError:
            self.roberta_nel_train()
        dev_texts = [x[0] for x in roberta_dev_phrases]
        test_texts = [x[0] for x in roberta_test_phrases]
        roberta_dev_prediction = self.roberta_nel_model.predict(dev_texts)[0]
        roberta_test_prediction = self.roberta_nel_model.predict(test_texts)[0]

        roberta_dev_actual = [x[1] for x in self.roberta_nel_dev_data]
        roberta_test_actual = [x[1] for x in self.roberta_nel_test_data]

        # Predictions and references come from different span sets, so both
        # are transformed and then reconciled by resolve_diff() before scoring.
        dev_prediction = transform_nel_results(roberta_dev_prediction,
                                               roberta_dev_spans)
        dev_actual = transform_nel_results(roberta_dev_actual,
                                           self.roberta_nel_dev_spans)
        dev_actual, dev_prediction = resolve_diff(dev_actual, dev_prediction)

        test_prediction = transform_nel_results(roberta_test_prediction,
                                                roberta_test_spans)
        test_actual = transform_nel_results(roberta_test_actual,
                                            self.roberta_nel_test_spans)
        test_actual, test_prediction = resolve_diff(test_actual,
                                                    test_prediction)
        print("Dev NEL Combined Results\n===========")
        print("F1:", f1_score(dev_actual, dev_prediction))
        print("Test NEL Combined Results\n===========")
        print("F1:", f1_score(test_actual, test_prediction))

        self.roberta_nel_dev_links = get_links(
            list(zip(dev_texts, roberta_dev_prediction)))
        self.roberta_nel_test_links = get_links(
            list(zip(test_texts, roberta_test_prediction)))
    "num_train_epochs": 10,
    "evaluate_during_training_steps": 10000,
    "train_batch_size": 32,
    'cross_entropy_ignore_index': 0,
    'classification_report': True
}

# Build the ELECTRA NER model from the checkpoint stored under
# discriminator_trained/discriminator_model, with crf=True.
model = NERModel("electra",
                 "discriminator_trained/discriminator_model",
                 args=train_args,
                 labels=labels,
                 use_cuda=True,
                 crf=True)

# Train the model
model.train_model(train_file, eval_data=eval_file)

# Evaluate the model on the test file. Evaluating on train_file, as was done
# before, left test_file unused and reported scores on the training data.
test_file = 'data_set/test.ner.small.txt'
result, model_outputs, predictions = model.eval_model(test_file)

print(result)

# from transformers import ElectraTokenizer, ElectraForPreTraining
# model_name = r'D:\git_learn\simpletransformers\examples\language_model\discriminator_trained\discriminator_model'
# model = ElectraForPreTraining.from_pretrained(model_name)
# tokenizer = ElectraTokenizer.from_pretrained(model_name)
# sentence = '发烧头[MASK]3天'
# sentence = '患者自发病来,神志清楚,精神好'
# input_ids = torch.tensor(tokenizer.encode(sentence, add_special_tokens=True)).unsqueeze(0)
# output = model(input_ids, return_dict=True)
Beispiel #6
0
    [0, "text", "O"],
    [0, "classification", "B-MISC"],
    [1, "Simple", "B-MISC"],
    [1, "Transformers", "I-MISC"],
    [1, "then", "O"],
    [1, "expanded", "O"],
    [1, "to", "O"],
    [1, "perform", "O"],
    [1, "NER", "B-MISC"],
]
# Evaluation frame built from the rows listed above.
eval_df = pd.DataFrame(
    data=eval_data,
    columns=["sentence_id", "words", "labels"],
)

# BERT NER model; input data is reprocessed and the output directory
# may be overwritten.
model = NERModel(
    "bert",
    "bert-base-cased",
    args=dict(overwrite_output_dir=True, reprocess_input_data=True),
)

# Fit on the training frame.
model.train_model(train_df)

# Score the evaluation frame.
result, model_outputs, predictions = model.eval_model(eval_df)

# Tag a free-text sentence (rebinds ``predictions``) and show the tags.
predictions, raw_outputs = model.predict(["Some arbitary sentence"])

print(predictions)
Beispiel #7
0
# Distinct values of the "labels" column, used as the model's label set.
label = data["labels"].unique().tolist()

label

# Training configuration.
args = NERArgs()
args.num_train_epochs = 1
args.learning_rate = 1e-4
args.overwrite_output_dir = True
args.train_batch_size = 32
args.eval_batch_size = 32

model = NERModel('bert', 'bert-base-cased', labels=label, args=args)

# Train with evaluation data, reporting accuracy under the name "acc".
model.train_model(train_data, eval_data=test_data, acc=accuracy_score)

result, model_outputs, preds_list = model.eval_model(test_data)

result

prediction, model_output = model.predict(["This is Nishi"])

prediction

!pip install bert-extractive-summarizer

!pip install wikipedia

import wikipedia
class NerModel:
    """Convenience wrapper around a CPU-only BERT ``NERModel``."""

    def __init__(self, modelname="", dataset=None, use_saved_model=False):
        """Create the underlying ``NERModel``.

        Args:
            modelname: Suffix of the output directory, ``outputs_<modelname>``.
            dataset: Mapping providing ``'labels_list'`` plus the ``'train'``
                and ``'val'`` splits used by ``train`` and ``eval``. May be
                ``None`` (the declared default) for prediction-only use.
            use_saved_model: When true, load the model from the output
                directory instead of the pre-trained ``bert-base-cased``.
        """
        self.dataset = dataset

        output_dir = "outputs_{}".format(modelname)
        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,

            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,

            'train_batch_size': 10,
            'num_train_epochs': 10,
            'max_seq_length': 256,
            'gradient_accumulation_steps': 8,
        }
        # ``dataset`` defaults to None, so only read the label list when a
        # dataset was actually supplied. Without it no 'labels_list' entry is
        # passed and NERModel resolves its labels on its own
        # (NOTE(review): confirm that is the desired fallback).
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        model_name = output_dir if use_saved_model else "bert-base-cased"
        self.model = NERModel("bert",
                              model_name,
                              use_cuda=False,
                              args=model_args)

    def train(self):
        """Train the model on the ``'train'`` split.

        Raises:
            ValueError: If no dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        self.model.train_model(self.dataset['train'])

    def eval(self):
        """Evaluate the model on the ``'val'`` split and print the result.

        Raises:
            ValueError: If no dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        result, _, _ = self.model.eval_model(self.dataset['val'])
        print("Evaluation result:", result)

    def simple_test(self):
        """Predict on two fixed sentences and print per-word details."""
        sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # Per-word breakdown: word, predicted tag, top probability, all
        # probabilities.
        for sentence, preds, outs in zip(sentences, predictions, raw_outputs):
            print("\n___________________________")
            print("Sentence: ", sentence)
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                # Average the raw outputs for this word along axis 0
                # (presumably one row per sub-token) before the softmax.
                # A separate name is used so the list being iterated
                # (``preds``) is not rebound mid-loop.
                probs = list(softmax(np.mean(out[key], axis=0)))
                print(key, pred[key], probs[np.argmax(probs)], probs)

    def predict(self, sentences):
        """Return the model's predictions for ``sentences``."""
        predictions, _ = self.model.predict(sentences)
        return predictions
Beispiel #9
0
# Download CoNLL-2003 dataset from the git repo down below.
# https://github.com/synalp/NER/tree/master/corpus/CoNLL-2003

from simpletransformers.ner import NERModel

# Train and evaluate each architecture on CoNLL-2003, in this order:
# BERT uncased, BERT cased, ALBERT.
for model_type, model_name in (
        ('bert', 'bert-base-uncased'),
        ('bert', 'bert-base-cased'),
        ('albert', 'albert-base-v2'),
):
    model = NERModel(model_type,
                     model_name,
                     args={'overwrite_output_dir': True})
    model.train_model('CoNLL-2003/eng.train')
    results, model_outputs, predictions = model.eval_model(
        'CoNLL-2003/eng.testb')
    print(results)
Beispiel #10
0
class NerModel:
    """Wrapper that trains, evaluates and queries a ``NERModel``."""

    def __init__(self,
                 modelname="",
                 dataset=None,
                 use_saved_model=False,
                 input_dir=None,
                 output_dir=None):
        """Create the underlying ``NERModel``.

        Args:
            modelname: Unused; kept so existing callers keep working.
            dataset: Mapping providing ``'labels_list'`` plus the ``'train'``,
                ``'val'`` and ``'test'`` splits read by the other methods.
                May be ``None`` (the declared default) for prediction-only
                use.
            use_saved_model: Unused; kept so existing callers keep working.
            input_dir: Directory of a previously trained model (NER or LM).
                When falsy, the pre-trained ``distilroberta-base`` is used.
            output_dir: Passed through as the model's ``output_dir``.
        """
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        self.dataset = dataset

        # NOTE: output_dir is deliberately not wiped here. A shell call that
        # was meant to do so was malformed ("<output_dir> -rf" executes the
        # path as a command and deletes nothing), and 'overwrite_output_dir'
        # below already allows the directory to be reused.

        use_cuda = torch.cuda.is_available()

        model_args = {
            'output_dir': output_dir,
            'overwrite_output_dir': True,
            'reprocess_input_data': True,
            'save_eval_checkpoints': False,
            'save_steps': -1,
            'save_model_every_epoch': False,
            'evaluate_during_training': True,
            'num_train_epochs': 15,
            'train_batch_size': 10,  # <=10 for bert, <=5 for longformer
            'eval_batch_size': 10,
            'max_seq_length': 128,
            'gradient_accumulation_steps': 8,
            'learning_rate': 0.0001,  # author's note: a good value for albert
        }
        # ``dataset`` defaults to None, so only read the label list when a
        # dataset was actually supplied. Without it no 'labels_list' entry is
        # passed and NERModel resolves its labels on its own
        # (NOTE(review): confirm that is the desired fallback).
        if dataset is not None:
            model_args['labels_list'] = dataset['labels_list']

        # Other (model_type, name) pairs tried by the author: longformer /
        # allenai/longformer-base-4096, mpnet / microsoft/mpnet-base,
        # electra / google/electra-small-discriminator, squeezebert /
        # squeezebert/squeezebert-uncased, albert / albert-base-v2,
        # xlmroberta / xlm-roberta-base, bert / bert-base-uncased,
        # distilbert / distilbert-base-uncased.
        model_type, english_model_name = "roberta", "distilroberta-base"

        # A given input_dir takes precedence over the pre-trained model.
        model_name = input_dir if input_dir else english_model_name
        self.model = NERModel(model_type,
                              model_name,
                              use_cuda=use_cuda,
                              args=model_args)

        self.model_info = {
            'model_type': model_type,
            'english_model_name': english_model_name,
            'input_dir': input_dir
        }

    def train(self):
        """Train on the ``'train'`` split, evaluating on ``'val'``.

        Returns:
            The training details returned by ``train_model``. In the author's
            sample run this was a dict of lists keyed by ``global_step``,
            ``precision``, ``recall``, ``f1_score``, ``train_loss`` and
            ``eval_loss``, with scores for the validation data only.

        Raises:
            ValueError: If no dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        global_step, training_details = self.model.train_model(
            self.dataset['train'], eval_data=self.dataset['val'])

        print("global_step:", global_step)
        print("training_details:", training_details)
        return training_details

    def eval(self):
        """Evaluate on the ``'train'`` and ``'val'`` splits and print scores.

        Returns:
            The result mapping for the validation split.

        Raises:
            ValueError: If no dataset was supplied.
        """
        if not self.dataset:
            raise ValueError("dataset is None")
        res_train, _, _ = self.model.eval_model(self.dataset['train'])
        res_val, _, _ = self.model.eval_model(self.dataset['val'])

        print("Evaluation")
        print("train loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}".
              format(res_train['eval_loss'], res_train['precision'],
                     res_train['recall'], res_train['f1_score']))
        print("valid loss: {:.3f}; prec/recall/f1: {:.3f}/{:.3f}/{:.3f}".
              format(res_val['eval_loss'], res_val['precision'],
                     res_val['recall'], res_val['f1_score']))
        print(
            "Summary. Loss (train/val): {:.3f}/{:.3f}, F1: {:.3f}/{:.3f}".
            format(res_train['eval_loss'], res_val['eval_loss'],
                   res_train['f1_score'], res_val['f1_score']))

        print("model_info:", self.model_info)

        return res_val

    @staticmethod
    def _group_sentences(sentence_ids, words, labels):
        """Group consecutive rows sharing a sentence id into samples.

        The three inputs are walked in lockstep by position, so neither the
        first sentence id nor any index labels need to start at 0.

        Returns:
            A list of dicts with keys ``'text'`` (space-joined words),
            ``'tokens'`` and ``'labels'``.
        """
        samples = []
        current_id = None
        tokens, tags = [], []
        for s_id, word, label in zip(sentence_ids, words, labels):
            # A change of id closes the sentence collected so far.
            if tokens and s_id != current_id:
                samples.append({
                    'text': " ".join(tokens),
                    'tokens': tokens,
                    'labels': tags
                })
                tokens, tags = [], []
            current_id = s_id
            tokens.append(word)
            tags.append(label)
        if tokens:
            samples.append({
                'text': " ".join(tokens),
                'tokens': tokens,
                'labels': tags
            })
        return samples

    def test(self):
        """Predict on the ``'test'`` split and report token/sentence accuracy.

        Returns:
            ``{'avg_acc': ..., 'avg_success': ...}``: the mean per-sentence
            token accuracy and the fraction of sentences predicted entirely
            correctly.

        Raises:
            ValueError: If the test split is empty, or if the number of
                predicted labels differs from the number of true labels for
                a sentence.
        """
        test_split = self.dataset['test']
        samples = self._group_sentences(test_split['sentence_id'],
                                        test_split['words'],
                                        test_split['labels'])
        if not samples:
            raise ValueError("test dataset is empty")

        texts = [sample['text'] for sample in samples]
        predictions, _ = self.model.predict(texts)

        acc_list = []
        success_list = []

        for sample, preds in zip(samples, predictions):
            print()
            print("text: ", sample['text'])
            # Each prediction entry is a single-item mapping; take its value.
            pred_labels = [list(t.values())[0] for t in preds]
            print("pred_labels: ", pred_labels)
            true_labels = sample['labels']
            print("true_labels: ", true_labels)

            if len(true_labels) != len(pred_labels):
                raise ValueError("len(true_labels) != len(pred_labels)")
            acc1sentence = np.mean(
                [true == pred for true, pred in zip(true_labels, pred_labels)])
            print("acc={:.3f}".format(acc1sentence))
            acc_list.append(acc1sentence)
            success_list.append(1 if acc1sentence == 1.0 else 0)

        return {
            'avg_acc': np.mean(acc_list),
            'avg_success': np.mean(success_list)
        }

    def simple_test(self):
        """Predict on two fixed sentences and print per-word details."""
        sentences = ["Some arbitary sentence", "Simple Transformers sentence"]
        predictions, raw_outputs = self.model.predict(sentences)
        print(predictions)

        # Per-word breakdown: word, predicted tag, top probability, all
        # probabilities.
        for sentence, preds, outs in zip(sentences, predictions, raw_outputs):
            print("\n___________________________")
            print("Sentence: ", sentence)
            for pred, out in zip(preds, outs):
                key = list(pred.keys())[0]
                # Average the raw outputs for this word along axis 0
                # (presumably one row per sub-token) before the softmax.
                # A separate name is used so the list being iterated
                # (``preds``) is not rebound mid-loop.
                probs = list(softmax(np.mean(out[key], axis=0)))
                print(key, pred[key], probs[np.argmax(probs)], probs)

    def predict(self, sentences):
        """Return the model's predictions for ``sentences``."""
        predictions, _ = self.model.predict(sentences)
        return predictions

    def raw_predict(self, sentences):
        """Predict and also return raw outputs and per-sentence confidences.

        Returns:
            A dict with ``'predictions'``, ``'raw_outputs'`` and
            ``'confidences'``; the latter holds ``calc_confidence`` applied to
            each raw output together with the model's label list.
        """
        predictions, raw_outputs = self.model.predict(sentences)
        labels_list = self.model.args.labels_list
        confidences = [
            calc_confidence(raw_output, labels_list)
            for raw_output in raw_outputs
        ]
        return {
            'predictions': predictions,
            'raw_outputs': raw_outputs,
            'confidences': confidences
        }
        """
Beispiel #11
0
# Flatten the CoNLL-U corpus into (sentence index, token form, label) rows.
# The label is whatever is popped from the token's misc "name" entry.
conllu_format = ddt.load_as_conllu()
L = [
    (sent_idx, token.form, token.misc.get("name").pop())
    for sent_idx, sent in enumerate(conllu_format)
    for token in sent
]
df = pd.DataFrame(L, columns=['sentence_id', 'words', 'labels'])

# NOTE(review): ``model`` is saved here before the assignment further down in
# this snippet; presumably it refers to a model created earlier -- confirm.
torch.save(model, "model.bin")

# Toy rows in the same (sentence_id, word, label) layout.
train_data = [
    [0, 'Simple', 'B-MISC'],
    [0, 'Transformers', 'I-MISC'],
    [0, 'started', 'O'],
    [1, 'with', 'O'],
    [0, 'text', 'O'],
    [0, 'classification', 'B-MISC'],
    [1, 'Simple', 'B-MISC'],
    [1, 'Transformers', 'I-MISC'],
    [1, 'can', 'O'],
    [1, 'now', 'O'],
    [1, 'perform', 'O'],
    [1, 'NER', 'B-MISC'],
]
train_df = pd.DataFrame(
    train_data,
    columns=['sentence_id', 'words', 'labels'],
)

model = NERModel(
    'bert',
    '/home/au554730/Desktop/BERT_test/danish_bert_pytorch/',
    use_cuda=False,
)

# Train on the first 100 corpus rows only (not on the toy frame above).
sub = df.head(100)

model.train_model(sub)


# Here you go, Kenneth
# Expand word-level tags to word-piece level: the first piece of each word
# keeps the word's tag and every following piece is tagged 'X'.
tokenized_texts = []
mylabels = []
for sent, tags in zip(sentences, labels):
    BERT_texts = []
    BERT_labels = np.array([])
    for word, tag in zip(sent.split(), tags):
        sub_words = tokenizer.wordpiece_tokenizer.tokenize(word)
        # Separate name so the sentence-level ``tags`` is not rebound while
        # it is being iterated.
        piece_tags = np.array([tag for _ in sub_words])
        piece_tags[1:] = 'X'
        BERT_texts += sub_words
        BERT_labels = np.append(BERT_labels, piece_tags)
    # NOTE(review): results were appended to an undefined ``mytexts`` while
    # ``tokenized_texts`` and ``mylabels`` were never filled; both lists are
    # now populated -- confirm nothing downstream expects ``mytexts``.
    tokenized_texts.append(BERT_texts)
    mylabels.append(BERT_labels)
Beispiel #12
0
        "wandb_project": None,
        "wandb_kwargs": {},
        "use_early_stopping": True,
        "early_stopping_patience": 3,
        "early_stopping_delta": 0,
        "early_stopping_metric": "eval_loss",
        "early_stopping_metric_minimize": True,
        "manual_seed": None,
        "encoding": None,
        "config": {},
    }

    with open('tag-set.txt', 'r') as f:
        ents_dict = set(ast.literal_eval(f.read()))

    print(ents_dict)
    # Create a NERModel
    model = NERModel('bert',
                     'bert-base-cased',
                     args=args,
                     use_cuda=False,
                     labels=ents_dict)

    model.train_model('sample_data/train.txt',
                      eval_data='sample_data/test.txt')

    results, model_outputs, predictions = model.eval_model(
        'sample_data/test.txt')

    print(results)