Example #1
from NERDA.models import NERDA


class NER_NERDA:

    def __init__(self, loader_pipeline, PARAMS):
        self.loader_pipeline = loader_pipeline
        self.training_hyperparameters = {
            'epochs': int(PARAMS["epochs"]),
            'warmup_steps': int(PARAMS["warmup_steps"]),
            'train_batch_size': 256,
            'learning_rate': PARAMS["learning_rate"],
        }
        self.transformer = PARAMS["transformer"]
        self.dropout = PARAMS["dropout"]

    def create_model(self, training, validation, tag_scheme):
        self.model = NERDA(
            dataset_training=training,
            dataset_validation=validation,
            tag_scheme=tag_scheme,
            tag_outside='O',
            transformer=self.transformer,
            dropout=self.dropout,
            hyperparameters=self.training_hyperparameters,
        )
    
    def train(self, train_folder, test_folder, validation_split, detect=False):
        train_sents, train_ents, valid_sents, valid_ents, test_sents, test_ents = \
            self.loader_pipeline._load_corpus(train_folder, test_folder, validation_split, detect)

        tag_scheme = list(set(e for sent in train_ents for e in sent if e != "O"))
        training = {"sentences": train_sents, "tags": train_ents}
        validation = {"sentences": valid_sents, "tags": valid_ents}
        test = {"sentences": test_sents, "tags": test_ents}
        self.create_model(training, validation, tag_scheme)
        self.model.train()
        valid_res = self.model.predict(valid_sents)

        # eval_results is a project-local scoring helper (not part of NERDA).
        return eval_results(valid_ents, valid_res)

    def __call__(self, string):
        tokens = []
        spans = []

        for t, s in self.loader_pipeline(string):
            tokens.append(t)
            spans.append(s)

        return self.model.predict(tokens)

    def eval(self, tokens, labels):
        pred = self.model.predict(tokens)
        return eval_results(labels, pred)
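A minimal usage sketch of the class above. The pipeline object and folder paths are hypothetical; the PARAMS keys are the ones read in __init__:

# 'pipeline' stands in for a project-specific loader pipeline (hypothetical).
params = {
    "epochs": 4,
    "warmup_steps": 500,
    "learning_rate": 0.0001,
    "transformer": "bert-base-multilingual-uncased",
    "dropout": 0.1,
}
ner = NER_NERDA(pipeline, params)
valid_scores = ner.train("data/train", "data/test", validation_split=0.1)
predictions = ner("Pernille Rosenkrantz-Theil kommer fra Vejle")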
Example #2
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA


def objective(params):
    """Train a small NERDA model and return its validation loss."""
    print(params)

    # Small slices of the DaNE data keep each evaluation cheap.
    model = NERDA(dataset_training=get_dane_data('train', 20),
                  dataset_validation=get_dane_data('dev', 20),
                  hyperparameters=params)

    model.train()

    return model.valid_loss
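Since objective returns the validation loss, it can be plugged directly into a hyperparameter search. A minimal sketch using hyperopt; the search space below is an assumption, not part of the source:

from hyperopt import fmin, tpe, hp

# Illustrative search space; the ranges are assumptions, not tuned values.
space = {
    'epochs': 1,
    'warmup_steps': hp.choice('warmup_steps', [10, 100, 500]),
    'train_batch_size': hp.choice('train_batch_size', [8, 13, 16]),
    'learning_rate': hp.loguniform('learning_rate', -11, -7),
}

best = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=10)
print(best)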
Example #3
def test_training_bert():
    """Test if traning does not break even though MAX LEN is exceeded"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='bert-base-multilingual-uncased',
              hyperparameters={
                  'epochs': 1,
                  'warmup_steps': 10,
                  'train_batch_size': 5,
                  'learning_rate': 0.0001
              })
    m.train()
Example #4
def test_training_exceed_maxlen():
    """Test if traning does not break even though MAX LEN is exceeded"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              max_len=3,
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={
                  'epochs': 1,
                  'warmup_steps': 10,
                  'train_batch_size': 5,
                  'learning_rate': 0.0001
              })
    m.train()
Example #5
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA
import nltk

# instantiate a minimal model.
model = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={
                  'epochs': 1,
                  'warmup_steps': 10,
                  'train_batch_size': 5,
                  'learning_rate': 0.0001
              })

# set example texts to identify entities in.
text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle"
sentences = [nltk.word_tokenize(text_single)]


def test_predict():
    """Test that predict runs"""
    predictions = model.predict(sentences)


predictions = model.predict(sentences)


def test_predict_type():
    """Test token predictions"""
    assert isinstance(predictions, list)
Example #6
# training, validation and tag_scheme are assumed to be prepared upstream,
# as in Example #1.
transformer = 'bert-base-multilingual-uncased'

# hyperparameters for network
dropout = 0.1
# hyperparameters for training
training_hyperparameters = {
    'epochs': 4,
    'warmup_steps': 500,
    'train_batch_size': 13,
    'learning_rate': 0.0001,
}

from NERDA.models import NERDA

model = NERDA(
    dataset_training=training,
    dataset_validation=validation,
    tag_scheme=tag_scheme,
    tag_outside='O',
    transformer=transformer,
    dropout=dropout,
    hyperparameters=training_hyperparameters,
)

model.train()

import nltk
nltk.download('punkt')

model.predict_text('Prime Minister Jacinda Ardern has claimed that New Zealand had won a big battle over the spread of coronavirus. Her words came as the country begins to exit from its lockdown')
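predict_text tokenizes the text into sentences and words and returns the tokens together with one predicted tag per token; a minimal sketch of unpacking that return value:

sentences, predictions = model.predict_text(
    'Prime Minister Jacinda Ardern has claimed that New Zealand '
    'had won a big battle over the spread of coronavirus.')
for tokens, tags in zip(sentences, predictions):
    for token, tag in zip(tokens, tags):
        print(token, tag)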
Example #7
# HACK: Filename prefixed with 'aaa' to execute this test before the others
# in order to download the resources needed by all other tests.

from NERDA.datasets import get_dane_data, download_dane_data
# TODO: it should not be necessary to download before importing NERDA.
# Download the necessary resources.
download_dane_data()
from NERDA.models import NERDA
from NERDA.precooked import DA_ELECTRA_DA
import nltk
nltk.download('punkt')

# instantiate a minimal model.
model = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              max_len=128,
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={
                  'epochs': 1,
                  'warmup_steps': 10,
                  'train_batch_size': 5,
                  'learning_rate': 0.0001
              })


def test_instantiate_NERDA():
    """Test that model has the correct/expected class"""
    assert isinstance(model, NERDA)
Example #8
#model.train()
#k=0
#trn['sentences'][3111]
#from transformers import AutoTokenizer
#t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
#valid = get_conll_data('valid')

# --- variant from admin/sandbox.py ---
import sys

from transformers import AutoConfig, AutoModel, AutoTokenizer

transformer = "google/electra-small-discriminator"
trans = AutoConfig.from_pretrained(transformer)

def tester():
    try:
        model = AutoModel.from_pretrained('google/electra-small-discriminator')
    except Exception:
        # Re-raise after logging so the function never returns an unbound name.
        print("Oops!", sys.exc_info()[0], "occurred.")
        raise

    return model

# --- variant from sandbox.py ---
from NERDA.models import NERDA
from NERDA.datasets import get_conll_data

trn = get_conll_data('train', 5)
valid = get_conll_data('dev', 5)
transformer = 'bert-base-multilingual-uncased'
model = NERDA(transformer=transformer,
              dataset_training=trn,
              dataset_validation=valid)
Example #9
tag_scheme = [
    'B-trigger', 'I-trigger', 'B-participant', 'I-participant', 'B-organizer',
    'B-target', 'I-target', 'B-place', 'B-etime', 'I-etime', 'B-fname',
    'I-fname', 'I-organizer', 'I-place'
]

transformer = "roberta-base"  # bert-base-uncased bert-base-cased albert-base-v2  roberta-base

max_len = max([len(sent) for sent in training["sentences"]])

model = NERDA(
    dataset_training=training,
    dataset_validation=validation,
    tag_scheme=tag_scheme,
    tag_outside="O",
    transformer=transformer,
    max_len=max_len,
    dropout=0.1,  # higher dropout values can also be used
    validation_batch_size=8,
    # Higher batch sizes caused issues in our training runs, although a
    # value as low as 8 is not ideal.
    hyperparameters={"train_batch_size": 8},
)

model.train()

print(model.evaluate_performance(validation, batch_size=1))

from tqdm import tqdm

# Predict the test data and dump it to a text file. This may introduce minor
# formatting issues, which we fixed by hand as a post-processing step.
res = ""
for i in tqdm(range(len(testing["sentences"]))):
    res += testing["sentences"][i] + "\t" + model.predict(
Example #10


# training, validation, tag_scheme and transformer are assumed to be
# prepared upstream, as in the earlier examples.
# hyperparameters for network
dropout = 0.1
# hyperparameters for training
training_hyperparameters = {
    'epochs': 1,
    'warmup_steps': 500,
    'train_batch_size': 1,
    'learning_rate': 0.0001,
}

from NERDA.models import NERDA

model = NERDA(
    dataset_training=training,
    dataset_validation=validation,
    tag_scheme=tag_scheme,
    tag_outside='O',
    transformer=transformer,
    dropout=dropout,
    hyperparameters=training_hyperparameters,
)

model.train()

# test = get_conll_data('test')
# model.evaluate_performance(test)

# English train data on Polish BERT:
#    Level  F1-Score
# 0  B-PER  0.884507
# 1  I-PER  0.941590
# 2  B-ORG  0.771963