from NERDA.models import NERDA


class NER_NERDA:

    def __init__(self, loader_pipeline, PARAMS):
        self.loader_pipeline = loader_pipeline
        self.training_hyperparameters = {'epochs': int(PARAMS["epochs"]),
                                         'warmup_steps': int(PARAMS["warmup_steps"]),
                                         'train_batch_size': 256,
                                         'learning_rate': PARAMS["learning_rate"]}
        self.transformer = PARAMS["transformer"]
        self.dropout = PARAMS["dropout"]

    def create_model(self, training, validation, tag_scheme):
        self.model = NERDA(dataset_training=training,
                           dataset_validation=validation,
                           tag_scheme=tag_scheme,
                           tag_outside='O',
                           transformer=self.transformer,
                           dropout=self.dropout,
                           hyperparameters=self.training_hyperparameters)

    def train(self, train_folder, test_folder, validation_split, detect=False):
        train_sents, train_ents, valid_sents, valid_ents, test_sents, test_ents = \
            self.loader_pipeline._load_corpus(train_folder, test_folder, validation_split, detect)
        # derive the tag scheme from the training annotations (everything except 'O')
        tag_scheme = list(set(e for sent in train_ents for e in sent if e != "O"))
        training = {"sentences": train_sents, "tags": train_ents}
        validation = {"sentences": valid_sents, "tags": valid_ents}
        test = {"sentences": test_sents, "tags": test_ents}
        self.create_model(training, validation, tag_scheme)
        self.model.train()
        valid_res = self.model.predict(valid_sents)
        return eval_results(valid_ents, valid_res)

    def __call__(self, string):
        tokens = []
        spans = []
        for t, s in self.loader_pipeline(string):
            tokens.append(t)
            spans.append(s)
        return self.model.predict(tokens)

    def eval(self, tokens, labels):
        pred = self.model.predict(tokens)
        return eval_results(labels, pred)
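# The sketch below is a hypothetical smoke test of the wrapper above; it is
# not part of the class. It bypasses the loader pipeline and feeds NERDA's
# bundled DaNE data straight into create_model, using the standard DaNE tag
# scheme. Names and sizes here are illustrative only.
from NERDA.datasets import get_dane_data

params = {"epochs": 1, "warmup_steps": 10, "learning_rate": 0.0001,
          "transformer": "bert-base-multilingual-uncased", "dropout": 0.1}
ner = NER_NERDA(loader_pipeline=None, PARAMS=params)  # no loader needed for this path

training = get_dane_data('train', 5)
validation = get_dane_data('dev', 5)
tag_scheme = ['B-PER', 'I-PER', 'B-ORG', 'I-ORG',
              'B-LOC', 'I-LOC', 'B-MISC', 'I-MISC']  # DaNE entity types

ner.create_model(training, validation, tag_scheme)
ner.model.train()
predictions = ner.model.predict(validation['sentences'])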
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA


def objective(params):
    print(params)
    model = NERDA(dataset_training=get_dane_data('train', 20),
                  dataset_validation=get_dane_data('dev', 20),
                  hyperparameters=params)
    model.train()
    return model.valid_loss
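# One way to drive this objective is a hyperopt search; hyperopt is an
# assumption here (the original optimisation loop is not shown), but the
# objective's "take a hyperparameter dict, return the validation loss"
# shape matches what fmin expects.
from hyperopt import fmin, tpe, hp

search_space = {
    'epochs': 1,
    'warmup_steps': hp.choice('warmup_steps', [0, 250, 500]),
    'train_batch_size': hp.choice('train_batch_size', [8, 13, 32]),
    'learning_rate': hp.loguniform('learning_rate', -11, -7),
}

best = fmin(fn=objective, space=search_space, algo=tpe.suggest, max_evals=10)
print(best)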
def test_training_bert():
    """Test that training does not break with bert-base-multilingual-uncased"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='bert-base-multilingual-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})
    m.train()
def test_training_exceed_maxlen():
    """Test that training does not break even though MAX LEN is exceeded"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              max_len=3,
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})
    m.train()
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA
import nltk

# instantiate a minimal model.
model = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})

# set example texts to identify entities in.
text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle"
sentences = [nltk.word_tokenize(text_single)]

# compute predictions once at module level so both tests below can see them.
predictions = model.predict(sentences)

def test_predict():
    """Test that predict runs"""
    assert model.predict(sentences) is not None

def test_predict_type():
    """Test token predictions"""
    assert isinstance(predictions, list)
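# Additional sketch (not in the original test suite): NERDA can also tokenize
# raw strings itself via predict_text, as used in the training example
# elsewhere in this repo. Assumes nltk's punkt tokenizer has been downloaded.
def test_predict_text():
    """Test that predict_text runs on a raw string"""
    output = model.predict_text(text_single)
    assert output is not None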
]

transformer = 'bert-base-multilingual-uncased'

# hyperparameters for network
dropout = 0.1

# hyperparameters for training
training_hyperparameters = {'epochs': 4,
                            'warmup_steps': 500,
                            'train_batch_size': 13,
                            'learning_rate': 0.0001}

from NERDA.models import NERDA
model = NERDA(dataset_training=training,
              dataset_validation=validation,
              tag_scheme=tag_scheme,
              tag_outside='O',
              transformer=transformer,
              dropout=dropout,
              hyperparameters=training_hyperparameters)

model.train()

import nltk
nltk.download('punkt')
model.predict_text('Prime Minister Jacinda Ardern has claimed that New Zealand had won a big battle over the spread of coronavirus. Her words came as the country begins to exit from its lockdown')
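# Optional follow-up, sketched under the assumption that the training and
# validation dicts above come from NERDA's CoNLL-2003 English loader:
# evaluate the trained model on the held-out test split with
# evaluate_performance, which expects the same {"sentences", "tags"} format.
from NERDA.datasets import get_conll_data
test = get_conll_data('test')
print(model.evaluate_performance(test))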
# HACK: Filename prefixed with 'aaa' to execute this test before the others
# in order to download necessary resources for all other tests.
from NERDA.datasets import get_dane_data, download_dane_data

# TODO: should not be necessary to download before importing NERDA.
# Download necessary resources
download_dane_data()

from NERDA.models import NERDA
from NERDA.precooked import DA_ELECTRA_DA
import nltk
nltk.download('punkt')

# instantiate a minimal model.
model = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              max_len=128,
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})

def test_instantiate_NERDA():
    """Test that model has the correct/expected class"""
    assert isinstance(model, NERDA)
# model.train()
# k = 0
# trn['sentences'][3111]
# from transformers import AutoTokenizer
# t = AutoTokenizer.from_pretrained('bert-base-multilingual-uncased')
# valid = get_conll_data('valid')

import sys
from transformers import AutoTokenizer, AutoModel, AutoConfig

transformer = "google/electra-small-discriminator"
trans = AutoConfig.from_pretrained(transformer)

def tester():
    try:
        model = AutoModel.from_pretrained('google/electra-small-discriminator')
    except Exception:
        print("Oops!", sys.exc_info()[0], "occurred.")
        model = None
    return model

from NERDA.models import NERDA
from NERDA.datasets import get_conll_data

trn = get_conll_data('train', 5)
valid = get_conll_data('dev', 5)
transformer = 'bert-base-multilingual-uncased'
model = NERDA(transformer=transformer,
              dataset_training=trn,
              dataset_validation=valid)
tag_scheme = [
    'B-trigger', 'I-trigger', 'B-participant', 'I-participant',
    'B-organizer', 'B-target', 'I-target', 'B-place', 'B-etime',
    'I-etime', 'B-fname', 'I-fname', 'I-organizer', 'I-place'
]

transformer = "roberta-base"  # bert-base-uncased bert-base-cased albert-base-v2 roberta-base

max_len = max([len(sent) for sent in training["sentences"]])

model = NERDA(
    dataset_training=training,
    dataset_validation=validation,
    tag_scheme=tag_scheme,
    tag_outside="O",
    transformer=transformer,
    max_len=max_len,
    dropout=0.1,  # Higher dropouts can be used.
    validation_batch_size=8,
    hyperparameters={"train_batch_size": 8}  # Higher batch sizes caused issues in our training runs, but we think it is not ideal to leave this as low as 8.
)

model.train()
print(model.evaluate_performance(validation, batch_size=1))

# Predict the test data and dump it to a text file. This may create minor
# formatting issues, which we fixed by hand as post-processing.
res = ""
for i in tqdm(range(len(testing["sentences"]))):
    res += testing["sentences"][i] + "\t" + model.predict(
# hyperparameters for network
dropout = 0.1

# hyperparameters for training
training_hyperparameters = {'epochs': 1,
                            'warmup_steps': 500,
                            'train_batch_size': 1,
                            'learning_rate': 0.0001}

from NERDA.models import NERDA
model = NERDA(dataset_training=training,
              dataset_validation=validation,
              tag_scheme=tag_scheme,
              tag_outside='O',
              transformer=transformer,
              dropout=dropout,
              hyperparameters=training_hyperparameters)

model.train()

# test = get_conll_data('test')
# model.evaluate_performance(test)

# eng train data on pl bert
#   Level    F1-Score
# 0 B-PER    0.884507
# 1 I-PER    0.941590
# 2 B-ORG    0.771963