# Objective for hyperparameter search: train a small NERDA model on a subset of
# DaNE and return its validation loss.
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA

def objective(params):
    print(params)
    model = NERDA(dataset_training=get_dane_data('train', 20),
                  dataset_validation=get_dane_data('dev', 20),
                  hyperparameters=params)
    model.train()
    return model.valid_loss
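# A minimal usage sketch for the objective above (assumption: it is meant to be fed
# into some hyperparameter search; the candidate values below are illustrative and
# not from the original). A plain grid search keeps the sketch dependency-free.
candidate_learning_rates = [0.0001, 0.00005]
results = {}
for lr in candidate_learning_rates:
    params = {'epochs': 1,
              'warmup_steps': 10,
              'train_batch_size': 5,
              'learning_rate': lr}
    results[lr] = objective(params)
best_lr = min(results, key=results.get)
print(f"best learning rate (lowest validation loss): {best_lr}")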
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA

def test_training_bert():
    """Test that training runs with a multilingual BERT transformer"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='bert-base-multilingual-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})
    m.train()
def test_training_exceed_maxlen():
    """Test that training does not break even though MAX LEN is exceeded"""
    m = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              max_len=3,
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})
    m.train()
from NERDA.datasets import get_dane_data
from NERDA.models import NERDA
import nltk

# instantiate a minimal model.
model = NERDA(dataset_training=get_dane_data('train', 5),
              dataset_validation=get_dane_data('dev', 5),
              transformer='Maltehb/-l-ctra-danish-electra-small-uncased',
              hyperparameters={'epochs': 1,
                               'warmup_steps': 10,
                               'train_batch_size': 5,
                               'learning_rate': 0.0001})

# set example texts to identify entities in.
text_single = "Pernille Rosenkrantz-Theil kommer fra Vejle"
sentences = [nltk.word_tokenize(text_single)]

# run prediction once at module level so the tests below can share the result.
predictions = model.predict(sentences)

def test_predict():
    """Test that predict runs"""
    model.predict(sentences)

def test_predict_type():
    """Test that token predictions are returned as a list"""
    assert isinstance(predictions, list)
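# A hedged extra check (assumption: NERDA.predict returns one list of predicted tags
# per input sentence; this test is not part of the original suite).
def test_predict_length():
    """Test that one prediction list is returned per input sentence (assumed format)."""
    assert len(predictions) == len(sentences)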
from NERDA.models import NERDA
from NERDA.datasets import get_conll_data, get_dane_data
from transformers import AutoConfig, AutoModel, AutoTokenizer

# Scratch script: survey how long the DaNE training sentences become after
# wordpiece tokenization, to inform the choice of max_len.
trans = 'bert-base-multilingual-uncased'
tokenizer = AutoTokenizer.from_pretrained(trans, do_lower_case=True)

data = get_dane_data('train')
sents = data.get('sentences')

out = []
for sent in sents:
    tokens = []
    for word in sent:
        tokens.extend(tokenizer.tokenize(word))
    out.append(tokens)

lens = [len(x) for x in out]
max(lens)
# inspect a specific long sentence.
sents[3595]

t = 'google/electra-small-discriminator'
cfg = AutoConfig.from_pretrained(t)
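# A minimal follow-up sketch (assumption: the length survey above is meant to guide
# the choice of max_len for a NERDA model; the candidate values are illustrative).
for candidate in (64, 128, 256):
    n_truncated = sum(l > candidate for l in lens)
    print(f"max_len={candidate}: {n_truncated} of {len(lens)} sentences would be truncated")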