# Imports needed by the functions in this section. The Sequence wrapper, the
# load_data_and_labels_* readers, shuffle_arrays(), stats() and configure() are
# assumed to be defined or imported elsewhere in this module.
import time

import numpy as np
from sklearn.model_selection import train_test_split


def eval(dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
         use_ELMo=False, use_BERT=False, data_path=None):
    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL-2003 NER data...')
        x_test, y_test = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_eval=x_test, y_eval=y_test)

        # load model
        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name)
        model.load()

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_test, y_test = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_eval=x_test, y_eval=y_test)

        # load model
        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name)
        model.load()

    else:
        print("dataset/language combination is not supported for fixed eval:", dataset_type, lang)
        return

    start_time = time.time()

    print("\nEvaluation on test set:")
    model.eval(x_test, y_test)
    runtime = round(time.time() - start_time, 3)

    print("runtime: %s seconds" % runtime)
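# Usage sketch (hypothetical call; it assumes a model previously trained and saved
# by one of the train functions below, with matching flags, so model.load() can
# find the weights on disk):
#
#     eval(dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF', use_ELMo=True)
#
# eval() rebuilds the model directory name from the dataset, the ELMo/BERT flags
# and the architecture, so these arguments must match the training run exactly.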
def train(dataset_type='conll2003', lang='en', embeddings_name=None, architecture='BidLSTM_CRF',
          transformer=None, data_path=None, use_ELMo=False):
    (batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch,
     embeddings_name, word_lstm_units, multiprocessing) = configure(
        architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # randomly split into train and valid sets
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate all sets (train, dev and test)
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # randomly split into train and valid sets
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    elif lang == 'fr':
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, architecture=architecture,
                         transformer_name=transformer, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds" % runtime)

    # saving the model
    model.save()
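# Usage sketch for train() above. The embedding name 'glove-840B' is an assumption
# and stands for whatever embedding is registered locally; hyperparameters come from
# configure(). This trains on the union of all CoNLL-2003 splits and saves the model:
#
#     train(dataset_type='conll2003', lang='en', embeddings_name='glove-840B',
#           architecture='BidLSTM_CRF')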
def train_eval(embeddings_name=None, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
               transformer=None, fold_count=1, train_with_validation_set=False, data_path=None,
               use_ELMo=False):
    (batch_size, max_sequence_length, patience, recurrent_dropout, early_stop, max_epoch,
     embeddings_name, word_lstm_units, multiprocessing) = configure(
        architecture, dataset_type, lang, embeddings_name, use_ELMo)

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training to the train set and use the validation set for early stop,
            # as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, fold_number=fold_count,
                             architecture=architecture, transformer_name=transformer,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             early_stop=True, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results (~ +0.5 F1 with CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print("Loading all Ontonotes 5.0 XML data, evaluation will be on a 10% random partition")
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, fold_number=fold_count,
                         architecture=architecture, transformer_name=transformer,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, fold_number=fold_count,
                             architecture=architecture, transformer_name=transformer,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             early_stop=True, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    elif (lang == 'fr') and (dataset_type == 'ftb' or dataset_type is None):
        print('Loading data for ftb...')
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        shuffle_arrays([x_all, y_all])
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        model = Sequence(model_name,
                         max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embeddings_name, fold_number=fold_count,
                         architecture=architecture, transformer_name=transformer,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         early_stop=early_stop, patience=patience,
                         max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                         multiprocessing=multiprocessing)

    elif (lang == 'fr') and (dataset_type == 'ftb_force_split'):
        print('Loading data for ftb_force_split...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/leMonde/ftb6_train.conll')
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/leMonde/ftb6_dev.conll')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/leMonde/ftb6_test.conll')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training to the train set and use the validation set for early stop,
            # as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=True,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results (~ +0.5 F1 with CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    elif (lang == 'fr') and (dataset_type == 'ftb_force_split_xml'):
        print('Loading data for ftb_force_split_xml...')
        x_train, y_train = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.train.xml')
        shuffle_arrays([x_train, y_train])
        x_valid, y_valid = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.dev.xml')
        x_eval, y_eval = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.test.xml')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde-force-split-xml-' + architecture
        if use_ELMo:
            model_name += '-with_ELMo'

        if not train_with_validation_set:
            # restrict training to the train set and use the validation set for early stop,
            # as in most papers
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=True,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results (~ +0.5 F1 with CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embeddings_name, early_stop=False,
                             fold_number=fold_count, architecture=architecture,
                             transformer_name=transformer, word_lstm_units=word_lstm_units,
                             batch_size=batch_size, patience=patience,
                             max_sequence_length=max_sequence_length, use_ELMo=use_ELMo,
                             multiprocessing=multiprocessing)

    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid=x_valid, y_valid=y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds" % runtime)

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model (must be called after eval for multiple fold training)
    model.save()
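# Usage sketches for train_eval() above. A standard CoNLL-2003 run trains on eng.train,
# early-stops on eng.testa and reports on eng.testb:
#
#     train_eval(embeddings_name='glove-840B', dataset_type='conll2003', lang='en',
#                architecture='BidLSTM_CRF')
#
# With fold_count > 1 the model is trained n times via train_nfold(). The architecture
# and transformer names below are assumptions; any transformer identifier resolvable
# by the Sequence wrapper should work with a transformer-based architecture:
#
#     train_eval(dataset_type='conll2003', lang='en', architecture='BERT_CRF',
#                transformer='bert-base-cased', fold_count=10)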
def train_eval(embedding_name, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
               fold_count=1, train_with_validation_set=False, use_ELMo=False, use_BERT=False,
               data_path=None):
    if architecture == "BidLSTM_CNN_CRF":
        word_lstm_units = 200
        max_epoch = 30
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        max_epoch = 25
        recurrent_dropout = 0.5

    if use_ELMo or use_BERT:
        batch_size = 120
    else:
        batch_size = 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading CoNLL 2003 data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            # restrict training to the train set and use the validation set for early stop,
            # as in most papers
            model = Sequence(model_name,
                             max_epoch=60, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name, early_stop=True,
                             fold_number=fold_count, model_type=architecture,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             use_ELMo=use_ELMo, use_BERT=use_BERT)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results (~ +0.5 F1 with CoNLL-2003)
            model = Sequence(model_name,
                             max_epoch=max_epoch, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name, early_stop=False,
                             fold_number=fold_count, model_type=architecture,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             use_ELMo=use_ELMo, use_BERT=use_BERT)

    elif (dataset_type == 'ontonotes-all') and (lang == 'en'):
        print('Loading Ontonotes 5.0 XML data...')
        x_all, y_all = load_data_and_labels_ontonotes(data_path)
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-ontonotes'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, early_stop=True,
                         fold_number=fold_count, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train, y_train = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_valid, y_valid = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_eval, y_eval = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        if not train_with_validation_set:
            model = Sequence(model_name,
                             max_epoch=80, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name, early_stop=True,
                             fold_number=fold_count, model_type=architecture,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             use_ELMo=use_ELMo, use_BERT=use_BERT)
        else:
            # also use the validation set for training (no early stop, hyperparameters must
            # be set beforehand), as in (Chiu & Nichols, 2016) and (Peters et al., 2017);
            # this obviously leads to much higher results
            model = Sequence(model_name,
                             max_epoch=40, recurrent_dropout=recurrent_dropout,
                             embeddings_name=embedding_name, early_stop=False,
                             fold_number=fold_count, model_type=architecture,
                             word_lstm_units=word_lstm_units, batch_size=batch_size,
                             use_ELMo=use_ELMo, use_BERT=use_BERT)

    elif lang == 'fr':
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_train_all, x_eval, y_train_all, y_eval = train_test_split(x_all, y_all, test_size=0.1)
        x_train, x_valid, y_train, y_valid = train_test_split(x_train_all, y_train_all, test_size=0.1)
        stats(x_train, y_train, x_valid, y_valid, x_eval, y_eval)

        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
            # custom batch size for French ELMo
            batch_size = 20
        elif use_BERT:
            # need to find a French BERT :/
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, early_stop=True,
                         fold_number=fold_count, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)

    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    if fold_count == 1:
        model.train(x_train, y_train, x_valid, y_valid)
    else:
        model.train_nfold(x_train, y_train, x_valid, y_valid, fold_number=fold_count)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds" % runtime)

    print("\nEvaluation on test set:")
    model.eval(x_eval, y_eval)

    # saving the model
    model.save()
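# Usage sketch for this legacy train_eval() variant, which keeps the older
# model_type=/use_BERT= keyword API instead of architecture=/transformer_name=
# (the embedding name is again an assumption; note embedding_name is positional here):
#
#     train_eval('glove-840B', dataset_type='conll2012', lang='en',
#                train_with_validation_set=True)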
def train(embedding_name, dataset_type='conll2003', lang='en', architecture='BidLSTM_CRF',
          use_ELMo=False, use_BERT=False, data_path=None):
    if architecture == "BidLSTM_CNN_CRF":
        word_lstm_units = 200
        recurrent_dropout = 0.5
    else:
        word_lstm_units = 100
        recurrent_dropout = 0.5

    if use_ELMo:
        batch_size = 100
    else:
        batch_size = 20

    if (dataset_type == 'conll2003') and (lang == 'en'):
        print('Loading data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testa')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2003/eng.testb')

        # we concatenate all sets
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # randomly split into train and valid sets
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2003'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)

    elif (dataset_type == 'conll2012') and (lang == 'en'):
        print('Loading Ontonotes 5.0 CoNLL-2012 NER data...')
        x_train1, y_train1 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.train')
        x_train2, y_train2 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.dev')
        x_train3, y_train3 = load_data_and_labels_conll('data/sequenceLabelling/CoNLL-2012-NER/eng.test')

        # we concatenate all sets (train, dev and test)
        x_all = np.concatenate((x_train1, x_train2, x_train3), axis=0)
        y_all = np.concatenate((y_train1, y_train2, y_train3), axis=0)

        # randomly split into train and valid sets
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-en-conll2012'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=80, recurrent_dropout=0.20,
                         embeddings_name=embedding_name, early_stop=True,
                         model_type=architecture, word_lstm_units=word_lstm_units,
                         batch_size=batch_size, use_ELMo=use_ELMo, use_BERT=use_BERT)

    elif lang == 'fr':
        print('Loading data...')
        dataset_type = 'lemonde'
        x_all, y_all = load_data_and_labels_lemonde('data/sequenceLabelling/leMonde/ftb6_ALL.EN.docs.relinked.xml')
        x_train, x_valid, y_train, y_valid = train_test_split(x_all, y_all, test_size=0.1)

        stats(x_train, y_train, x_valid, y_valid)

        model_name = 'ner-fr-lemonde'
        if use_ELMo:
            model_name += '-with_ELMo'
        elif use_BERT:
            model_name += '-with_BERT'
        model_name += '-' + architecture

        model = Sequence(model_name,
                         max_epoch=60, recurrent_dropout=recurrent_dropout,
                         embeddings_name=embedding_name, model_type=architecture,
                         word_lstm_units=word_lstm_units, batch_size=batch_size,
                         use_ELMo=use_ELMo, use_BERT=use_BERT)

    else:
        print("dataset/language combination is not supported:", dataset_type, lang)
        return

    start_time = time.time()
    model.train(x_train, y_train, x_valid, y_valid)
    runtime = round(time.time() - start_time, 3)
    print("training runtime: %s seconds" % runtime)

    # saving the model
    model.save()
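# Usage sketch for this legacy train() variant; the French embedding name 'wiki.fr'
# is an assumption, any registered French embedding would do:
#
#     train('wiki.fr', lang='fr', use_ELMo=True)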