def __init__(self, path):
    print(path)
    self.lstmModel = BiLSTM.loadModel("models/" + path)
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
def eval_single_task(model_path, dataset_id, task, evaluator, embeddings, mappings, data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)

    # create dataset dictionary
    dataset = Dataset(dataset_id)
    dataset_dict = dataset.to_dict(task)

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(dataset_dict, data)

    # obtain mapping of indices to POS/NER labels
    label = task + '_BIO' if task == 'NER' else task
    idx2label = model.idx2Labels[label]

    # obtain train and test data
    train_data = data[dataset_id]['trainMatrix']
    test_data = data[dataset_id]['testMatrix']

    # obtain correct and predicted sentences
    corr_idxs = [sentence[label] for sentence in test_data]
    pred_idxs = model.predictLabels(test_data)[label]

    # convert indices to labels (POS tags or NER tags in BIO format)
    corr_labels = [[idx2label[idx] for idx in sent] for sent in corr_idxs]
    pred_labels = [[idx2label[idx] for idx in sent] for sent in pred_idxs]

    evaluator.eval(dataset.name, dataset.lang, task, corr_labels, pred_labels,
                   train_data, test_data)
    print(f'Evaluated single_task - {dataset_id} - {task}')
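# A hedged usage sketch for eval_single_task; the pickle path, the dataset id
# 'en_pos', the model path, and the Evaluator class are illustrative
# assumptions, not confirmed names from this codebase.
embeddings, mappings, data = loadDatasetPickle('pkl/en_pos.pkl')  # hypothetical pickle
evaluator = Evaluator()  # hypothetical evaluator implementing .eval(...)
eval_single_task('models/single_task/en_pos.h5', 'en_pos', 'POS',
                 evaluator, embeddings, mappings, data)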
def __init__(self, path): print("Init Model") try: nltk.data.find('tokenizers/punkt') except LookupError: nltk.download('punkt') if (self.lstmModel is None): self.lstmModel = BiLSTM.loadModel(path)
def eval_multi_task(model_path, lang, task, evaluators, embeddings, mappings, data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)
    print(f'Loaded model {model_path}')

    # obtain the evaluator based on the transfer setting
    if model_path.parent.name == 'single_task':
        transfer_setting = 'out_of_domain'
    elif lang is not None and task is not None:
        transfer_setting = 'cross_domain'
    elif lang is not None and task is None:
        transfer_setting = 'multi_task'
    elif lang is None and task is None:
        transfer_setting = 'cross_lingual'
    else:
        raise ValueError('Unknown transfer setting')

    evaluator = evaluators[transfer_setting]

    # create datasets dictionary
    datasets = Datasets(lang=lang, task=task)
    datasets_dict = datasets.to_dict()

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)

    # evaluate each dataset separately
    for dataset_id, dataset in datasets:
        # obtain train and test data
        train_data = data[dataset_id]['trainMatrix']
        test_data = data[dataset_id]['testMatrix']

        # predict labels for the POS and NER tasks
        task_predictions = model.predictLabels(test_data)

        # iterate through the available output tasks
        for label in model.labelKeys[dataset_id]:
            # obtain mapping of indices to POS/NER labels
            task = label.replace('_BIO', '')
            idx2label = model.idx2Labels[label]

            # obtain correct and predicted sentences
            corr_idxs = [sentence[label] for sentence in test_data]
            pred_idxs = task_predictions[label]

            # convert indices to labels (POS tags or NER tags in BIO format)
            corr_labels = [[idx2label[idx] for idx in sent] for sent in corr_idxs]
            pred_labels = [[idx2label[idx] for idx in sent] for sent in pred_idxs]

            evaluator.eval(dataset.name, dataset.lang, task, corr_labels, pred_labels,
                           train_data, test_data)
            print(f'Evaluated {transfer_setting} - {dataset_id} - {task}')
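# Worked examples of the transfer-setting dispatch above (a sketch; the model
# paths and language/task values are hypothetical):
#   Path('models/single_task/en_pos.h5'), any lang/task    -> 'out_of_domain'
#   Path('models/multi_task/de.h5'), lang='DE', task='POS' -> 'cross_domain'
#   Path('models/multi_task/de.h5'), lang='DE', task=None  -> 'multi_task'
#   Path('models/multi_task/all.h5'), lang=None, task=None -> 'cross_lingual'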
def load_model():
    # load the pre-trained Keras model
    global MODEL
    global TRAINING_TAGS
    MODEL = BiLSTM.loadModel("./model/rest_model.h5")
    loaded_model = list(MODEL.models.values())[0]
    # this is to avoid trouble with Flask's multiple threads
    loaded_model._make_predict_function()
    TRAINING_TAGS = get_model_tags(MODEL)
    logging.info("Loaded NER model...")
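# A minimal sketch of a Flask route that could sit next to load_model(); the
# route name, request format, and nltk-based sentence splitting are
# assumptions, not part of the original service.
from flask import Flask, request, jsonify

app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    # assumes load_model() ran at startup, so MODEL is initialised
    text = request.get_json(force=True)['text']
    sentences = [{'tokens': nltk.word_tokenize(sent)}
                 for sent in nltk.sent_tokenize(text)]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    dataMatrix = createMatrices(sentences, MODEL.mappings, True)
    return jsonify(MODEL.tagSentences(dataMatrix))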
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']
        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
def evaluate(args):
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    #fpath = 'models/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    modelPath, _ = get_last_model_path(save_dir, model_init)
    print(modelPath)

    inputPath = args.testFile
    inputColumns = {0: "tokens", 1: 'POS', 2: 'chunk_BIO'}

    resfpath = args.result_save + '/' + args.task + '/' + args.testSetting
    resfile = open(resfpath, 'w')

    # :: Load the model ::
    #lstmModel = ELMoBiLSTM.loadModel(modelPath)
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    sentences = readCoNLL(inputPath, inputColumns)
    addCharInformation(sentences)
    addCasingInformation(sentences)

    # :: Map casing and character information to integer indices ::
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    if args.task == "pos":
        # evaluation of POS tagging
        test_acc = lstmModel.computeAcc(args.datasetName, dataMatrix)
        print("Test-Data: Accuracy: %.4f" % test_acc)
        resfile.write("Test-Data: Accuracy: %.4f" % test_acc)
    elif args.task == "chunking":
        # evaluation of chunking
        test_pre, test_rec, test_f1 = lstmModel.computeF1(args.datasetName, dataMatrix)
        print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" % (test_pre, test_rec, test_f1))
        resfile.write("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" % (test_pre, test_rec, test_f1))

    resfile.close()
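# A hedged sketch of the argparse setup that evaluate(args) appears to
# expect; the exact flags and defaults are assumptions inferred from the
# attribute names used above.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--model_save', default='models')
parser.add_argument('--result_save', default='results')
parser.add_argument('--datasetName', required=True)
parser.add_argument('--task', choices=['pos', 'chunking'], required=True)
parser.add_argument('--testFile', required=True)
parser.add_argument('--testSetting', default='test')

evaluate(parser.parse_args())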
def run_experiment(dataset_id, dataset_dict, lang, task, data):
    # load the pre-trained BiLSTM model
    lang_prefix = f'{lang.lower()}_' if lang is not None else ''
    model = BiLSTM.loadModel(multi_task_models_dir / f'{lang_prefix}datasets.h5')

    # set the single-task dataset and select both tasks
    model.setDataset(dataset_dict, data)
    model.tasks = ['POS', 'NER_BIO']

    # paths to store the trained model and the model results
    experiment_name = f'{dataset_id}_{task.lower()}'
    pretrain_type = 'multi_task' if lang is not None else 'cross_lingual'
    model.modelSavePath = models_dir / f'pretrain_{pretrain_type}/{experiment_name}.h5'
    model.storeResults(results_dir / f'pretrain_{pretrain_type}/{experiment_name}.csv')

    # train the model - no need to build it here; training is not limited by
    # the epoch count, early stopping ends it
    model.fit(epochs=500)
"Usage: python RunModel_CoNLL_Format.py modelPath inputPathToConllFile" ) exit() modelPath = sys.argv[1] inputPath = sys.argv[2] inputColumns = {0: "tokens", 1: "NER_BIO"} #inputColumns = {0: "tokens", 1: "is_name", 2: "NER_BIO"} # :: Prepare the input :: sentences = readCoNLL(inputPath, inputColumns) addCharInformation(sentences) addCasingInformation(sentences) # :: Load the model :: lstmModel = BiLSTM.loadModel(modelPath) dataMatrix = createMatrices(sentences, lstmModel.mappings, True) # :: Tag the input :: tags = lstmModel.tagSentences(dataMatrix) # :: Output to stdout :: all_sentences_preds = [] for sentenceIdx in range(len(sentences)): tokens = sentences[sentenceIdx]['tokens'] correct_tag = sentences[sentenceIdx]['NER_BIO'] for tokenIdx in range(len(tokens)): tokenTags = [] for modelName in sorted(tags.keys()): tokenTags.append(correct_tag[tokenIdx]) # Predicted tag
class ModelIBM:
    lstmModel = BiLSTM.loadModel(modelPath)

    def __init__(self):
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def label(self, input):
        # prepare input
        sentences = [{'tokens': nltk.word_tokenize(sent)}
                     for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        # tag input
        tags = self.lstmModel.tagSentences(dataMatrix)

        # prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                sentence.append(currentWord)
            result.append(sentence)
        return json.dumps(result)

    def label_with_probs(self, input):
        # prepare input
        sentences = [{'tokens': nltk.word_tokenize(sent)}
                     for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        # tag input
        tags, probs = self.lstmModel.tagSentences_with_probs(dataMatrix)

        # prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                probTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
                    probTags.append(probs[modelName][sentenceIdx][tokenIdx])
                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                currentWord['prob'] = probTags[0]
                sentence.append(currentWord)
            result.append(sentence)
        return result
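# Hedged usage sketch for ModelIBM; the example sentence is arbitrary, and
# modelPath must already be defined at class-definition time for the
# lstmModel class attribute to load.
tagger = ModelIBM()
print(tagger.label("John Smith works at IBM in New York."))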
# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = prepareDataset(embeddingsPath, datasets)

############################################################################################################
#
# 2. Network training
#
############################################################################################################

# :: Load the embeddings and the dataset ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)

params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

print("***** Train the model with 1 epoch and store it to disk")
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/my_model_[Epoch].h5"
model.fit(epochs=1)

print("\n\n\n\n------------------------")
print("***** Load the model and continue training")
newModel = BiLSTM.loadModel('models/my_model_1.h5')
newModel.setDataset(datasets, data)
newModel.modelSavePath = "models/my_reloaded_model_[Epoch].h5"
newModel.fit(epochs=1)
print("***** retrained model stored at " + newModel.modelSavePath)
from __future__ import print_function
import os
import logging
import sys
from neuralnets.BiLSTM import BiLSTM
from util.preprocessing import perpareDataset, loadDatasetPickle

# :: Change into the working dir of the script ::
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

bilstm = BiLSTM.loadModel('results_emnlp/SCRATCH_2_4/models/ATIS.h5')
print(bilstm.models['ATIS'].summary())
}

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'more_embedding.tsv'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

modelPath = sys.argv[1]

# Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

print("\n\n\n\n------------------------")
print("Load the model and continue training")
newModel = BiLSTM.loadModel(modelPath)
print('loaded model ' + modelPath)
newModel.setDataset(datasets, data)

# network hyperparameter: early-stopping patience for the continued training
newModel.params['earlyStopping'] = 25
newModel.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
newModel.fit(epochs=70)
print("retrained model stored at " + newModel.modelSavePath)
if len(sys.argv) < 4:
    print("Usage: python3 runModel_singleOutput.py modelPath inputPath outputPath")
    exit()

modelPath = sys.argv[1]
inputPath = sys.argv[2]
outputPath = sys.argv[3]

if not os.path.exists(outputPath):
    os.makedirs(outputPath)

# :: Load the model ::
lstmModel = BiLSTM()
lstmModel.loadModel(modelPath)

for textName in os.listdir(inputPath):
    with open(inputPath + "/" + textName, 'r') as f:
        text = f.read()

    # :: Prepare the input ::
    sentences = [{'tokens': nltk.word_tokenize(text)}]
    #addCharInformation(sentences)
    addCasingInformation(sentences)
    dataMatrix = createMatrices(sentences, lstmModel.mappings)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)
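    # A hedged sketch of the write-out step - the original snippet ends
    # before anything is written to outputPath; assumes the whole text was
    # tokenized as a single sentence, as above.
    with open(outputPath + "/" + textName, 'w') as out:
        tokens = sentences[0]['tokens']
        for tokenIdx in range(len(tokens)):
            tokenTags = [tags[modelName][0][tokenIdx]
                         for modelName in sorted(tags.keys())]
            out.write("%s\t%s\n" % (tokens[tokenIdx], "\t".join(tokenTags)))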
#if len(sys.argv) < 4:
#    print("Usage: python RunModel.py modelPath inputPathToConllFile outputPathToConllFile")
#    exit()

#modelPath = sys.argv[1]
#inputPath = sys.argv[2]
#outputPath = sys.argv[3]

inputColumns = {0: "tokens", 1: "gold"}

# :: Prepare the input ::
sentences = readCoNLL(args.input_file, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(args.model_path)
params = lstmModel.get_params()
#print("params : {}".format(params))

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
f = None
if args.output_file is not None:
    f = open(args.output_file, "w")

for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']
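    # Hedged continuation sketch - the original snippet is cut off here:
    # print one token per line with its predicted tags, and mirror the lines
    # to the output file when one was given.
    for tokenIdx in range(len(tokens)):
        tokenTags = [tags[modelName][sentenceIdx][tokenIdx]
                     for modelName in sorted(tags.keys())]
        line = "%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags))
        print(line)
        if f is not None:
            f.write(line + "\n")
    print("")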