def label_with_probs(self, input):
    """Tokenize raw text, tag it, and return per-token labels with probabilities.

    Returns a list of sentences; each sentence is a list of dicts with
    ``token``, ``label`` and ``prob`` keys taken from the first model
    (alphabetically) returned by the tagger.
    """
    # Split the raw text into sentences/words and attach the character
    # and casing features the model's input pipeline expects.
    sentences = [{'tokens': nltk.word_tokenize(sent)}
                 for sent in nltk.sent_tokenize(input)]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    data_matrix = createMatrices(sentences, self.lstmModel.mappings, True)

    # Both returned dicts are keyed by model name, then indexed
    # [sentence][token].
    tags, probs = self.lstmModel.tagSentences_with_probs(data_matrix)
    model_names = sorted(tags.keys())

    result = []
    for sent_idx, sent in enumerate(sentences):
        labelled_sentence = []
        for tok_idx, token in enumerate(sent['tokens']):
            token_tags = [tags[name][sent_idx][tok_idx] for name in model_names]
            token_probs = [probs[name][sent_idx][tok_idx] for name in model_names]
            labelled_sentence.append({
                'token': token,
                'label': token_tags[0],
                'prob': token_probs[0],
            })
        result.append(labelled_sentence)
    return result
def run_model(file):
    """Tag the text in *file* and write CoNLL-style output to system_output/.

    Each output line is ``<sentence-number>\t<token>\t<tag-per-model>``,
    with a blank line between sentences.
    """
    # :: Read input ::
    with open(file, 'r') as f:
        text = f.read()

    # :: Prepare the input ::
    sentences = tokenize(text)
    addCharInformation(sentences)
    addCasingInformation(sentences)
    data_matrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(data_matrix)
    model_names = sorted(tags.keys())

    lines = []
    for sent_idx, sent in enumerate(sentences):
        for tok_idx, token in enumerate(sent['tokens']):
            token_tags = [tags[name][sent_idx][tok_idx] for name in model_names]
            lines.append("%s\t%s\t%s" % (sent_idx + 1, token, "\t".join(token_tags)))
        # Blank line separates sentences in CoNLL output.
        lines.append("")

    output_filename = 'system_output/system-' + file.split('/')[-1]
    with open(output_filename, 'w') as outfile:
        outfile.write("\n".join(lines))
def prepare_input(text):
    """Pre-treat, tokenize and featurize *text* for the global MODEL.

    Returns ``(data_matrix, sentences)`` where *sentences* is the list of
    ``{'tokens': ...}`` dicts (enriched in place with char/casing/is-name
    features) and *data_matrix* is the model-ready index matrix.
    """
    pre_treated_lines, _ = pre_treat_text(text.strip())
    sentences = [{'tokens': tokens} for tokens in tokenize_text(pre_treated_lines)]

    # Attach the features MODEL was trained with.
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=KEYWORD_PROCESSOR)

    data_matrix = createMatrices(sentences, MODEL.mappings, True)
    return data_matrix, sentences
def evaluate(args):
    """Evaluate a saved ELMo-BiLSTM checkpoint on a CoNLL test file.

    Loads the most recent checkpoint matching ``<model_save>/<datasetName>_1.h5``,
    tags ``args.testFile``, and writes the metric line to
    ``<result_save>/<task>/<testSetting>``:
      * task == "pos"       -> accuracy
      * task == "chunking"  -> precision / recall / F1
    Any other task value produces an empty result file.
    """
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    modelPath, _ = get_last_model_path(save_dir, model_init)
    print(modelPath)

    inputPath = args.testFile
    inputColumns = {0: "tokens", 1: 'POS', 2: 'chunk_BIO'}
    resfpath = args.result_save + '/' + args.task + '/' + args.testSetting

    # :: Load the model ::
    lstmModel = ELMoBiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    sentences = readCoNLL(inputPath, inputColumns)
    addCharInformation(sentences)
    addCasingInformation(sentences)

    # :: Map casing and character information to integer indices ::
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Perform the word embedding / ELMo embedding lookup ::
    embLookup = lstmModel.embeddingsLookup
    embLookup.elmo_cuda_device = 0  # CUDA device for pytorch ELMo embedding; -1 for CPU
    addEmbeddings(dataMatrix, embLookup.sentenceLookup)

    # FIX: the original opened the result file eagerly and only closed it at
    # the end, so the handle leaked (and the file was truncated) whenever
    # model loading or evaluation raised. A context manager guarantees the
    # file is closed on every path.
    with open(resfpath, 'w') as resfile:
        if args.task == "pos":
            # Evaluation of POS tagging
            test_acc = lstmModel.computeAcc(args.datasetName, dataMatrix)
            print("Test-Data: Accuracy: %.4f" % (test_acc))
            resfile.write("Test-Data: Accuracy: %.4f" % (test_acc))
        elif args.task == "chunking":
            # Evaluation of Chunking
            test_pre, test_rec, test_f1 = lstmModel.computeF1(
                args.datasetName, dataMatrix)
            print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
                  (test_pre, test_rec, test_f1))
            resfile.write("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
                          (test_pre, test_rec, test_f1))
def main():
    """CLI entry point: tag the text in ``sys.argv[2]`` with the BiLSTM model
    at ``sys.argv[1]`` and print token/tag pairs to stdout.

    Output format: one ``token\t<tag-per-model>`` line per token, with a
    blank line between sentences.
    """
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        # FIX: use sys.exit instead of the site-module convenience exit();
        # exit() is intended for the interactive interpreter and may be
        # absent when scripts run without the site module.
        sys.exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(
        list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']
        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
if len(sys.argv) < 3: print( "Usage: python RunModel_CoNLL_Format.py modelPath inputPathToConllFile" ) exit() modelPath = sys.argv[1] inputPath = sys.argv[2] inputColumns = {0: "tokens", 1: "NER_BIO"} #inputColumns = {0: "tokens", 1: "is_name", 2: "NER_BIO"} # :: Prepare the input :: sentences = readCoNLL(inputPath, inputColumns) addCharInformation(sentences) addCasingInformation(sentences) # :: Load the model :: lstmModel = BiLSTM.loadModel(modelPath) dataMatrix = createMatrices(sentences, lstmModel.mappings, True) # :: Tag the input :: tags = lstmModel.tagSentences(dataMatrix) # :: Output to stdout :: all_sentences_preds = [] for sentenceIdx in range(len(sentences)): tokens = sentences[sentenceIdx]['tokens'] correct_tag = sentences[sentenceIdx]['NER_BIO'] for tokenIdx in range(len(tokens)):