Example #1
def __init__(self, path):
    print(path)
    self.lstmModel = BiLSTM.loadModel("models/" + path)
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
Example #2
def eval_single_task(model_path, dataset_id, task, evaluator, embeddings,
                     mappings, data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)

    # create dataset dictionary
    dataset = Dataset(dataset_id)
    dataset_dict = dataset.to_dict(task)

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(dataset_dict, data)

    # obtain mapping of indices to POS/NER labels
    label = task + '_BIO' if task == 'NER' else task
    idx2label = model.idx2Labels[label]

    # obtain train and test data
    train_data = data[dataset_id]['trainMatrix']
    test_data = data[dataset_id]['testMatrix']

    # obtain correct and predicted sentences
    corr_idxs = [sentence[label] for sentence in test_data]
    pred_idxs = model.predictLabels(test_data)[label]

    # convert indices to labels (POS tags or NER tags in BIO format)
    corr_labels = [[idx2label[idx] for idx in sent] for sent in corr_idxs]
    pred_labels = [[idx2label[idx] for idx in sent] for sent in pred_idxs]

    evaluator.eval(dataset.name, dataset.lang, task, corr_labels, pred_labels,
                   train_data, test_data)
    print(f'Evaluated single_task - {dataset_id} - {task}')
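The two list comprehensions above simply map index sequences back to tag strings; a toy sketch with a hypothetical mapping:

# Toy illustration of the index-to-label conversion used above; the mapping and
# index values are made up, not taken from a real model.
idx2label = {0: 'O', 1: 'B-PER', 2: 'I-PER'}
pred_idxs = [[1, 2, 0], [0, 0]]
pred_labels = [[idx2label[idx] for idx in sent] for sent in pred_idxs]
# pred_labels == [['B-PER', 'I-PER', 'O'], ['O', 'O']]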
Example #3
def __init__(self, path):
    print("Init Model")
    try:
        nltk.data.find('tokenizers/punkt')
    except LookupError:
        nltk.download('punkt')
    if self.lstmModel is None:
        self.lstmModel = BiLSTM.loadModel(path)
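Note that this __init__ only makes sense if the class declares lstmModel as a class attribute, so the is None check has something to test on first use; a minimal hypothetical skeleton:

# Hypothetical surrounding class; only the class-level default is needed for
# the "self.lstmModel is None" check above to work on first instantiation.
class Model:
    lstmModel = None  # shared across instances, initialised lazily in __init__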
Example #4
def eval_multi_task(model_path, lang, task, evaluators, embeddings, mappings,
                    data):
    # load the BiLSTM model
    model = BiLSTM.loadModel(model_path)
    print(f'Loaded model {model_path}')

    # obtain the evaluator based on the transfer setting
    if model_path.parent.name == 'single_task':
        transfer_setting = 'out_of_domain'
    elif lang is not None and task is not None:
        transfer_setting = 'cross_domain'
    elif lang is not None and task is None:
        transfer_setting = 'multi_task'
    elif lang is None and task is None:
        transfer_setting = 'cross_lingual'
    else:
        raise ValueError('Unknown transfer setting')

    evaluator = evaluators[transfer_setting]

    # create datasets dictionary
    datasets = Datasets(lang=lang, task=task)
    datasets_dict = datasets.to_dict()

    # set the model mappings and datasets
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)

    # evaluate each dataset separately
    for dataset_id, dataset in datasets:
        # obtain train and test data
        train_data = data[dataset_id]['trainMatrix']
        test_data = data[dataset_id]['testMatrix']

        # predict labels for the POS and NER tasks
        task_predictions = model.predictLabels(test_data)

        # iterate through the available output tasks
        for label in model.labelKeys[dataset_id]:
            # obtain mapping of indices to POS/NER labels
            task = label.replace('_BIO', '')
            idx2label = model.idx2Labels[label]

            # obtain correct and predicted sentences
            corr_idxs = [sentence[label] for sentence in test_data]
            pred_idxs = task_predictions[label]

            # convert indices to labels (POS tags or NER tags in BIO format)
            corr_labels = [[idx2label[idx] for idx in sent]
                           for sent in corr_idxs]
            pred_labels = [[idx2label[idx] for idx in sent]
                           for sent in pred_idxs]

            evaluator.eval(dataset.name, dataset.lang, task, corr_labels,
                           pred_labels, train_data, test_data)
            print(f'Evaluated {transfer_setting} - {dataset_id} - {task}')
Example #5
def load_model():
    # load the pre-trained Keras model
    global MODEL
    global TRAINING_TAGS

    MODEL = BiLSTM.loadModel("./model/rest_model.h5")
    loaded_model = list(MODEL.models.values())[0]
    loaded_model._make_predict_function()  # avoid issues with Flask's multiple threads
    TRAINING_TAGS = get_model_tags(MODEL)
    logging.info("Loaded NER model...")
Example #6
def main():
    if len(sys.argv) < 3:
        print("Usage: python RunModel_modified.py modelPath inputPath")
        exit()

    modelPath = sys.argv[1]
    inputPath = sys.argv[2]

    # :: Read input ::
    with open(inputPath, 'r') as f:
        text = f.read()

    # :: Load vocabulary for is_name features ::
    from flashtext import KeywordProcessor
    keyword_processor = KeywordProcessor()
    keyword_processor.add_keywords_from_list(list(load_names(FR_NAMES_PATH).keys()))

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    pre_treated_lines, _ = pre_treat_text(text)
    tokenized_sentences = tokenize_text(pre_treated_lines)
    sentences = [{'tokens': sent} for sent in tokenized_sentences]
    addCharInformation(sentences)
    addCasingInformation(sentences)
    addIsNameInformation(sentences, keyword_processor=keyword_processor)
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)

    # :: Output to stdout ::
    for sentenceIdx in range(len(sentences)):
        tokens = sentences[sentenceIdx]['tokens']

        for tokenIdx in range(len(tokens)):
            tokenTags = []
            for modelName in sorted(tags.keys()):
                tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

            print("%s\t%s" % (tokens[tokenIdx], "\t".join(tokenTags)))
        print("")
Example #7
def evaluate(args):
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    #fpath = 'models/'+args.datasetName+'_1.h5'
    save_dir, model_init = os.path.split(fpath)

    modelPath, _ = get_last_model_path(save_dir, model_init)
    print(modelPath)
    inputPath = args.testFile
    inputColumns = {0: "tokens", 1: 'POS', 2: 'chunk_BIO'}

    resfpath = args.result_save + '/' + args.task + '/' + args.testSetting
    resfile = open(resfpath, 'w')

    # :: Load the model ::
    #lstmModel = ELMoBiLSTM.loadModel(modelPath)

    # :: Load the model ::
    lstmModel = BiLSTM.loadModel(modelPath)

    # :: Prepare the input ::
    sentences = readCoNLL(inputPath, inputColumns)
    addCharInformation(sentences)
    addCasingInformation(sentences)

    # :: Map casing and character information to integer indices ::
    dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

    if (args.task == "pos"):
        # Evaluation of POS tagging
        test_acc = lstmModel.computeAcc(args.datasetName, dataMatrix)
        print("Test-Data: Accuracy: %.4f" % (test_acc))
        resfile.write("Test-Data: Accuracy: %.4f" % (test_acc))
    elif (args.task == "chunking"):
        # Evaluation of Chunking
        test_pre, test_rec, test_f1 = lstmModel.computeF1(
            args.datasetName, dataMatrix)
        print("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
              (test_pre, test_rec, test_f1))
        resfile.write("Test-Data: Prec: %.3f, Rec: %.3f, F1: %.4f" %
                      (test_pre, test_rec, test_f1))

    resfile.close()
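For reference, evaluate() only reads a handful of attributes from args; a hypothetical argparse setup whose names match the attributes used above (not the original script's CLI):

# Hypothetical argument parser matching the attributes evaluate() reads above;
# flag names and defaults are assumptions, not the original script's interface.
import argparse

def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('--model_save', default='models')
    parser.add_argument('--datasetName', required=True)
    parser.add_argument('--testFile', required=True)
    parser.add_argument('--result_save', default='results')
    parser.add_argument('--task', choices=['pos', 'chunking'], required=True)
    parser.add_argument('--testSetting', default='test')
    return parser.parse_args()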
Example #8
def run_experiment(dataset_id, dataset_dict, lang, task, data):
    # load the pre-trained BiLSTM model
    lang_prefix = f'{lang.lower()}_' if lang is not None else ''
    model = BiLSTM.loadModel(multi_task_models_dir /
                             f'{lang_prefix}datasets.h5')

    # set the single task dataset and select both tasks
    model.setDataset(dataset_dict, data)
    model.tasks = ['POS', 'NER_BIO']

    # path to store the trained model and model results
    experiment_name = f'{dataset_id}_{task.lower()}'
    pretrain_type = 'multi_task' if lang is not None else 'cross_lingual'

    model.modelSavePath = models_dir / f'pretrain_{pretrain_type}/{experiment_name}.h5'
    model.storeResults(results_dir /
                       f'pretrain_{pretrain_type}/{experiment_name}.csv')

    # train the model - no need to build model here
    model.fit(
        epochs=500)  # do not limit training by epochs - use early stopping
        "Usage: python RunModel_CoNLL_Format.py modelPath inputPathToConllFile"
    )
    exit()

modelPath = sys.argv[1]
inputPath = sys.argv[2]
inputColumns = {0: "tokens", 1: "NER_BIO"}
#inputColumns = {0: "tokens", 1: "is_name", 2: "NER_BIO"}

# :: Prepare the input ::
sentences = readCoNLL(inputPath, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(modelPath)

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
all_sentences_preds = []
for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']
    correct_tag = sentences[sentenceIdx]['NER_BIO']
    for tokenIdx in range(len(tokens)):
        tokenTags = []
        for modelName in sorted(tags.keys()):
            tokenTags.append(correct_tag[tokenIdx])  # note: appends the gold NER_BIO tag, not the model prediction
Example #10
class ModelIBM:

    lstmModel = BiLSTM.loadModel(modelPath)  # modelPath is expected to be defined at module level

    def __init__(self):
        try:
            nltk.data.find('tokenizers/punkt')
        except LookupError:
            nltk.download('punkt')

    def label(self, input):
        #prepare input
        sentences = [{
            'tokens': nltk.word_tokenize(sent)
        } for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        #tag input
        tags = self.lstmModel.tagSentences(dataMatrix)

        #prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])

                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                sentence.append(currentWord)
            result.append(sentence)

        return json.dumps(result)

    def label_with_probs(self, input):
        #prepare input
        sentences = [{
            'tokens': nltk.word_tokenize(sent)
        } for sent in nltk.sent_tokenize(input)]
        addCharInformation(sentences)
        addCasingInformation(sentences)
        dataMatrix = createMatrices(sentences, self.lstmModel.mappings, True)

        #tag input
        tags, probs = self.lstmModel.tagSentences_with_probs(dataMatrix)

        #prepare output
        result = []
        for sentenceIdx in range(len(sentences)):
            tokens = sentences[sentenceIdx]['tokens']
            sentence = []
            for tokenIdx in range(len(tokens)):
                tokenTags = []
                probTags = []
                currentWord = {}
                for modelName in sorted(tags.keys()):
                    tokenTags.append(tags[modelName][sentenceIdx][tokenIdx])
                    probTags.append(probs[modelName][sentenceIdx][tokenIdx])

                currentWord['token'] = tokens[tokenIdx]
                currentWord['label'] = tokenTags[0]
                currentWord['prob'] = probTags[0]
                sentence.append(currentWord)
            result.append(sentence)

        return result
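A minimal usage sketch for the class above (assuming modelPath is defined at module level and the model has a single output task; the sentence and labels are made up):

# Illustrative usage only.
model = ModelIBM()
print(model.label("Barack Obama visited Paris."))
# -> '[[{"token": "Barack", "label": "B-PER"}, {"token": "Obama", "label": "I-PER"}, ...]]'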
Example #11
# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
# :: Preprocess the data and store it as cPickle files ::
pickleFile = prepareDataset(embeddingsPath, datasets)


############################################################################################################
#
# 2.Network training
#
############################################################################################################
# :: Load the embeddings and the dataset ::
# :: Load the word embeddings and the training data ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)
params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

print("***** Train the model with 1 Epoch and store to disk")
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/my_model_[Epoch].h5"
model.fit(epochs=1)

print("\n\n\n\n------------------------")
print("***** Load the model and continue training")
newModel = BiLSTM.loadModel('models/my_model_1.h5')
newModel.setDataset(datasets, data)
newModel.modelSavePath = "models/my_reloaded_model_[Epoch].h5"
newModel.fit(epochs=1)
print("***** retrained model store at "+newModel.modelSavePath)
Example #12
from __future__ import print_function
import os
import logging
import sys
from neuralnets.BiLSTM import BiLSTM
from util.preprocessing import perpareDataset, loadDatasetPickle

# :: Change into the working dir of the script ::
abspath = os.path.abspath(__file__)
dname = os.path.dirname(abspath)
os.chdir(dname)

# :: Logging level ::
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

bilstm = BiLSTM.loadModel('results_emnlp/SCRATCH_2_4/models/ATIS.h5')
print(bilstm.models['ATIS'].summary())  # note: summary() prints the architecture itself and returns None
Example #13
}

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'more_embedding.tsv'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

modelPath = sys.argv[1]

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)
# Some network hyperparameters

print("\n\n\n\n------------------------")
print("Load the model and continue training")
newModel = BiLSTM.loadModel(modelPath)
print('load model ' + modelPath)
newModel.setDataset(datasets, data)
newModel.params['earlyStopping'] = 25
newModel.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
newModel.fit(epochs=70)

print("retrained model store at " + newModel.modelSavePath)
Example #14
if len(sys.argv) < 3:
    print(
        "Usage: python3 runModel_singleOutput.py modelPath inputPath outputPath"
    )
    exit()

modelPath = sys.argv[1]
inputPath = sys.argv[2]
outputPath = sys.argv[3]

if not os.path.exists(outputPath):
    os.makedirs(outputPath)

# :: Load the model ::
lstmModel = BiLSTM()
lstmModel.loadModel(modelPath)

for textName in os.listdir(inputPath):
    with open(inputPath + "/" + textName, 'r') as f:
        text = f.read()

    # :: Prepare the input ::
    sentences = [{'tokens': nltk.word_tokenize(text)}]
    #addCharInformation(sentences)
    addCasingInformation(sentences)

    dataMatrix = createMatrices(sentences, lstmModel.mappings)

    # :: Tag the input ::
    tags = lstmModel.tagSentences(dataMatrix)
Example #15
#if len(sys.argv) < 4:
#    print("Usage: python RunModel.py modelPath inputPathToConllFile outputPathToConllFile")
#    exit()

#modelPath = sys.argv[1]
#inputPath = sys.argv[2]
#outputPath = sys.argv[3]
inputColumns = {0: "tokens", 1: "gold"}

# :: Prepare the input ::
sentences = readCoNLL(args.input_file, inputColumns)
addCharInformation(sentences)
addCasingInformation(sentences)

# :: Load the model ::
lstmModel = BiLSTM.loadModel(args.model_path)
params = lstmModel.get_params()
#print("params : {}".format(params))

dataMatrix = createMatrices(sentences, lstmModel.mappings, True)

# :: Tag the input ::
tags = lstmModel.tagSentences(dataMatrix)

# :: Output to stdout ::
f = None
if args.output_file is not None:
    f = open(args.output_file, "w")

for sentenceIdx in range(len(sentences)):
    tokens = sentences[sentenceIdx]['tokens']