def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  # Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  # CoNLL format for the input data: column 0 contains the tokens, column 1 the POS tags, column 2 the BIO-encoded chunks
            'label': 'POS',  # Which column we want to predict
            'evaluate': True,  # Should we evaluate on this task? Always set to True for single-task setups
            'commentSymbol': None  # Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }
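    # A matching tab-separated CoNLL input file would look like, e.g. (hypothetical rows):
    #   The     DT    B-NP
    #   dog     NN    I-NP
    #   barked  VBD   B-VP
    # with one token per line and sentences separated by blank lines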

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # e.g. 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],  # CRF output layer (vs. 'Softmax')
        'LSTM-Size': [100],  # one BiLSTM layer with 100 recurrent units
        'dropout': (0.25, 0.25)  # a tuple enables variational dropout
    }
    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)

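    # [ModelName] and [Epoch] are template variables that the framework substitutes
    # when writing checkpoints (here: the dataset name and the epoch number, as the
    # fpath constructed below suggests)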
    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
    model.fit(epochs=25)

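    # remove_except_last_model (defined elsewhere) presumably prunes older
    # checkpoints from save_dir, keeping only the final model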
    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    remove_except_last_model(save_dir, model_init)
Example No. 2
def run_experiment(dataset_id, dataset_dict, task, embeddings, mappings, data):
    # set network hyperparameters and mappings/datasets
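    # network_params, models_dir and results_dir are assumed to be module-level
    # globals defined elsewhere in the script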
    model = BiLSTM(network_params)
    model.setMappings(mappings, embeddings)
    model.setDataset(dataset_dict, data)

    # path to store the trained model and model results
    experiment_name = f'{dataset_id}_{task.lower()}'
    model.modelSavePath = models_dir / f'{experiment_name}.h5'
    model.storeResults(results_dir / f'{experiment_name}.csv')

    # build and train the model
    model.buildModel()
    model.fit(epochs=500)  # do not limit training by epochs - use early stopping
Example No. 3
def run_experiment(datasets_dict, lang, task, embeddings, mappings, data):
    # set network hyperparameters and mappings/datasets
    model = BiLSTM(network_params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets_dict, data)

    # define the experiment name
    lang_prefix = f'{lang.lower()}_' if lang is not None else ''
    task_suffix = f'_{task.lower()}' if task is not None else ''
    experiment_name = lang_prefix + 'datasets' + task_suffix

    # path to store the trained model and model results
    model.modelSavePath = models_dir / f'{experiment_name}.h5'
    model.storeResults(results_dir / f'{experiment_name}.csv')

    # build and train the model
    model.buildModel()
    model.fit(epochs=500)  # do not limit training by epochs - use early stopping
Example No. 4
def train_and_eval_model(cfg):
    """
    Load data and train model
    args:
        cfg (YACS YAML config)
    """

    # Data preprocessing
    dataset = {
        "columns": {
            0: "raw_tokens",
            1: "boundaries"
        },
        # CoNLL format (tab-delimited)
        #   Column 0: phones
        #   Column 1: syllable boundary
        "label": "boundaries",  # Which column we want to predict
    }

    # Load the embeddings and the dataset. Choose whether or not to pad the words.
    # Right now, padding must be done if CRF is chosen for output layer.
    # The CRF layer does not support masking.
    embeddings, data, mappings, vocab_size, n_class_labels, word_length = load_dataset(
        dataset, dataset_name=cfg.TRAINING.DATASET, do_pad_words=True)

    create_directory(cfg.CONFIG_NAME)
    logger.info(f"Starting training of `{cfg.CONFIG_NAME}` on dataset `{cfg.TRAINING.DATASET}`")

    for training_repeat in range(cfg.TRAINING.TRAINING_REPEATS):
        model = BiLSTM(cfg)
        model.set_vocab(vocab_size, n_class_labels, word_length, mappings)
        model.set_dataset(dataset, data)

        # Path to store performance scores for dev / test
        model.store_results(PATH + "/" + cfg.CONFIG_NAME + "/" +
                            str(training_repeat) + ".csv")
        model.fit(epochs=cfg.TRAINING.EPOCHS)
Example No. 5
# TODO: Replace customClassifier with main task + auxiliary tasks
custom_classifier = {}
custom_classifier[target_task] = [('LSTM', 100), 'CRF']
for task in aux_task:
    custom_classifier[task] = ['CRF']
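# e.g. with target_task = 'NER' and aux_task = ['POS'] (hypothetical names), this yields
# {'NER': [('LSTM', 100), 'CRF'], 'POS': ['CRF']}: the main task gets an extra
# task-specific LSTM layer of size 100 before its CRF output layer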

params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',  # CNN-based character-level representations
    'customClassifier': custom_classifier  # per-task output layers built above
}

model = BiLSTM(params)

model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults("/".join(
    [args.root_dir_result, args.directory_name,
     "performance.out"]))  # Path to store performance scores for dev / test
model.predictionSavePath = "/".join([
    args.root_dir_result, args.directory_name, "predictions",
    "[ModelName]_[Data].conll"
])  # Path to store predictions
model.modelSavePath = "/".join(
    [args.root_dir_result, args.directory_name,
     "models/[ModelName].h5"])  # Path to store models
model.fit(epochs=args.nb_epoch)
Example No. 6
}
# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = 'final.txt'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'LSTM',
    'maxCharLength': 30
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=25)
Example No. 7
# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = prepareDataset(embeddingsPath, datasets)


############################################################################################################
#
# 2. Network training
#
############################################################################################################
# :: Load the embeddings and the dataset ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)
params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

print("***** Train the model with 1 Epoch and store to disk")
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/my_model_[Epoch].h5"
model.fit(epochs=1)

print("\n\n\n\n------------------------")
print("***** Load the model and continue training")
newModel = BiLSTM.loadModel('models/my_model_1.h5')
newModel.setDataset(datasets, data)
newModel.modelSavePath = "models/my_reloaded_model_[Epoch].h5"
newModel.fit(epochs=1)
print("***** retrained model store at "+newModel.modelSavePath)
Example No. 8
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [500],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'LSTM',
    'maxCharLength': 150,
    'charEmbeddingsSize': 200,
    'charLSTMSize': 200,
    'charFilterLength': 20
}
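# note: charFilterLength configures the CNN variant of the character embeddings and is
# presumably ignored here, since charEmbeddings is set to 'LSTM'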

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/sentiment_results.csv')  # Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"  #Path to store models
model.fit(epochs=70)
Example No. 9
# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',
    'maxCharLength': 50
}

print('####################### ' + args.mod + ' #######################')
model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "/datastore/liu121/nosqldb2/emnlp_ukplab/models/[ModelName]_bbn.h5"
eval_result = model.fit(epochs=100)


def report(eval_result, filePath):
    with open(filePath, 'w+') as f:
        for key in eval_result:
            info = eval_result[key]
            f.write('====================' + key + '====================\n')
            # cast to str in case the stored metrics are numeric
            f.write(str(info['epoch']) + '\n')
            f.write(str(info['per_f1']) + '\n')
            f.write(str(info['per_pre']) + '\n')
            f.write(str(info['per_recall']) + '\n')
            f.write(str(info['micro_f1']) + '\n')
            f.write(str(info['micro_pre']) + '\n')
            f.write(str(info['micro_recall']) + '\n')
            f.write(str(info['macro_f1']) + '\n')
Example No. 10
        'evaluate': True,  # Set true always for single task setups
        'commentSymbol': None  # Lines in the input data starting with this string will be skipped
    }
}

# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = prepareDataset(embeddingsPath, datasets)

############################################################################################################
#
# 2. Network training
#
############################################################################################################
# :: Load the embeddings and the dataset ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)
params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25)}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/unidep_pos_results.csv')  # Path to store performance scores for dev/test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"  # Path to store models
model.fit(epochs=10)
Example No. 11
# embeddingsPath =  'embeddings.vec'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets, useExistent=False)
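# useExistent=False presumably forces the pickle file in pkl/ to be rebuilt even if a
# cached version already exists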


######################################################
#
# The training of the network starts here
#
######################################################


#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.5, 0.5),
    'charEmbeddings': 'LSTM',
    'optimizer': 'adam',
    'featureNames': ['tokens', 'casing']
}


model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/Jurica_NER.csv')  # Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=100)



Example No. 12
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN'
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data,
                 mainModelName='MIT_Restaurant')  # specific to the multi-task setup

model.storeResults("/".join(
    ["results", args.directory_name,
     "performance.out"]))  #Path to store performance scores for dev / test
model.predictionSavePath = "/".join([
    "results", args.directory_name, "predictions",
    "[ModelName]_[Epoch]_[Data].conll"
])  #Path to store predictions
model.modelSavePath = "/".join([
    "results", args.directory_name,
    "models/model_[DevScore]_[TestScore]_[Epoch].h5"
])  #Path to store models
model.fit(epochs=50)
Example No. 13
        'commentSymbol': None
    },
}

embeddingsPath = 'more_embedding.tsv'  # Word embeddings by Levy et al.: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['Softmax'],
    'LSTM-Size': [500],
    'dropout': (0.25, 0.25)
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/multi_[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=60)
Example No. 14
# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25)
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/quote_direct.csv')  # Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"  #Path to store models
model.fit(epochs=20)
Example No. 15
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',
    'maxCharLength': 50
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
history = model.fit(epochs=30)

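# fit() is assumed here to return a JSON-serializable record of per-epoch dev/test
# scores (a fork-specific extension; the epoch/dev/test keys used in the plotting
# code below suggest its shape)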
with open('training_history.json', 'w') as outfile:
    json.dump(history, outfile)

# # Visualize training
# import matplotlib.pyplot as plt
# hist = pd.DataFrame(history)
# plt.style.use("ggplot")
# plt.figure(figsize=(7,5))
# plt.plot(hist["epoch"], hist["dev"])
# plt.plot(hist["epoch"], hist["test"])
# plt.xlabel('Epoch')
# plt.ylabel('F1 score')
# plt.ylim(0, 1)
# plt.legend(['dev', 'test'], loc='upper left')
Example No. 16
# :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
embeddingsPath = 'komninos_english_embeddings.gz'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = prepareDataset(embeddingsPath, datasets)

############################################################################################################
#
# 2. Network training
#
############################################################################################################
# :: Load the embeddings and the dataset ::
embeddings, mappings, data = loadDatasetPickle(pickleFile)
params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.25, 0.25)
}

model = BiLSTM(params)
model.setMappings(mappings, embeddings)
model.setDataset(datasets, data)
model.storeResults('results/conll2000_chunking.csv')  # Path to store performance scores for dev / test
model.modelSavePath = "models/[ModelName]_[DevScore]_[TestScore]_[Epoch].h5"
model.fit(epochs=5)