def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',  #Which column we like to predict
            'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }
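    # For illustration, a hypothetical input file in this CoNLL-style format
    # (one token per line, columns separated by whitespace, sentences separated
    # by blank lines) would look like:
    #   The     DT   B-NP
    #   dog     NN   I-NP
    #   barks   VBZ  B-VP
    #   .       .    O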

    # :: Path on your computer to the word embeddings. Embeddings by Komninos et al. will be downloaded automatically ::
    embeddingsPath = args.embeddings  # e.g. 'komninos_english_embeddings.gz'

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    pickleFile = perpareDataset(embeddingsPath, datasets)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    embeddings, mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
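    # In this framework, 'classifier' selects the output layer (CRF vs. Softmax),
    # 'LSTM-Size' lists the hidden units of each stacked BiLSTM layer, and
    # 'dropout' is typically the (output, recurrent) variational dropout pair.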
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100],
        'dropout': (0.25, 0.25)
    }
    model = BiLSTM(params)
    model.setMappings(mappings, embeddings)
    model.setDataset(datasets, data)

    model.modelSavePath = args.model_save + '/[ModelName]_[Epoch].h5'
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    remove_except_last_model(save_dir, model_init)
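
# NOTE: `remove_except_last_model` is called above but not defined in this snippet.
# A minimal sketch, assuming it deletes every saved checkpoint in `save_dir` except
# the most recently written one sharing the model's prefix (names and behaviour here
# are assumptions, not the original implementation):
import os

def remove_except_last_model(save_dir, model_init):
    prefix = model_init.rsplit('_', 1)[0]  # e.g. the dataset name
    checkpoints = sorted(
        (f for f in os.listdir(save_dir)
         if f.startswith(prefix) and f.endswith('.h5')),
        key=lambda f: os.path.getmtime(os.path.join(save_dir, f)))
    for fname in checkpoints[:-1]:  # keep only the newest checkpoint
        os.remove(os.path.join(save_dir, fname))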

# ========================== Example 2 ==========================

prepare_training_data(datasets)

embeddingsPath = 'komninos_english_embeddings.gz'  #Word embeddings by Komninos et al., based on the dependency-based embeddings of Levy et al.: https://levyomer.wordpress.com/2014/04/25/dependency-based-word-embeddings/

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasets)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
#params = {'classifier': ['CRF'], 'LSTM-Size': [100], 'dropout': (0.25, 0.25),'charEmbeddings': 'CNN',
#          'customClassifier': {'unidep_pos': ['Softmax'], 'conll2000_chunking': [('LSTM', 50), 'CRF']}}

# TODO: Replace customClassifier with the main task + auxiliary tasks
custom_classifier = {}
custom_classifier[target_task] = [('LSTM', 100), 'CRF']
for task in aux_task:
    custom_classifier[task] = ['CRF']
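
# For illustration (task names taken from the commented-out example above), with
# target_task = 'unidep_pos' and aux_task = ['conll2000_chunking'] this yields:
#   {'unidep_pos': [('LSTM', 100), 'CRF'], 'conll2000_chunking': ['CRF']}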

params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100],
    'dropout': (0.25, 0.25),
    'charEmbeddings': 'CNN',
    'customClassifier': custom_classifier  # per-task output layers built above
}

# ========================== Example 3 ==========================

datasetFiles = [
    (datasetName, dataColumns),
]

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile = perpareDataset(embeddingsPath, datasetFiles)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, word2Idx, datasets = loadDatasetPickle(pickleFile)
data = datasets[datasetName]

print("Dataset:", datasetName)
print(data['mappings'].keys())
print("Label key: ", labelKey)
print("Train Sentences:", len(data['trainMatrix']))
print("Dev Sentences:", len(data['devMatrix']))
print("Test Sentences:", len(data['testMatrix']))

model = BiLSTM(params)
model.setMappings(embeddings, data['mappings'])
model.setTrainDataset(data, labelKey)
model.verboseBuild = True
model.modelSavePath = "models/%s/%s/[DevScore]_[TestScore]_[Epoch].h5" % (
    datasetName, labelKey)  #Enable this line to save the model to the disk

# ========================== Example 4 ==========================

    model_name = model_path.stem

    # obtain dataset ID and task from model name
    dataset_id, task = model_name.rsplit('_', 1)
    task = task.upper()

    # obtain dataset language from dataset ID
    lang = dataset_id.split('_')[0]
    lang = lang.upper()

    if lang not in loaded_datasets:
        # select fasttext word embeddings
        embeddings_path = embeddings_dir / f'{lang.lower()}.fasttext.oov.vec.gz'

        # load and cache the embeddings, mappings and datasets
        loaded_datasets[lang] = loadDatasetPickle(embeddings_path, lang)

    # unpack the embeddings, mappings and datasets
    embeddings, mappings, data = loaded_datasets[lang]

    # evaluate model in a separate process so that memory is released at the end
    proc_args = (model_path, dataset_id, task, evaluator, embeddings, mappings,
                 data)
    proc = Process(target=eval_single_task, args=proc_args)

    proc.start()
    proc.join()

# write the evaluation tables
evaluator.write_tables(tables_dir / 'single_task')

# ========================== Example 5 ==========================

    # build and train the model
    model.buildModel()
    model.fit(
        epochs=500)  # effectively unlimited epochs; rely on early stopping to end training


for lang in ['PT', 'ES', None]:
    # select fasttext word embeddings
    lang_prefix = lang.lower() if lang is not None else 'es2pt'
    embeddings_path = embeddings_dir / f'{lang_prefix}.fasttext.oov.vec.gz'

    # prepare the datasets to be used with the LSTM network
    prepareDatasets(embeddings_path, lang)

    # load the embeddings and the datasets
    embeddings, mappings, data = loadDatasetPickle(embeddings_path, lang)

    # iterate through the multiple dataset combinations of language and task
    for task in ['POS', 'NER', None]:
        if lang is None and task is not None: continue
        # obtain datasets for the experiment
        datasets = Datasets(exclude=['pt_colonia'], lang=lang, task=task)
        datasets_dict = datasets.to_dict()

        # run experiment in a separate process so that memory is released at the end
        proc_args = (datasets_dict, lang, task, embeddings, mappings, data)
        proc = Process(target=run_experiment, args=proc_args)

        proc.start()
        proc.join()
        logger.info(f'Completed experiment: lang {"all" if lang is None else lang} - ' \

# ========================== Example 6 ==========================

def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',  #Which column we like to predict
            'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
            'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    elmo_cuda_device = args.cuda_device  #Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                                   elmo_weight_file, elmo_mode,
                                   elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.5, 0.5)
    }

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove trained files except from the last file
    remove_except_last_model(save_dir, model_init)

# ========================== Example 7 ==========================

# :: Path on your computer to the word embeddings. Embeddings by Reimers et al. will be downloaded automatically ::
embeddingsPath = '/datastore/liu121/nosqldb2/emnlp_ukplab/skipgram'

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
pickleFile_train, pickleFile_dev, pickleFile_test = perpareDataset(
    embeddingsPath, datasets, args.k_shot)
print('data prepare successful: %s, %s, and %s' %
      (pickleFile_train, pickleFile_dev, pickleFile_test))
######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
embeddings, mappings, data_train = loadDatasetPickle(pickleFile_train)
embeddings, mappings, data_dev = loadDatasetPickle(pickleFile_dev)
embeddings, mappings, data_test = loadDatasetPickle(pickleFile_test)
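
# Presumably all three pickles share the same embeddings and mappings, so the
# repeated reassignment above is harmless; only the train/dev/test data differ.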

# print('mappings type: ',type(mappings))
# for key in mappings:
#     print(key)
#     print(mappings[key])
#     print('===============')

# print('embeddings type:',type(data))
# for key in data:
#     print(key)
#     for subkey in data[key]:
#         print('--',subkey)
#         for subsubkey in data[key][subkey]: