Beispiel #1
        'commentSymbol': None
    }  #Lines in the input data starting with this string will be skipped. Can be used to skip comments

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::

# Inputs for the embedding lookup built below. Both ELMo file paths are empty
# strings in this snippet.
# NOTE(review): presumably they must point at real ELMo options/weights files
# before this can run -- confirm against ELMoWordEmbeddings.
embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = ''
elmo_weight_file = ''
elmo_mode = 'average'
elmo_cuda_device = 0  # Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once.
# NOTE(review): the original comment ended with "See for an example." -- the
# name of the referenced example is missing from this excerpt.

# `datasets` is defined outside this excerpt. `perpareDataset` is spelled the
# way it is called here; check the helper's definition before "correcting" it.
pickleFile = perpareDataset(datasets, embLookup)

# The training of the network starts here

# Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
        'evaluate': True,  #Should we evaluate on this task? Set true always for single task setups
        'commentSymbol': None
    }  #Lines in the input data starting with this string will be skipped. Can be used to skip comments

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
# Inputs for the embedding lookup built below. Both ELMo file paths are empty
# strings in this snippet.
# NOTE(review): presumably they must point at real ELMo options/weights files
# before this can run -- confirm against ELMoWordEmbeddings.
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = ''
elmo_weight_file = ''
elmo_mode = 'weighted_average'
elmo_cuda_device = -1  # Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once.
# NOTE(review): the original comment ended with "See for an example." -- the
# name of the referenced example is missing from this excerpt.

# `datasets` is defined outside this excerpt. `perpareDataset` is spelled the
# way it is called here; check the helper's definition before "correcting" it.
pickleFile = perpareDataset(datasets, embLookup)

# The training of the network starts here

# Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['Softmax'],
Beispiel #3
def train_pos(args):
    """Configure an ELMo BiLSTM POS tagger from parsed command-line arguments.

    Builds a single-task dataset description keyed by ``args.datasetName``,
    creates the ELMo embedding lookup, prepares and loads the dataset pickle,
    instantiates the model with a fixed set of hyperparameters and finally
    prunes previously saved model files in ``args.model_save``.

    Args:
        args: Namespace-like object. The attributes read here are
            ``datasetName``, ``elmo_options``, ``elmo_weights``,
            ``cuda_device`` and ``model_save``.

    Returns:
        None.
    """
    # Data preprocessing
    datasets = {
        args.datasetName: {  # Name of the dataset
            # CoNLL format for the input data. Column 0 contains tokens,
            # column 1 contains POS and column 2 contains chunk information
            # using BIO encoding.
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },
            'label': 'POS',  # Which column we like to predict
            # Should we evaluate on this task? Always set to True for
            # single-task setups.
            'evaluate': True,
            # Lines in the input data starting with this string will be
            # skipped. Can be used to skip comments.
            'commentSymbol': None
        }
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'

    elmo_cuda_device = args.cuda_device  # Which GPU to use. -1 for CPU

    # The call was left unterminated in the original; `elmo_cuda_device` is
    # the only prepared-but-unused value, so it is restored as the final
    # positional argument.
    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                                   elmo_weight_file, elmo_mode,
                                   elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once.

    pickleFile = perpareDataset(datasets, embLookup)

    # The training of the network starts here

    # Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.5, 0.5)
    }

    model = ELMoBiLSTM(embLookup, params)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"

    # NOTE(review): no training call is visible between configuring the model
    # and pruning saved files. If one was lost together with the missing
    # brackets, it belongs here -- confirm against the original script.

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    # Remove trained model files except for the last one.
    remove_except_last_model(save_dir, model_init)
# Log formatter that emits only the message text.
formatter = logging.Formatter('%(message)s')

# No comment-line prefix is configured; this value is handed to readCoNLL
# further down in this excerpt.
commentSymbol = None
# Column mapping for the reader: the column index held in tokenColId (defined
# outside this excerpt) is read as 'tokens'.
columns = {tokenColId: 'tokens'}

# Path of the ELMo cache pickle for this dataset; datasetName is defined
# outside this excerpt.
picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"

embLookup = ELMoWordEmbeddings(None,
if os.path.isfile(picklePath):

# Report what this run produces and which device setting it uses.
# cudaDevice is defined outside this excerpt.
print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

# Dataset split files; the loop below resolves each one under
# data/<datasetName>/.
splitFiles = ['train.txt', 'dev.txt', 'test.txt']
for splitFile in splitFiles:
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)

    totalSentences = len(sentences)
    sentCnt = 0
    for sentence in sentences: