Example #1
        'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }
}

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::

embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'average'
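# elmo_mode controls how the three biLM layers are combined into one vector per
# token: 'average' presumably takes their unweighted mean, while
# 'weighted_average' (used in the later examples) applies a learned weighting,
# as in the ELMo paper. (Interpretation assumed; it is not stated in this snippet.)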
elmo_cuda_device = 0  #Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_deid.pkl')
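# Alternative (a sketch): guard the cache load for the case where the cache
# file has not been generated yet, mirroring the os.path.isfile check in the
# cache-generation example at the end of this section.
import os
if os.path.isfile('embeddings/elmo_cache_deid.pkl'):
    embLookup.loadCache('embeddings/elmo_cache_deid.pkl')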

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['CRF'],
}

Example #2

datasets = {
    'conll2000_chunking':  #Name of the dataset (assumed: reconstructed from the cache file below and the identical chunking setup in Example #3)
    {
        'columns': {
            0: 'tokens',
            1: 'POS',
            2: 'chunk_BIO'
        },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
        'label': 'chunk_BIO',  #Which column we want to predict
        'evaluate': True,  #Should we evaluate on this task? Always set to true for single-task setups
        'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }
}

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'weighted_average'
elmo_cuda_device = -1  #Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_conll2000_chunking.pkl')

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
params = {
    'classifier': ['Softmax'],
}
Example #3
def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },  #CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
            'label': 'POS',  #Which column we want to predict
            'evaluate': True,  #Should we evaluate on this task? Always set to true for single-task setups
            'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    elmo_cuda_device = args.cuda_device  #Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                                   elmo_weight_file, elmo_mode,
                                   elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],
        'LSTM-Size': [100, 100],
        'dropout': (0.5, 0.5)
    }

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove all saved model files except the last one
    remove_except_last_model(save_dir, model_init)
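
# Example #3 defines train_pos but not its invocation; below is a minimal
# sketch of wiring it up with argparse. The flag names are assumptions derived
# from the attributes the function reads off `args`.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    parser.add_argument('--datasetName', default='conll2000_chunking')
    parser.add_argument('--elmo_options', default='pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json')
    parser.add_argument('--elmo_weights', default='pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5')
    parser.add_argument('--cuda_device', type=int, default=-1)  #Which GPU to use. -1 for CPU
    parser.add_argument('--pkl_path', default='embeddings/elmo_cache_conll2000_chunking.pkl')
    parser.add_argument('--model_save', default='models')
    train_pos(parser.parse_args())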
Example #4

# Reconstructed preamble (assumed): logging and command-line setup following
# the pattern of Create_ELMo_Cache.py; names and default paths are assumptions.
import logging
import os
import sys

datasetName = sys.argv[1]  #Dataset folder under data/
tokenColId = int(sys.argv[2])  #Index of the token column in the CoNLL files
cudaDevice = int(sys.argv[3]) if len(sys.argv) > 3 else -1  #Which GPU to use. -1 for CPU

elmo_options_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)
ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

commentSymbol = None
columns = {tokenColId: 'tokens'}

picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"

embLookup = ELMoWordEmbeddings(None,
                               elmo_options_file,
                               elmo_weight_file,
                               elmo_cuda_device=cudaDevice)
if os.path.isfile(picklePath):
    embLookup.loadCache(picklePath)

print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

splitFiles = ['train.txt', 'dev.txt', 'test.txt']
for splitFile in splitFiles:
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)

    totalSentences = len(sentences)
    sentCnt = 0
    for sentence in sentences:
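        # Completion sketch: addToCache and storeCache are assumptions about
        # the ELMoWordEmbeddings cache interface (only loadCache appears in
        # the snippets above); adjust to your version of the class.
        embLookup.addToCache(sentence['tokens'])
        sentCnt += 1
        if sentCnt % 100 == 0:
            print("%d / %d sentences added" % (sentCnt, totalSentences))

embLookup.storeCache(picklePath)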