Example #1
import logging
import os
import sys
import time

# Module paths as in the UKPLab elmo-bilstm-cnn-crf layout (an assumption
# about where these helpers live; adjust to your project):
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.preprocessing import readCoNLL


def create_cache(args):
    datasetName = args.datasetName
    tokenColId = args.tokenColumnId
    cudaDevice = args.cuda_device
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights

    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file= 'pretrained/velmo_options.json'
    #elmo_weight_file = 'pretrained/velmo_weights.hdf5'

    # :: Logging level ::
    loggingLevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)

    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(loggingLevel)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    commentSymbol = None
    columns = {tokenColId: 'tokens'}

    #picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_perturbed_03.pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_clean.pkl"
    picklePath = args.pkl_path
    # First argument (conventional word embedding file) is None,
    # so the cache holds pure ELMo vectors.
    embLookup = ELMoWordEmbeddings(None,
                                   elmo_options_file,
                                   elmo_weight_file,
                                   elmo_cuda_device=cudaDevice)

    print("ELMo Cache Generation")
    print("Output file:", picklePath)
    print("CUDA Device:", cudaDevice)

    splitFiles = ['train.txt', 'dev.txt', 'test.txt']

    for splitFile in splitFiles:
        inputPath = os.path.join('data', datasetName, splitFile)

        print("Adding file to cache: " + inputPath)
        sentences = readCoNLL(inputPath, columns, commentSymbol)
        tokens = [sentence['tokens'] for sentence in sentences]

        start_time = time.time()
        embLookup.addToCache(tokens)
        end_time = time.time()
        print("%s processed in %.1f seconds" %
              (splitFile, end_time - start_time))
        print("\n---\n")

    print("Store file at:", picklePath)
    embLookup.storeCache(picklePath)
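
# A minimal driver for create_cache, for orientation. Every flag name below
# is an assumption inferred from the attributes the function reads above;
# the original argument parser is not part of this listing.
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Pre-compute an ELMo embedding cache')
    parser.add_argument('--datasetName', required=True)
    parser.add_argument('--tokenColumnId', type=int, default=0)
    parser.add_argument('--cuda_device', type=int, default=-1)  # -1 runs ELMo on CPU
    parser.add_argument('--elmo_options', required=True)
    parser.add_argument('--elmo_weights', required=True)
    parser.add_argument('--pkl_path', required=True)
    create_cache(parser.parse_args())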
Example #2
import torch

# Module paths assume the UKPLab elmo-bilstm-cnn-crf layout:
from neuralnets.ELMoBiLSTM import ELMoBiLSTM
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.preprocessing import perpareDataset, loadDatasetPickle

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'weighted_average'

#Which GPU to use for ELMo. -1 for CPU
if torch.cuda.is_available():
    elmo_cuda_device = 0
else:
    elmo_cuda_device = -1

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)

# You can use a cache that stores the computed ELMo embeddings.
# This increases the training speed, as the ELMo embeddings need to be computed only once.
# However, it leads to a significant memory overhead of several GB (about 24 KB per token).
#embLookup.cache_computed_elmo_embeddings = True
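# Rough arithmetic on the figure above: at ~24 KB per token, a corpus of
# roughly CoNLL-2000 size (~260k tokens) needs about 260,000 * 24 KB ~= 6 GB of RAM.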

# We can add a pre-computed ELMo cache to the class. See Create_ELMo_Cache.py
# for how to pre-compute such a cache.
#embLookup.loadCache('embeddings/elmo_cache_conll2000_chunking.pkl')

# :: Transform datasets to a pickle file ::
pickleFile = perpareDataset(datasets, embLookup)  #(sic: the library spells it "perpare")

######################################################
#
# The training of the network starts here
#
######################################################
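
# The excerpt ends at the banner above. A sketch of how the training section
# continues, modeled on Example #5 below (the save path and epoch count here
# are illustrative):
mappings, data = loadDatasetPickle(pickleFile)

params = {
    'classifier': ['CRF'],
    'LSTM-Size': [100, 100],
    'dropout': (0.5, 0.5)
}

model = ELMoBiLSTM(embLookup, params)
model.setMappings(mappings)
model.setDataset(datasets, data)
model.modelSavePath = "models/[ModelName]_[Epoch].h5"
model.fit(epochs=25)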
Example #3
loggingLevel = logging.INFO
logger = logging.getLogger()
logger.setLevel(loggingLevel)

ch = logging.StreamHandler(sys.stdout)
ch.setLevel(loggingLevel)
formatter = logging.Formatter('%(message)s')
ch.setFormatter(formatter)
logger.addHandler(ch)

commentSymbol = None
columns = {tokenColId: 'tokens'}

picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"

embLookup = ELMoWordEmbeddings(None,
                               elmo_options_file,
                               elmo_weight_file,
                               elmo_cuda_device=cudaDevice)

print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

splitFiles = ['train.txt', 'dev.txt', 'test.txt']

for splitFile in splitFiles:
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)
    tokens = [sentence['tokens'] for sentence in sentences]
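
# For orientation: readCoNLL, as used above, yields one dict per sentence,
# keyed by the column names supplied in `columns`, so `tokens` becomes a
# list of token lists. A self-contained illustration (sentences are made up):
example_sentences = [{'tokens': ['The', 'dog', 'barks']},
                     {'tokens': ['It', 'rains']}]
example_tokens = [sentence['tokens'] for sentence in example_sentences]
assert example_tokens == [['The', 'dog', 'barks'], ['It', 'rains']]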
Example #4
        },
        'label': 'NER_BIO',  #Which column we would like to predict
        'evaluate': True,  #Should we evaluate on this task? Always set to True for single-task setups
        'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }
}
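
# The fragment above begins mid-dictionary. For reference, a complete
# configuration in the same style (the dataset name and column layout
# here are assumptions for illustration, not from the original):
datasets_example = {
    'my_ner_corpus': {  #Name of the dataset
        'columns': {0: 'tokens', 1: 'NER_BIO'},  #CoNLL columns of the input data
        'label': 'NER_BIO',  #Which column we would like to predict
        'evaluate': True,  #Always True for single-task setups
        'commentSymbol': None  #Lines starting with this string are skipped
    }
}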

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::

embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'average'
elmo_cuda_device = 0  #Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                               elmo_weight_file, elmo_mode, elmo_cuda_device)
# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_deid.pkl')

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

#Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
Example #5
import os

# Module paths assume the UKPLab elmo-bilstm-cnn-crf layout:
from neuralnets.ELMoBiLSTM import ELMoBiLSTM
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.preprocessing import perpareDataset, loadDatasetPickle


def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  #Name of the dataset
        {
            #CoNLL format for the input data: column 0 contains tokens,
            #column 1 POS tags, and column 2 chunk information in BIO encoding
            'columns': {
                0: 'tokens',
                1: 'POS',
                2: 'chunk_BIO'
            },
            'label': 'POS',  #Which column we would like to predict
            'evaluate': True,  #Should we evaluate on this task? Always set to True for single-task setups
            'commentSymbol': None  #Lines in the input data starting with this string will be skipped. Can be used to skip comments
        }
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'

    elmo_cuda_device = args.cuda_device  #Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file,
                                   elmo_weight_file, elmo_mode,
                                   elmo_cuda_device)
    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    #Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {
        'classifier': ['CRF'],  #CRF output layer for sequence labeling
        'LSTM-Size': [100, 100],  #Two stacked BiLSTMs with 100 recurrent units each
        'dropout': (0.5, 0.5)  #A tuple enables variational dropout
    }

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)
    # remove trained model files except for the last one
    remove_except_last_model(save_dir, model_init)
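
# remove_except_last_model is not defined in this listing. A minimal sketch
# of what it plausibly does, inferred from the comment and call above (an
# assumption, not the original implementation):
def remove_except_last_model(save_dir, keep_file):
    # Delete every saved .h5 checkpoint in save_dir except keep_file.
    for fname in os.listdir(save_dir):
        if fname.endswith('.h5') and fname != keep_file:
            os.remove(os.path.join(save_dir, fname))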