import logging
import os
import sys
import time

# Project-local imports; module paths assume the ELMo BiLSTM-CNN-CRF code layout
from neuralnets.ELMoBiLSTM import ELMoBiLSTM
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.CoNLL import readCoNLL
from util.preprocessing import perpareDataset, loadDatasetPickle


def create_cache(args):
    datasetName = args.datasetName
    tokenColId = args.tokenColumnId
    cudaDevice = args.cuda_device
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights

    # :: Logging level ::
    loggingLevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)

    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(loggingLevel)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    commentSymbol = None
    columns = {tokenColId: 'tokens'}
    picklePath = args.pkl_path

    embLookup = ELMoWordEmbeddings(None, elmo_options_file, elmo_weight_file, elmo_cuda_device=cudaDevice)

    print("ELMo Cache Generation")
    print("Output file:", picklePath)
    print("CUDA Device:", cudaDevice)

    splitFiles = ['train.txt', 'dev.txt', 'test.txt']
    for splitFile in splitFiles:
        inputPath = os.path.join('data', datasetName, splitFile)
        print("Adding file to cache: " + inputPath)
        sentences = readCoNLL(inputPath, columns, commentSymbol)
        tokens = [sentence['tokens'] for sentence in sentences]

        start_time = time.time()
        embLookup.addToCache(tokens)
        end_time = time.time()
        print("%s processed in %.1f seconds" % (splitFile, end_time - start_time))
        print("\n---\n")

    print("Store file at:", picklePath)
    embLookup.storeCache(picklePath)
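# A hypothetical CLI entry point for create_cache(); the flag names below are
# assumptions inferred from the attributes the function reads off `args`
# (datasetName, tokenColumnId, cuda_device, elmo_options, elmo_weights, pkl_path).
import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Pre-compute an ELMo embedding cache')
    parser.add_argument('--datasetName', required=True, help='Folder under data/ containing train.txt, dev.txt and test.txt')
    parser.add_argument('--tokenColumnId', type=int, default=0, help='Column of the CoNLL files that holds the tokens')
    parser.add_argument('--cuda_device', type=int, default=-1, help='GPU id for ELMo; -1 runs on the CPU')
    parser.add_argument('--elmo_options', required=True, help='Path or URL of the ELMo options JSON')
    parser.add_argument('--elmo_weights', required=True, help='Path or URL of the ELMo weights HDF5')
    parser.add_argument('--pkl_path', required=True, help='Output path for the pickled cache')
    create_cache(parser.parse_args())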
# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/komninos_english_embeddings.gz'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'weighted_average'

# Which GPU to use for ELMo. -1 for CPU
if torch.cuda.is_available():
    elmo_cuda_device = 0
else:
    elmo_cuda_device = -1

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)

# You can use a cache that stores the computed ELMo embeddings.
# This increases the training speed, as ELMo embeddings need to be computed only once.
# However, it leads to a significant memory overhead of multiple GB (requires about 24KB per token).
#embLookup.cache_computed_elmo_embeddings = True

# We can add a pre-computed ELMo cache to the class. See Create_ELMo_Cache.py how to pre-compute such a cache.
#embLookup.loadCache('embeddings/elmo_cache_conll2000_chunking.pkl')

# :: Transform datasets to a pickle file ::
pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################
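# The chunking snippet above calls perpareDataset(datasets, embLookup), but the
# `datasets` definition itself is not shown. A minimal sketch of what it would
# look like for CoNLL-2000 chunking, mirroring the dict in train_pos() below;
# the dataset name and column layout are assumptions:
datasets = {
    'conll2000_chunking':                                       # Name of the dataset (assumed)
        {'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},    # CoNLL-2000 format: token, POS, chunk tag in BIO encoding
         'label': 'chunk_BIO',                                  # Which column we like to predict
         'evaluate': True,                                      # Evaluate on this task
         'commentSymbol': None}                                 # No comment lines to skip
}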
        # (beginning of the `datasets` dict not shown in the source)
        },
        'label': 'NER_BIO',    # Which column we like to predict
        'evaluate': True,      # Should we evaluate on this task? Set true always for single-task setups
        'commentSymbol': None  # Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }
}

# :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
embeddings_file = 'embeddings/bioEmbeddings.txt'
elmo_options_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
elmo_weight_file = 'https://s3-us-west-2.amazonaws.com/allennlp/models/elmo/2x4096_512_2048cnn_2xhighway_5.5B/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
elmo_mode = 'average'
elmo_cuda_device = 0  # Which GPU to use. -1 for CPU

embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)

# You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
embLookup.loadCache('embeddings/elmo_cache_deid.pkl')

pickleFile = perpareDataset(datasets, embLookup)

######################################################
#
# The training of the network starts here
#
######################################################

# Load the embeddings and the dataset
mappings, data = loadDatasetPickle(pickleFile)

# Some network hyperparameters
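# The snippet above breaks off right after the hyperparameter comment. A
# plausible continuation, sketched from the train_pos() function below; the
# hyperparameter values and save path are assumptions, not the original script's:
params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5)}

model = ELMoBiLSTM(embLookup, params)
model.setMappings(mappings)
model.setDataset(datasets, data)
model.modelSavePath = "models/[ModelName]_[Epoch].h5"  # assumed location
model.fit(epochs=25)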
def train_pos(args):
    ######################################################
    #
    # Data preprocessing
    #
    ######################################################
    datasets = {
        args.datasetName:  # Name of the dataset
            {'columns': {0: 'tokens', 1: 'POS', 2: 'chunk_BIO'},  # CoNLL format for the input data. Column 0 contains tokens, column 1 contains POS and column 2 contains chunk information using BIO encoding
             'label': 'POS',        # Which column we like to predict
             'evaluate': True,      # Should we evaluate on this task? Set true always for single-task setups
             'commentSymbol': None}  # Lines in the input data starting with this string will be skipped. Can be used to skip comments
    }

    # :: Prepares the dataset to be used with the LSTM-network. Creates and stores cPickle files in the pkl/ folder ::
    embeddings_file = None
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    elmo_mode = 'weighted_average'
    elmo_cuda_device = args.cuda_device  # Which GPU to use. -1 for CPU

    embLookup = ELMoWordEmbeddings(embeddings_file, elmo_options_file, elmo_weight_file, elmo_mode, elmo_cuda_device)

    # You can use a cache to precompute the ELMo embeddings once. See Create_ELMo_Cache.py for an example.
    embLookup.loadCache(args.pkl_path)

    pickleFile = perpareDataset(datasets, embLookup)

    ######################################################
    #
    # The training of the network starts here
    #
    ######################################################

    # Load the embeddings and the dataset
    mappings, data = loadDatasetPickle(pickleFile)

    # Some network hyperparameters
    params = {'classifier': ['CRF'], 'LSTM-Size': [100, 100], 'dropout': (0.5, 0.5)}

    model = ELMoBiLSTM(embLookup, params)
    model.setMappings(mappings)
    model.setDataset(datasets, data)
    model.modelSavePath = args.model_save + "/[ModelName]_[Epoch].h5"
    model.fit(epochs=25)

    fpath = args.model_save + '/' + args.datasetName + '_1.h5'
    save_dir, model_init = os.path.split(fpath)
    print(save_dir)
    print(model_init)

    # Remove trained model files except for the last one
    remove_except_last_model(save_dir, model_init)
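# remove_except_last_model() is called above but not defined in this file. One
# possible implementation, assuming checkpoints are named <prefix>_<epoch>.h5;
# the real helper may behave differently -- this sketch keeps only the newest
# checkpoint with the dataset's prefix and deletes the rest:
def remove_except_last_model(save_dir, model_init):
    prefix = model_init.rsplit('_', 1)[0]
    checkpoints = [f for f in os.listdir(save_dir)
                   if f.startswith(prefix) and f.endswith('.h5')]
    # Sort by modification time so the most recently written model survives
    checkpoints.sort(key=lambda f: os.path.getmtime(os.path.join(save_dir, f)))
    for fname in checkpoints[:-1]:
        os.remove(os.path.join(save_dir, fname))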