if len(sys.argv) > 2: num_hidden = int(sys.argv[2]) else: sys.exit("'Hidden layer size' argument missing!") if len(sys.argv) > 3: learning_rate = float(sys.argv[3]) else: sys.exit("'Learning rate' argument missing!") model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, learning_rate) print num_hidden, learning_rate, model_file_name word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE) punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY) x = T.imatrix('x') y = T.imatrix('y') lr = T.scalar('lr') continue_with_previous = False if os.path.isfile(model_file_name): while True: resp = raw_input("Found an existing model with the name %s. Do you want to:\n[c]ontinue training the existing model?\n[r]eplace the existing model and train a new one?\n[e]xit?\n>" % model_file_name) resp = resp.lower().strip() if resp not in ('c', 'r', 'e'): continue if resp == 'e': sys.exit()
num_hidden = int(sys.argv[3]) else: sys.exit("'Hidden layer size' argument missing!") if len(sys.argv) > 4: initial_learning_rate = float(sys.argv[4]) else: sys.exit("'Learning rate' argument missing!") model_file_name = "Model_%s_h%d_lr%s.pcl" % (model_name, num_hidden, initial_learning_rate) model_file = model_path + "/" + model_file_name print num_hidden, initial_learning_rate, model_file word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE) punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY) x = T.imatrix('x') y = T.imatrix('y') lr = T.scalar('lr') continue_with_previous = False if os.path.isfile(model_file): print "Found an existing model with the name %s" % model_file sys.exit() if continue_with_previous: print "Loading previous model state" net, state = models.load(model_file, MINIBATCH_SIZE, x)
def trainModel(model, xTrain, yTrain, xVal, yVal, num_gpus, model_file, logdir, callbacks=None, verbose=False):
    """Fit ``model`` on the training data and save it to disk.

    Three callbacks are always attached: TensorBoard logging into ``logdir``,
    early stopping on ``val_loss`` and a best-only checkpoint written into
    ``logdir``. Anything passed in ``callbacks`` is appended to that list.

    Args:
        model: Object exposing ``fit`` and ``save`` (a Keras model here).
        xTrain, yTrain: Training inputs and targets, passed to ``model.fit``.
        xVal, yVal: Validation inputs and targets (``validation_data``).
        num_gpus: Number of GPU indices to try in non-verbose mode. With a
            value below 1 the model is fitted once without pinning a device.
        model_file: Path prefix of the saved model; ``'.hdf5'`` (or
            ``'<epoch>.hdf5'`` in verbose mode) is appended.
        logdir: Directory for TensorBoard logs and checkpoints.
        callbacks: Optional iterable of extra callbacks for ``model.fit``.
        verbose: If true, train one epoch per ``fit`` call, report progress
            and save a snapshot after every epoch.

    Returns:
        The ``model`` object that was passed in.
    """
    sys.stderr.write("Training" + "\n")

    # Callback setup adapted from
    # https://github.com/flomlo/ntm_keras/blob/master/testing_utils.py
    # (embedding logging is intentionally left disabled here).
    tensorboard = TensorBoard(log_dir=logdir, batch_size=MINIBATCH_SIZE, histogram_freq=1,
                              write_grads=True, write_images=True)
    checkpoint = ModelCheckpoint(logdir + "/model.ckpt.{epoch:04d}.hdf5", monitor='val_loss',
                                 verbose=1, save_best_only=True, period=1)
    early_stopping = EarlyStopping(monitor='val_loss', min_delta=0, patience=0, verbose=1)
    cbs = [tensorboard, early_stopping, checkpoint]
    if callbacks is not None:
        # The parameter used to be silently ignored; honour it now.
        cbs.extend(callbacks)

    if verbose:
        for i in range(EPOCHS):
            model.fit(xTrain, yTrain, validation_data=(xVal, yVal), epochs=i + 1,
                      batch_size=MINIBATCH_SIZE, callbacks=cbs, initial_epoch=i)
            print("currently at epoch {0}".format(i + 1))
            # model.save() does not expand '{epoch}' placeholders by itself,
            # so format the epoch number explicitly to get one file per epoch.
            model.save(model_file + '{epoch:02d}.hdf5'.format(epoch=i + 1))
        return model

    trained = False
    if num_gpus < 1:
        # No GPU was detected: still train, on whatever device the backend
        # selects by default, instead of saving an untrained model.
        model.fit(xTrain, yTrain, validation_data=(xVal, yVal), epochs=EPOCHS,
                  batch_size=MINIBATCH_SIZE, callbacks=cbs)
        trained = True
    else:
        # Try the GPUs one after another until one of them accepts the job.
        for i in range(num_gpus):
            try:
                os.environ["CUDA_VISIBLE_DEVICES"] = str(i)
                model.fit(xTrain, yTrain, validation_data=(xVal, yVal), epochs=EPOCHS,
                          batch_size=MINIBATCH_SIZE, callbacks=cbs)
                trained = True
                break
            except InternalError:
                print("GPU {0} is not available".format(str(i)))

    if trained:
        model.save(model_file + '.hdf5')
    else:
        # Every GPU failed; do not write an untrained model under the final name.
        sys.stderr.write("Training failed on all %d GPU(s); model not saved\n" % num_gpus)
    return model


if __name__ == "__main__":
    # Positional arguments: <model path> <model name> <hidden size> <learning rate>
    if len(sys.argv) > 1:
        model_path = os.path.abspath(sys.argv[1])
    else:
        sys.exit("'Model path' argument missing!")

    if len(sys.argv) > 2:
        model_name = sys.argv[2]
    else:
        sys.exit("'Model name' argument missing!")

    if len(sys.argv) > 3:
        num_hidden = int(sys.argv[3])
    else:
        sys.exit("'Hidden layer size' argument missing!")

    if len(sys.argv) > 4:
        lr = float(sys.argv[4])
    else:
        sys.exit("'Learning rate' argument missing!")

    model_file_name = "Model_%s_h%d_lr%s" % (model_name, num_hidden, lr)
    model_file = model_path + "/" + model_file_name
    logdir = model_path + "/logs/" + model_file_name
    print(num_hidden, lr, model_file)

    word_vocabulary = data.read_vocabulary(data.WORD_VOCAB_FILE)
    punctuation_vocabulary = data.iterable_to_dict(data.PUNCTUATION_VOCABULARY)

    continue_with_previous = False

    # trainModel() writes the final model to model_file + '.hdf5', so that is
    # the path that must be checked to avoid overwriting an earlier run. The
    # bare path is still checked for backward compatibility.
    existing = [path for path in (model_file, model_file + '.hdf5') if os.path.isfile(path)]
    if existing:
        print("Found an existing model with the name %s" % existing[0])
        sys.exit()

    import subprocess
    import re

    # Count NVIDIA GPUs from the PCI device list. Compute cards without a
    # display output are reported by lspci as "3D controller".
    try:
        gpu_info = subprocess.check_output(['lspci'])
    except (OSError, subprocess.CalledProcessError):
        # lspci is missing or failed; trainModel() then trains without
        # pinning a specific GPU.
        num_gpus = 0
    else:
        num_gpus = len(re.findall(r'(?:VGA compatible|3D) controller: NVIDIA Corporation',
                                  gpu_info.decode('utf-8', 'replace')))

    xTrain, yTrain = get_data(data.TRAIN_FILE, True)
    xVal, yVal = get_data(data.DEV_FILE, False)

    model = createModel(num_hidden, lr)
    trainModel(model, xTrain, yTrain, xVal, yVal, num_gpus, model_file, logdir)