def load(checkpoint_path, config):
    """Load a model from a checkpoint.

    First train the model like this:

    ```
    python gaussian_process.py --save_dir=/its/home/tk324/tensorflow --model_name=m1
    ```

    You can of course also choose a different directory and model name.

    Args:
        checkpoint_path: path to the checkpoint file
        config: dictionary that contains all configuration
    Returns:
        Gaussian Process
    """
    tfe.enable_eager_execution()

    # Load the GP model from the checkpoint; variables created inside this
    # context are restored from `checkpoint_path`.
    with tfe.restore_variables_on_create(checkpoint_path):
        gp = config['inf'](
            [config['cov'](config['input_dim'], {'iso': config['iso']})
             for _ in range(config['output_dim'])],
            config['lik']({'num_samples_pred': config.get('num_samples_pred', None)}),
            config['num_train'],
            config['num_inducing'],
            {'num_components': config.get('num_components', None),
             'diag_post': config.get('diag_post', None)})
    return gp
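All of the snippets in this section rely on the same round trip: variables created inside `tfe.restore_variables_on_create` are initialized from the given checkpoint (when one exists) rather than from their initializers, and `tfe.Saver` writes them back to disk. A minimal, self-contained sketch of that pattern (TF 1.x contrib eager API; the directory path below is an illustrative assumption):

import os
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tfe.enable_eager_execution()

ckpt_dir = '/tmp/restore_demo'  # illustrative path, not from the snippets
tf.gfile.MakeDirs(ckpt_dir)

# On the first run there is no checkpoint, so `step` starts from its
# initializer; on later runs it is created with the checkpointed value.
with tfe.restore_variables_on_create(tf.train.latest_checkpoint(ckpt_dir)):
    step = tf.train.get_or_create_global_step()

step.assign_add(1)
tfe.Saver([step]).save(os.path.join(ckpt_dir, 'ckpt'), global_step=step)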
def main(_):
    assert tfe.num_gpus() > 0, 'Make sure the GPU device exists'
    device_name = '/gpu:{}'.format(args.cuda_device)
    print('\n==> ==> ==> Using device {}'.format(device_name))

    # Load the dataset
    train_ds, val_ds = [
        dataset_generator(
            mode,
            conf.input_size,
            num_epochs=1,
            batch_size=conf.batch_size,
            buffer_size=10000)  # TODO edit this when in real training
        for mode in ['train', 'val']
    ]

    # Create the model and optimizer
    model = RetinaNet()
    optimizer = tf.train.RMSPropOptimizer(conf.learning_rate)

    # Define the path to the TensorBoard summary
    train_dir, val_dir = [
        os.path.join(conf.summary_dir, mode) for mode in ['train', 'val']
    ]
    tf.gfile.MakeDirs(conf.summary_dir)
    train_summary_writer = tf.contrib.summary.create_summary_file_writer(
        train_dir, flush_millis=10000, name='train')
    val_summary_writer = tf.contrib.summary.create_summary_file_writer(
        val_dir, flush_millis=10000, name='val')

    checkpoint_prefix = os.path.join(conf.checkpoint_dir, 'ckpt')
    with tfe.restore_variables_on_create(
            tf.train.latest_checkpoint(conf.checkpoint_dir)):
        with tf.device(device_name):
            epoch = tfe.Variable(1., name='epoch')
            best_loss = tfe.Variable(tf.float32.max, name='best_loss')
            print('==> ==> ==> Start training from epoch {:.0f}...\n'.format(
                epoch.numpy()))
            while epoch <= conf.num_epochs + 1:
                gs = tf.train.get_or_create_global_step()
                with train_summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds, epoch.numpy())
                with val_summary_writer.as_default():
                    eval_loss = validate(model, val_ds, epoch.numpy())

                # Track the best loss. Use assign() so the checkpointed
                # variable is updated in place; a plain Python assignment
                # would only rebind the name, not the variable.
                if eval_loss < best_loss:
                    best_loss.assign(eval_loss)

                all_variables = (model.variables + optimizer.variables() +
                                 [gs] + [epoch] + [best_loss])
                tfe.Saver(all_variables).save(checkpoint_prefix,
                                              global_step=gs)

                epoch.assign_add(1)
def train_gp(dataset, args):
    """Train a GP model and return it.

    This function uses TensorFlow's eager execution.

    Args:
        dataset: a NamedTuple that contains information about the dataset
        args: parameters in the form of a dictionary
    Returns:
        trained GP
    """
    # Set checkpoint path
    if args['save_dir']:
        out_dir = Path(args['save_dir']) / Path(args['model_name'])
        tf.gfile.MakeDirs(str(out_dir))
    else:
        out_dir = Path(mkdtemp())  # Create temporary directory
    checkpoint_prefix = out_dir / Path('model.ckpt')
    step_counter = tf.train.get_or_create_global_step()

    # Restore from existing checkpoint
    with tfe.restore_variables_on_create(tf.train.latest_checkpoint(out_dir)):
        gp, hyper_params = util.construct_from_flags(args, dataset,
                                                     dataset.inducing_inputs)
        optimizer, update_learning_rate = util.get_optimizer(args)

        step = 0
        # Shuffle and repeat for the required number of epochs
        train_data = dataset.train_fn().shuffle(50_000).repeat(
            args['eval_epochs']).batch(args['batch_size'])
        while step < args['train_steps']:
            start = time.time()
            # Take *at most* (train_steps - step) batches so that we don't
            # run longer than `train_steps`.
            fit(gp, optimizer, train_data.take(args['train_steps'] - step),
                step_counter, hyper_params, update_learning_rate, args)
            end = time.time()
            step = step_counter.numpy()
            print(f"Train time for the last {args['eval_epochs']} epochs"
                  f" (global step {step}): {end - start:0.2f}s")
            evaluate(gp, dataset.test_fn().batch(args['batch_size']),
                     dataset.metric)
        all_variables = (gp.get_all_variables() + optimizer.variables() +
                         [step_counter] + hyper_params)
        # TODO: don't ignore the 'chkpnt_steps' flag
        ckpt_path = tfe.Saver(all_variables).save(checkpoint_prefix,
                                                  global_step=step_counter)
        print(f"Saved checkpoint in '{ckpt_path}'")

    if args['plot'] or args['preds_path']:
        # Create predictions
        tf.reset_default_graph()
        mean, var = predict(dataset.xtest, tf.train.latest_checkpoint(out_dir),
                            dataset, args)
        util.post_training(mean, var, out_dir, dataset, args)
    return gp
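For reference, a sketch of the kind of `args` dictionary `train_gp` reads. The keys below are exactly the ones used in the function body above (plus whatever `util.construct_from_flags` and `util.get_optimizer` additionally expect); the concrete values are illustrative assumptions, not defaults from the original code:

args = {
    'save_dir': '/tmp/gp_runs',  # a falsy value trains in a temp directory
    'model_name': 'm1',
    'train_steps': 1000,         # total number of optimization steps
    'eval_epochs': 5,            # epochs per fit/evaluate round
    'batch_size': 100,
    'plot': False,
    'preds_path': None,
}
gp = train_gp(dataset, args)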
def main(_):
    tfe.enable_eager_execution()

    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = dataset.train(FLAGS.data_dir).shuffle(60000).batch(
        FLAGS.batch_size)
    test_ds = dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    if FLAGS.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    # Train and evaluate for 10 epochs.
    with tf.device(device):
        for epoch in range(1, 11):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                global_step = tf.train.get_or_create_global_step()
                start = time.time()
                with summary_writer.as_default():
                    train(model, optimizer, train_ds, FLAGS.log_interval)
                end = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' %
                      (epoch, global_step.numpy(), end - start))
                with test_summary_writer.as_default():
                    test(model, test_ds)
                all_variables = (model.variables + optimizer.variables() +
                                 [global_step])
                tfe.Saver(all_variables).save(checkpoint_prefix,
                                              global_step=global_step)
def main(_):
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = (tf.data.Dataset
               .from_tensor_slices(data.train.images)
               .shuffle(60000)
               .batch(FLAGS.batch_size))

    # Create the models and optimizers
    generator = Generator(data_format)
    discriminator = Discriminator(data_format)
    with tf.variable_scope('generator'):
        generator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)
    with tf.variable_scope('discriminator'):
        discriminator_optimizer = tf.train.AdamOptimizer(FLAGS.lr)

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)

    with tf.device(device):
        for epoch in range(1, 101):
            with tfe.restore_variables_on_create(latest_cpkt):
                global_step = tf.train.get_or_create_global_step()
                start = time.time()
                with summary_writer.as_default():
                    train_one_epoch(generator, discriminator,
                                    generator_optimizer,
                                    discriminator_optimizer, dataset,
                                    FLAGS.log_interval, FLAGS.noise)
                end = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), end - start))

            all_variables = (
                generator.variables
                + discriminator.variables
                + generator_optimizer.variables()
                + discriminator_optimizer.variables()
                + [global_step])
            tfe.Saver(all_variables).save(
                checkpoint_prefix, global_step=global_step)
def save_deployment():
    with tfe.restore_variables_on_create(
            tf.train.latest_checkpoint(conf.checkpoint_dir)):
        # epoch = tfe.Variable(1., name='epoch')
        # print('==> ==> ==> Restore from epoch {}...\n'.format(epoch.numpy()))
        gs = tf.train.get_or_create_global_step()
        print('==> ==> ==> Restore from global step {}...\n'.format(
            gs.numpy()))

        deploy_results = []
        # Batch of images
        for im_batch, p_batch in tqdm(tfe.Iterator(dataset),
                                      total=dataset_size // conf.batch_size,
                                      unit=' batch({})'.format(
                                          conf.batch_size)):
            with tf.device(device_name):
                loc_preds, cls_preds = model(im_batch.gpu())
            with tf.device("cpu:0"):
                scale = tf.convert_to_tensor(
                    [*conf.image_size] * 2,
                    dtype=tf.float32) / conf.input_size[0]
                # Single image
                for i, (loc_pred, cls_pred) in enumerate(
                        zip(loc_preds.cpu(), cls_preds.cpu())):
                    boxes, labels_idx, scores = box_encoder.decode(
                        loc_pred, cls_pred, conf.input_size,
                        return_score=True)
                    pred = []
                    # Multiple boxes per image
                    for box, label_idx, score in zip(boxes, labels_idx,
                                                     scores):
                        pt = (box * scale).numpy().astype(
                            int)  # [ymin, xmin, ymax, xmax]
                        coords = ((pt[1], pt[0]), pt[3] - pt[1] + 1,
                                  pt[2] - pt[0] + 1)
                        label_name = conf.class_name[label_idx.numpy()]
                        pred.append({
                            'class': label_name,
                            'score': score.numpy(),
                            'position': coords
                        })
                    deploy_results.append({
                        'index': p_batch[i].numpy().decode('utf-8'),
                        'prediction': pred
                    })
        np.save('{}/deploy_results.npy'.format(conf.deployment_save_dir),
                deploy_results)
def main(_):
    tfe.enable_eager_execution()

    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    (train_ds, test_ds) = load_data(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = MNISTModel(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        train_dir, flush_secs=10)
    test_summary_writer = tf.contrib.summary.create_summary_file_writer(
        test_dir, flush_secs=10, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    with tf.device(device):
        for epoch in range(1, 11):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                global_step = tf.train.get_or_create_global_step()
                start = time.time()
                with summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds,
                                    FLAGS.log_interval)
                end = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), end - start))
                with test_summary_writer.as_default():
                    test(model, test_ds)
                all_variables = (
                    model.variables
                    + tfe.get_optimizer_variables(optimizer)
                    + [global_step])
                tfe.Saver(all_variables).save(
                    checkpoint_prefix, global_step=global_step)
def init_agent(self):
    # Run the policy and value networks once on a dummy observation so that
    # their variables are created inside the restore context (and thus take
    # their values from the latest checkpoint).
    with tfe.restore_variables_on_create(
            tf.train.latest_checkpoint(self._checkpoint_dir)):
        observation = tf.random_normal(self._input_shape)
        observation = observation[np.newaxis, :]
        if self._actionspace_is_discrete:
            action_probs = self._policy(observation)
            action = np.argmax(action_probs)
        else:
            mu, sigma = self._policy(observation)
            self.action_dist = self.ActionDist(tf.squeeze(mu),
                                               tf.squeeze(sigma))
        value = self._valuef(observation)
def predict(test_inputs, saved_model, dataset_info, args):
    """Predict outputs given test inputs.

    This function can be called from a different module and should still work.

    Args:
        test_inputs: ndarray. Points on which we wish to make predictions.
            Dimensions: num_test * input_dim.
        saved_model: path to saved model
        dataset_info: info about the dataset
        args: additional parameters
    Returns:
        ndarray. The predicted mean of the test inputs. Dimensions:
            num_test * output_dim.
        ndarray. The predicted variance of the test inputs. Dimensions:
            num_test * output_dim.
    """
    if args['batch_size'] is None:
        num_batches = 1
    else:
        num_batches = util.ceil_divide(test_inputs.shape[0],
                                       args['batch_size'])
    num_inducing = dataset_info.inducing_inputs.shape[0]

    with tfe.restore_variables_on_create(saved_model):
        # Creating the inference object here will restore the variables from
        # the saved model.
        gp, _ = util.construct_from_flags(args, dataset_info, num_inducing)

    test_inputs = np.array_split(test_inputs, num_batches)
    pred_means = [0.0] * num_batches
    pred_vars = [0.0] * num_batches

    for i in range(num_batches):
        pred_means[i], pred_vars[i] = gp.predict({'input': test_inputs[i]})
    return (np.concatenate(pred_means, axis=0),
            np.concatenate(pred_vars, axis=0))
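A hedged usage sketch for `predict`. The input dimension, checkpoint path, and number of test points are illustrative assumptions; `dataset_info` and `args` are assumed to be the same objects the training code was run with:

test_inputs = np.random.randn(200, 2)  # num_test * input_dim (dims assumed)
mean, var = predict(test_inputs,
                    tf.train.latest_checkpoint('/tmp/gp_runs/m1'),
                    dataset_info, args)
print(mean.shape, var.shape)  # each: num_test * output_dim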
def main(_):
    # First we must enable eager execution mode.
    # Warning: this command must be executed only once; otherwise an
    # exception will be raised.
    tfe.enable_eager_execution()

    # Here we define the device that will be used to train the network and
    # the format of the data passed in. In general we use the default data
    # format, as defined below.
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the train and test datasets and shuffle the examples.
    # Shuffling the training examples helps "break" the dependencies created
    # by processing the dataset in order.
    (train_ds, test_ds) = load_data(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)

    # Here we create the class containing the model; see the previous
    # section for details.
    model = MNISTModel(data_format)
    # Here we define the algorithm that will optimize the model.
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Create a directory to store the trained model.
    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None

    # Here we define where the program will store the summaries generated
    # for training and testing, and instantiate the classes that will write
    # those summaries.
    summary_writer = tf.contrib.summary.create_file_writer(train_dir,
                                                           flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    epoch = 1

    # This is the main part of the program.
    # First we select the device that will be used for training: CPU or GPU.
    with tf.device(device):
        # The library provides several prebuilt helper functions that make
        # building models easier. Below we use a function exclusive to eager
        # execution mode that checks whether a saved model already exists
        # and, if so, uses it to initialize the parameters (weights) of the
        # network. This is very useful when the program stops in the middle
        # of training and we need to restart from a certain point. However,
        # if we have a saved model and want to restart from scratch, we must
        # delete the old model first.
        with tfe.restore_variables_on_create(
                tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
            # This line creates (or fetches, if it already exists) the
            # library's internal counter of executed batches.
            global_step = tf.train.get_or_create_global_step()
            start = time.time()
            # Use the class that writes the training summaries.
            with summary_writer.as_default():
                # Run one training epoch; see the helper functions' code.
                train_one_epoch(model, optimizer, train_ds,
                                FLAGS.log_interval)
            end = time.time()
            # Then print some information to make progress easy to follow.
            print('\nTrain time for epoch #%d (global step %d): %f' %
                  (epoch, global_step.numpy(), end - start))
            # Use the class that writes the test summaries.
            with test_summary_writer.as_default():
                # Run one evaluation pass over the test dataset.
                test(model, test_ds)
            # Collect all the parameters of the model (and of the training
            # loop) that we are training.
            all_variables = (model.variables + optimizer.variables() +
                             [global_step])
            # Using one more utility function (exclusive to eager execution
            # mode), save the parameters of the model being trained to disk.
            tfe.Saver(all_variables).save(checkpoint_prefix,
                                          global_step=global_step)
def main(_):
    tfe.enable_eager_execution()

    # Log info
    print("-" * 64)
    print("TEST INFO - EAGER")
    print("-" * 64)
    print("TF version:\t {}".format(tf.__version__))
    print("Dataset:\t MNIST")
    print("Model:\t CNN")

    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Device:\t {}'.format(device))

    if data_format == 'channels_first':
        print("Data format:\t NCHW (channels first)")
    else:
        print("Data format:\t NHWC (channels last)")
    print("=" * 64)

    # Load the datasets
    (train_ds, test_ds) = load_data(FLAGS.data_dir)
    train_ds = train_ds.shuffle(60000).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = MNISTModel(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None

    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    with tf.device(device):
        for epoch in range(1, 6):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                global_step = tf.train.get_or_create_global_step()
                start = time.time()
                with summary_writer.as_default():
                    train_one_epoch(model, optimizer, train_ds,
                                    FLAGS.log_interval)
                end = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), end - start))
                with test_summary_writer.as_default():
                    test(model, test_ds)
                all_variables = (
                    model.variables + optimizer.variables() + [global_step])
                tfe.Saver(all_variables).save(
                    checkpoint_prefix, global_step=global_step)
def train_spinn(embed, train_data, dev_data, test_data, config):
    """Train a SPINN model.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the length
            of a word embedding vector.
        train_data: An instance of `data.SnliData`, for the train split.
        dev_data: Same as above, for the dev split.
        test_data: Same as above, for the test split.
        config: A configuration object. See the argument to this Python
            binary for details.

    Returns:
        1. Final loss value on the test split.
        2. Final fraction of correct classifications on the test split.
    """
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)
    train_len = train_data.num_batches(config.batch_size)

    with tf.device(device), \
         tfe.restore_variables_on_create(
             tf.train.latest_checkpoint(config.logdir)), \
         summary_writer.as_default(), \
         tf.contrib.summary.always_record_summaries():
        model = SNLIClassifier(config, embed)
        global_step = tf.train.get_or_create_global_step()
        trainer = SNLIClassifierTrainer(model, config.lr)

        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
                    train_data, config.batch_size):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                batch_train_loss, batch_train_logits = trainer.train_batch(
                    label, prem, prem_trans, hypo, hypo_trans)
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    all_variables = (model.variables +
                                     [trainer.learning_rate] + [global_step])
                    saver = tfe.Saver(all_variables)
                    saver.save(os.path.join(config.logdir, "ckpt"),
                               global_step=global_step)

                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, model, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start, epoch, iterations,
                        1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0, dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start, epoch, iterations,
                        1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8,
                        accuracy_val * 100.0, " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, model, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))
def main(FLAGS):
    # Decide device and data format
    '''
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        device = "/cpu:0"
    else:
        device = "/gpu:0"
    '''
    (device, data_format) = ('/cpu:0', 'channels_last')

    # Setup model, optimizer, logging
    hawkeye_net = HawkeyeYOLO(data_format=data_format,
                              num_layer_1_filters=FLAGS.num_layer_1_filters,
                              num_layer_2_filters=FLAGS.num_layer_2_filters,
                              kernel_size=FLAGS.kernel_size)
    optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate)
    log_dir = os.path.join(FLAGS.dir, "summaries")
    tf.gfile.MakeDirs(log_dir)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    if FLAGS.pretrain_cnn:
        # Prepare data
        data_dir = FLAGS.dir
        train_data = load_training_dataset(data_dir)
        eval_data = load_eval_dataset(data_dir)

        # Batch the dataset
        train_data = train_data.batch(FLAGS.batch_size)

        train_summary_writer = tf.contrib.summary.create_file_writer(
            os.path.join(log_dir, "cnn_train"), flush_millis=10000)
        test_summary_writer = tf.contrib.summary.create_file_writer(
            os.path.join(log_dir, "cnn_eval"), flush_millis=10000,
            name="eval")

        # Run training for specified epochs
        with tf.device(device):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                for epoch in range(FLAGS.num_epochs):
                    start = time.time()
                    with train_summary_writer.as_default():
                        train_cnn_one_epoch(hawkeye_net, optimizer,
                                            train_data,
                                            log_interval=FLAGS.log_interval)
                    end = time.time()
                    print("train/time for epoch #%d: %.2f" %
                          (epoch, end - start))
                    with test_summary_writer.as_default():
                        pass  # cnn_test(hawkeye_net, eval_data)

                    # Save variables
                    global_step = tf.train.get_or_create_global_step()
                    all_variables = (hawkeye_net.variables +
                                     optimizer.variables() + [global_step])
                    tfe.Saver(all_variables).save(checkpoint_prefix,
                                                  global_step=global_step)
    else:
        # Prepare data
        data_dir = FLAGS.dir
        train_data = load_sequence_training_dataset(data_dir)
        eval_data = load_sequence_eval_dataset(data_dir)

        # Batch the dataset
        train_data = train_data.batch(FLAGS.batch_size)

        # Train RNN
        model = RNNHawkeye(keep_prob=FLAGS.keep_probability,
                           data_format=data_format)
        train_summary_writer = tf.contrib.summary.create_file_writer(
            os.path.join(log_dir, "rnn_train"), flush_millis=10000)
        test_summary_writer = tf.contrib.summary.create_file_writer(
            os.path.join(log_dir, "rnn_eval"), flush_millis=10000,
            name="eval")

        # Run training for specified epochs
        with tf.device(device):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                for epoch in range(FLAGS.num_epochs):
                    start = time.time()
                    with train_summary_writer.as_default():
                        train_rnn_one_epoch(model, optimizer, train_data,
                                            hawkeye_net, FLAGS.log_interval)
                    end = time.time()
                    print("train/time for epoch #%d: %.2f" %
                          (epoch, end - start))
                    with test_summary_writer.as_default():
                        pass  # rnn_test(model, eval_data, hawkeye_net)

                    # Save variables
                    global_step = tf.train.get_or_create_global_step()
                    all_variables = (hawkeye_net.variables +
                                     model.variables +
                                     optimizer.variables() + [global_step])
                    tfe.Saver(all_variables).save(checkpoint_prefix,
                                                  global_step=global_step)
package_dir = os.path.dirname(os.path.abspath(__file__))
default_img_path = os.path.join(package_dir, 'test_2.png')

parser = argparse.ArgumentParser(description='PyTorch MNIST Predictor')
parser.add_argument('--image', type=str, default=default_img_path,
                    metavar='IMG',
                    help='image for prediction (default: {})'.format(
                        default_img_path))
args = parser.parse_args()

"""Make Prediction"""
# Load & transform image
img = tf.image.decode_png(tf.read_file(args.image), channels=1)
img = tf.image.resize_images(img, (28, 28))
img = ((img / 255) - 0.1307) / 0.3081  # Normalize
img = tf.expand_dims(img, 0)  # Squeeze in batch_size dim

# Create model
model = Net()

# Load parameters; they will only be restored after the first run of the
# model, in which the variables in the model are lazily created.
checkpoint_dir = os.path.dirname(os.path.abspath(__file__))
with tfe.restore_variables_on_create(
        tf.train.latest_checkpoint(checkpoint_dir)):
    global_step = tf.train.get_or_create_global_step()
    # Predict
    output = model(img, training=False)
    pred = tf.argmax(output, 1)

print('Prediction: {}'.format(pred.numpy()[0]))
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render,
          actor, critic, difference_critic, normalize_returns,
          normalize_observations, critic_l2_reg, actor_lr, critic_lr,
          difference_critic_lr, gamma, clip_norm, nb_train_steps,
          nb_rollout_steps, nb_eval_steps, batch_size, memory, checkpoint_dir,
          observation_noise_multiple, action_noise_multiple,
          use_difference_critic, tau=0.01, eval_env=None):
    assert (np.abs(env.action_space.low) == env.action_space.high
            ).all()  # we assume symmetric actions.

    agent = DDPG(actor, critic, difference_critic, memory,
                 env.observation_space.shape, env.action_space.shape,
                 use_difference_critic=use_difference_critic, gamma=gamma,
                 tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations,
                 batch_size=batch_size, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr,
                 difference_critic_lr=difference_critic_lr,
                 clip_norm=clip_norm, reward_scale=reward_scale,
                 action_range=(env.action_space.low, env.action_space.high))
    print('Using agent with the following configuration:')
    print(str(agent.__dict__.items()))

    max_action = env.action_space.high
    print('scaling actions by {} before executing in env'.format(max_action))

    obs = env.reset()
    # Inject noise to simulate a model ensemble.
    obs = apply_noise(obs, observation_noise_multiple)
    if eval_env is not None:
        eval_obs = eval_env.reset()
    done = False
    episode_reward = 0.
    episode_step = 0
    episodes = 0
    time_step = 0

    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    tf.gfile.MakeDirs(checkpoint_dir)

    for epoch in range(nb_epochs):
        for cycle in range(nb_epoch_cycles):
            for t_rollout in range(nb_rollout_steps):
                # Predict next action.
                action, q = agent.pi(obs)
                assert action.shape == env.action_space.shape

                # Execute next action.
                if render:
                    env.render()
                # Scale for execution in env (as far as DDPG is concerned,
                # every action is in [-1, 1]).
                action = apply_noise(action, action_noise_multiple)
                action = tf.clip_by_value(action, env.action_space.low,
                                          env.action_space.high)
                new_obs, r, done, info = env.step(max_action * action)
                # Inject noise.
                new_obs = apply_noise(new_obs, observation_noise_multiple)
                time_step += 1
                if render:
                    env.render()
                episode_reward += r
                episode_step += 1

                # Add to replay buffer.
                agent.store_transition(obs, action, r, new_obs, done)
                obs = new_obs

                if done:
                    # Episode done.
                    episode_rewards_history.append(episode_reward)
                    episode_reward = 0.
                    episode_step = 0
                    episodes += 1
                    obs = env.reset()
                    obs = apply_noise(obs, observation_noise_multiple)

            # Train.
            checkpoint_prefix = os.path.join(checkpoint_dir, 'ckpt')
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(checkpoint_dir)):
                for t_train in range(nb_train_steps):
                    global_step = tf.train.get_or_create_global_step()
                    agent.train(global_step)
                    print('...')
                tfe.Saver(agent.get_variables_to_save()).save(
                    checkpoint_prefix,
                    global_step=tf.train.get_or_create_global_step())

            # Evaluate.
            eval_episode_rewards = []
            eval_qs = []
            if eval_env is not None:
                eval_episode_reward = 0.
                for t_rollout in range(nb_eval_steps):
                    eval_action, eval_q = agent.pi(eval_obs)
                    # Scale for execution in env (as far as DDPG is
                    # concerned, every action is in [-1, 1]).
                    eval_obs, eval_r, eval_done, eval_info = eval_env.step(
                        max_action * eval_action)
                    if render_eval:
                        eval_env.render()
                    eval_episode_reward += eval_r
                    eval_qs.append(eval_q)
                    if eval_done:
                        print('eval episode reward: ' +
                              str(eval_episode_reward))
                        eval_obs = eval_env.reset()
                        eval_episode_rewards.append(eval_episode_reward)
                        eval_episode_rewards_history.append(
                            eval_episode_reward)
                        eval_episode_reward = 0.
def main(_):
    tfe.enable_eager_execution()

    (device, data_format) = ('/gpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Set up writers
    if FLAGS.output_dir:
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        valid_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        valid_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    valid_summary_writer = tf.contrib.summary.create_file_writer(
        valid_dir, flush_millis=10000, name='valid')
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')

    # Set up data
    train_data = DataHandler(
        data_dir=FLAGS.data_dir,
        file_name='train_interpolated.csv',
        bags=config['bags'],
        batch_size=FLAGS.batch_size,
        train=True,
        augment=True
    )
    train_ds = train_data.get_data()

    valid_data = DataHandler(
        data_dir=FLAGS.data_dir,
        file_name='valid_interpolated.csv',
        bags=config['bags'],
        batch_size=1,
        train=False,
        augment=False
    )
    valid_ds = valid_data.get_data()

    # Set up model and optimizer
    model = PilotNet()
    optimizer = tf.train.AdamOptimizer()

    # Train and validate
    with tf.device(device):
        for epoch in range(FLAGS.epochs):
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(FLAGS.checkpoint_dir)):
                global_step = tf.train.get_or_create_global_step()
                start = time.time()
                with summary_writer.as_default():
                    train_one_epoch(model, mse_loss, optimizer, train_ds,
                                    FLAGS.log_interval)
                end = time.time()
                print('\nTrain time for epoch #%d (global step %d): %f' % (
                    epoch, global_step.numpy(), end - start))
                with valid_summary_writer.as_default():
                    validate(model, valid_ds)
                all_variables = (
                    model.variables + optimizer.variables() + [global_step])
                tfe.Saver(all_variables).save(
                    checkpoint_prefix, global_step=global_step)
def train_or_infer_spinn(embed, word2index, train_data, dev_data, test_data,
                         config):
    """Perform training or inference on a SPINN model.

    Args:
        embed: The embedding matrix as a float32 numpy array with shape
            [vocabulary_size, word_vector_len]. word_vector_len is the length
            of a word embedding vector.
        word2index: A `dict` mapping word to word index.
        train_data: An instance of `data.SnliData`, for the train split.
        dev_data: Same as above, for the dev split.
        test_data: Same as above, for the test split.
        config: A configuration object. See the argument to this Python
            binary for details.

    Returns:
        If `config.inference_premise` and `config.inference_hypothesis` are
        not `None`, i.e., inference mode: the logits for the possible labels
        of the SNLI data set, as a numpy array of three floats.
        Else: the trainer object.

    Raises:
        ValueError: if only one of config.inference_premise and
            config.inference_hypothesis is specified.
    """
    # TODO(cais): Refactor this function into separate ones for training and
    # inference.
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    if ((config.inference_premise and not config.inference_hypothesis) or
            (not config.inference_premise and config.inference_hypothesis)):
        raise ValueError(
            "--inference_premise and --inference_hypothesis must be both "
            "specified or both unspecified, but only one is specified.")

    if config.inference_premise:
        # Inference mode.
        inference_sentence_pair = [
            data.encode_sentence(config.inference_premise, word2index),
            data.encode_sentence(config.inference_hypothesis, word2index)
        ]
    else:
        inference_sentence_pair = None

    log_header = (
        " Time Epoch Iteration Progress (%Epoch) Loss Dev/Loss"
        " Accuracy Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)

    with tf.device(device), \
         summary_writer.as_default(), \
         tf.contrib.summary.always_record_summaries():
        with tfe.restore_variables_on_create(
                tf.train.latest_checkpoint(config.logdir)):
            model = SNLIClassifier(config, embed)
            global_step = tf.train.get_or_create_global_step()
            trainer = SNLIClassifierTrainer(model, config.lr)

        if inference_sentence_pair:
            # Inference mode.
            with tfe.restore_variables_on_create(
                    tf.train.latest_checkpoint(config.logdir)):
                prem, prem_trans = inference_sentence_pair[0]
                hypo, hypo_trans = inference_sentence_pair[1]
                inference_logits = model(  # pylint: disable=not-callable
                    tf.constant(prem), tf.constant(prem_trans),
                    tf.constant(hypo), tf.constant(hypo_trans),
                    training=False)
                inference_logits = np.array(inference_logits[0][1:])
                max_index = np.argmax(inference_logits)
                print("\nInference logits:")
                for i, (label, logit) in enumerate(
                        zip(data.POSSIBLE_LABELS, inference_logits)):
                    winner_tag = " (winner)" if max_index == i else ""
                    print(" {0:<16}{1:.6f}{2}".format(label + ":", logit,
                                                      winner_tag))
            return inference_logits

        train_len = train_data.num_batches(config.batch_size)
        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in _get_dataset_iterator(
                    train_data, config.batch_size):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                with tfe.restore_variables_on_create(
                        tf.train.latest_checkpoint(config.logdir)):
                    batch_train_loss, batch_train_logits = trainer.train_batch(
                        label, prem, prem_trans, hypo, hypo_trans)
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    all_variables = trainer.variables + [global_step]
                    saver = tfe.Saver(all_variables)
                    saver.save(os.path.join(config.logdir, "ckpt"),
                               global_step=global_step)

                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start, epoch, iterations,
                        1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0, dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start, epoch, iterations,
                        1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8,
                        accuracy_val * 100.0, " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))

    return trainer