def load_eager(name, model, optimizer=None):
    # Restoring only needs the checkpoint directory; latest_checkpoint
    # resolves the newest numbered file prefix inside it.
    checkpoint_dir = "./data/" + name
    if optimizer:
        root = tfe.Checkpoint(optimizer=optimizer, model=model)
    else:
        root = tfe.Checkpoint(model=model)
    root.restore(tf.train.latest_checkpoint(checkpoint_dir))

def save_eager(name, model, optimizer=None):
    checkpoint_dir = "./data/" + name
    os.makedirs(checkpoint_dir, exist_ok=True)
    checkpoint_prefix = os.path.join(checkpoint_dir, name + "_ckpt")
    if optimizer:
        root = tfe.Checkpoint(optimizer=optimizer, model=model)
    else:
        root = tfe.Checkpoint(model=model)
    root.save(file_prefix=checkpoint_prefix)

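A minimal round-trip sketch for the two helpers above. The model and optimizer here are hypothetical stand-ins; any trackable object (e.g. a tf.keras.Model) works with the ./data/<name>/ layout both helpers assume.

# Hypothetical usage of save_eager/load_eager.
model = tf.keras.Sequential([tf.keras.layers.Dense(10)])
optimizer = tf.train.AdamOptimizer()
save_eager("demo", model, optimizer)   # writes ./data/demo/demo_ckpt-*
load_eager("demo", model, optimizer)   # restores the latest checkpoint
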
def main(_):
    tfe.enable_eager_execution()

    # Automatically determine device and data_format
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    # If data_format is defined in FLAGS, it overrides the automatically
    # selected value.
    if FLAGS.data_format is not None:
        data_format = FLAGS.data_format
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    train_ds = mnist_dataset.train(FLAGS.data_dir).shuffle(60000).batch(
        FLAGS.batch_size)
    test_ds = mnist_dataset.test(FLAGS.data_dir).batch(FLAGS.batch_size)

    # Create the model and optimizer
    model = mnist.Model(data_format)
    optimizer = tf.train.MomentumOptimizer(FLAGS.lr, FLAGS.momentum)

    # Create file writers for writing TensorBoard summaries.
    if FLAGS.output_dir:
        # Create directories to which summaries will be written
        # tensorboard --logdir=<output_dir>
        # can then be used to see the recorded summaries.
        train_dir = os.path.join(FLAGS.output_dir, 'train')
        test_dir = os.path.join(FLAGS.output_dir, 'eval')
        tf.gfile.MakeDirs(FLAGS.output_dir)
    else:
        train_dir = None
        test_dir = None
    summary_writer = tf.contrib.summary.create_file_writer(
        train_dir, flush_millis=10000)
    test_summary_writer = tf.contrib.summary.create_file_writer(
        test_dir, flush_millis=10000, name='test')

    # Create and restore checkpoint (if one exists on the path)
    checkpoint_prefix = os.path.join(FLAGS.model_dir, 'ckpt')
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(
        model=model, optimizer=optimizer, step_counter=step_counter)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(tf.train.latest_checkpoint(FLAGS.model_dir))

    # Train and evaluate for a set number of epochs.
    with tf.device(device):
        for _ in range(FLAGS.train_epochs):
            start = time.time()
            with summary_writer.as_default():
                train(model, optimizer, train_ds, step_counter,
                      FLAGS.log_interval)
            end = time.time()
            print('\nTrain time for epoch #%d (%d total steps): %f' %
                  (checkpoint.save_counter.numpy() + 1,
                   step_counter.numpy(), end - start))
            with test_summary_writer.as_default():
                test(model, test_ds)
            checkpoint.save(checkpoint_prefix)

def save_checkpoint(self, epoch, model):
    """Create and save a checkpoint."""
    # Create the checkpoint directory if required
    self.ckpt_dir = os.path.join(self.session_dir, 'checkpoints')
    if not os.path.exists(self.ckpt_dir):
        os.makedirs(self.ckpt_dir)
    self.checkpoint = tfe.Checkpoint(optimizer=self.optimizer,
                                     model=model.arch)

    # Note: This allows the user to specify how many checkpoints should be
    # saved. TensorFlow does not expose a max_to_keep parameter on
    # tfe.Checkpoint, but under the hood it uses a Saver object, so we can
    # hack around this by patching the defaults of Saver.__init__.
    from tensorflow.python.training.saver import Saver
    default_args = list(Saver.__init__.__code__.co_varnames)
    default_values = list(Saver.__init__.__defaults__)
    if 'self' in default_args:
        # Subtract one since default_values has no value for 'self'
        idx = default_args.index('max_to_keep') - 1
        default_values[idx] = self.p.max_num_ckpts_to_keep
        Saver.__init__.__defaults__ = tuple(default_values)
    else:
        assert False

    # Save the checkpoint
    if epoch % self.p.ckpt_save_frequency == 0:
        self.checkpoint.save(os.path.join(self.ckpt_dir, 'ckpt'))

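If patching Saver.__init__.__defaults__ feels too fragile, newer TensorFlow releases (1.13+) expose the same knob directly through tf.train.CheckpointManager. A sketch under that assumption, reusing the checkpoint object built above:

# Alternative sketch (requires TF >= 1.13): CheckpointManager exposes
# max_to_keep without touching private Saver internals.
manager = tf.train.CheckpointManager(
    self.checkpoint, directory=self.ckpt_dir,
    max_to_keep=self.p.max_num_ckpts_to_keep)
manager.save()  # prunes older checkpoints automatically
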
def checkpoint_load(_checkpoint_path, neural_kb, optimizer):
    logger.info('Loading model...')
    logger.info('    neural kb and optimizer')
    checkpoint_model_prefix = os.path.join(_checkpoint_path, "model/")
    model_saver_path = tf.train.latest_checkpoint(checkpoint_model_prefix)

    # old format compatibility
    if os.path.exists(os.path.join(_checkpoint_path, "optim/")):
        import tensorflow.contrib.eager as tfe
        checkpoint_optim_prefix = os.path.join(_checkpoint_path, "optim/")
        optim_checkpoint_path = tf.train.latest_checkpoint(
            checkpoint_optim_prefix)
        if optim_checkpoint_path is not None:
            optim_checkpoint = tfe.Checkpoint(
                optimizer=optimizer,
                optimizer_step=tf.train.get_or_create_global_step())
            optim_checkpoint.restore(optim_checkpoint_path)
            logger.info('    optimiser')
        else:
            logger.info(
                "    ....couldn't find optim/, ignoring it "
                "(loading old model).")
        model_saver = tfe.Saver(neural_kb.variables)
        model_saver.restore(model_saver_path)
    else:
        model_saver = tf.train.Saver(
            neural_kb.variables + optimizer.variables() +
            [tf.train.get_or_create_global_step()])
        model_saver.restore(None, model_saver_path)
    logger.info('... loading done.')

def load(self, model, filename):
    model_objects = {'model': model}
    print("=> loading checkpoint '{}'".format(filename))
    ckpt = tfe.Checkpoint(**model_objects)
    ckpt.restore(filename)
    return model_objects['model']

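restore is lazy: values are matched to variables as they are created, and nothing complains if the file and the model disagree. When eager verification is wanted, the status object that restore returns can be asserted on. A sketch extending the load method above:

# Sketch: Checkpoint.restore() returns a status object; assert_consumed()
# raises if any saved value failed to match a variable in the model.
status = ckpt.restore(filename)
status.assert_consumed()
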
def __init__(self, cfg, net, trainingset, valset, resume=False):
    self.cfg = cfg
    self.net = net
    # Datasets
    self.trainingset = trainingset
    self.valset = valset
    # Using Adam optimizer
    self.optimizer = tf.train.AdamOptimizer(
        learning_rate=self.cfg.LEARNING_RATE)
    # Create global step
    self.global_step = tf.train.get_or_create_global_step()
    # Create checkpoint directory and save checkpoints
    self.epoch = tfe.Variable(0, name='epoch', dtype=tf.float32,
                              trainable=False)
    self.checkpoint_dir = self.cfg.CKPT_PATH
    self.checkpoint_encoder = os.path.join(self.checkpoint_dir, 'model')
    self.root1 = tfe.Checkpoint(
        optimizer=self.optimizer,
        model=self.net,
        optimizer_step=tf.train.get_or_create_global_step())
    # If resume is true, continue from the saved checkpoint
    if resume:
        self.root1.restore(tf.train.latest_checkpoint(self.checkpoint_dir))

def main(args):
    xr, log_igfr_r, labels_r = loadData('NEW_GFR_TRAIN')
    xe, log_igfr_e, labels_e = loadData('NEW_GFR_TEST')
    train_ds = tf.data.Dataset.from_tensor_slices((xr, log_igfr_r, labels_r))
    test_ds = tf.data.Dataset.from_tensor_slices((xe, log_igfr_e, labels_e))
    train_ds = train_ds.shuffle(xr.shape[0]).batch(batch_size)
    # test_ds = test_ds.batch(batch_size)
    test_ds = test_ds.batch(1)

    model = KidneyModel(n_cat)
    init_lr, momentum = args.learning_rate, 0.9

    with tf.device('/cpu:0'):
        lr = tfe.Variable(init_lr, name="learning_rate")
        optimizer = tf.train.AdamOptimizer(lr)
        for epoch in range(args.epochs):
            print('epoch', epoch)
            train_acc = tfe.metrics.Accuracy('train_accuracy')
            total_loss, total_batch = 0.0, 0.0
            for (batch, (x, log_igfr, labels)) in enumerate(
                    tfe.Iterator(train_ds)):
                with tf.GradientTape() as tape:
                    mean, var, logits, igfr = model(x)
                    loss_value = loss(mean, var, logits, igfr, labels,
                                      log_igfr, args.enlarge, args.w_div,
                                      args.w_l2)
                total_loss += loss_value.cpu().numpy()
                total_batch += 1
                train_acc(tf.argmax(logits, axis=1, output_type=tf.int32),
                          tf.argmax(labels, axis=1, output_type=tf.int32))
                grads = tape.gradient(loss_value, model.variables)
                optimizer.apply_gradients(
                    zip(grads, model.variables),
                    global_step=tf.train.get_or_create_global_step())
            print('Learning Rate', lr.numpy())
            if (epoch + 1) % 50 == 0:
                lr.assign(lr.numpy() / 2)
            print('Training acc {}'.format(100 * train_acc.result()))
            print('train_acc', 100 * train_acc.result().cpu().numpy())
            test_acc = test(model, test_ds)
            test2_acc, reses, test3_acc, reses3 = test23(model, test_ds)
            print('test_acc1', test_acc)
            print('avg_loss ', total_loss / total_batch)
            print('test_acc2', test2_acc)
            print('test_acc3', test3_acc)
            for i in range(reses.shape[0]):
                print('Cate %d ' % i, reses[i])

    checkpoint_dir = './saved_models/'
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())
    # Save under the "ckpt" prefix defined above.
    root.save(file_prefix=checkpoint_prefix)

def run(self):
    T = 0  # Worker step counter.
    env = make_atari(self.config.env)
    model = ActorCritic(env.action_space.n, policy=self.config.policy,
                        device=self.config.device)
    loader = tfe.Checkpoint(model=model.policy)
    obs, act, rew = [], [], []
    ob = env.reset()
    done = False
    s = model.policy.s0
    cum_reward = 0
    ep_len = 0
    while T < self.steps:
        try:
            loader.restore(tf.train.latest_checkpoint(self.config.save_dir))
        except Exception:
            continue
        t = 0  # Batch counter.
        s_init = s
        epsilon = 0.6 - (T / self.steps) * 0.5
        while not done and t < self.batch_size:
            logits, v, s = model.forward([ob], s)
            probs = tf.nn.softmax(logits)
            a = greedy(probs, env.action_space.n, epsilon=epsilon)
            next_ob, r, done, _ = env.step(a)
            obs.append(ob)
            act.append(a)
            rew.append(r)
            ob = next_ob
            t += 1
            T += 1
            cum_reward += r
            ep_len += 1
        d_rew = discount(rew, self.config.gamma)
        # 1e-6 is a stability constant.
        d_rew = (d_rew - np.mean(d_rew)) / (np.std(d_rew) + 1e-6)
        grads, loss = model.gradient(obs, d_rew, act, s_init)
        grads, _ = tf.clip_by_global_norm(grads, self.config.max_norm)
        if done:
            print(f"Step: {T}, Len: {ep_len}, BR: {cum_reward}, "
                  f"TL: {loss:.4f}, Epsilon: {epsilon:.2f}")
            s = model.policy.s0
            done = False
            ob = env.reset()
            cum_reward = 0
            ep_len = 0
        obs.clear()
        act.clear()
        rew.clear()
        for i in range(len(grads)):
            grads[i] = grads[i].numpy()
        self.queue.put(grads)

def restore_model(model, optimizer):
    # `model` here is freshly re-initialized; restoring overwrites its weights.
    # Specify the checkpoint directory
    checkpoint_directory = 'models_checkpoints/SimpleNN/'
    # Create model checkpoint
    checkpoint = tfe.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    checkpoint.restore(tf.train.latest_checkpoint(checkpoint_directory))

def restore_checkpoint(self, model):
    """Load a given checkpoint."""
    # Create a checkpoint
    self.checkpoint = tfe.Checkpoint(optimizer=self.create_optimizer(),
                                     model=model.arch)
    # Restore the checkpoint
    self.checkpoint.restore(self.p.ckpt_path)

def initialize_process():
    global env, model, loader, config, logger
    config = Config("config/config.json")
    env = make_atari(config.env)
    # env = gym.make(config.env)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    loader = tfe.Checkpoint(
        model=model.policy,
        optimizer_step=tf.train.get_or_create_global_step())
    logger = Logger("Worker_{}".format(os.getpid()))

def sample(queue, env_name, steps):
    env = make_atari(env_name)
    model = CRPolicy(env.action_space.n)
    loader = tfe.Checkpoint(model=model)
    for roll in range(steps):
        # TODO: replace this retry-on-failure hack with a proper check that
        # a checkpoint exists before restoring.
        try:
            loader.restore(tf.train.latest_checkpoint(CKPT_DIR))
        except Exception:
            continue
        obs, act, rews = [], [], []
        ob = env.reset()
        done = False
        s = model.s0
        while not done:
            logits, v, s = model([ob], s)
            probs = tf.nn.softmax(logits)
            a = boltzmann(probs, env.action_space.n)
            next_ob, r, done, _ = env.step(a)
            obs.append(ob)
            act.append(a)
            rews.append(r)
            ob = next_ob
        d_rews = discount(rews, GAMMA)
        d_rew = (d_rews - np.mean(d_rews)) / (np.std(d_rews) + 1e-6)
        with tf.GradientTape() as tape:
            logits, values, _ = model(obs, model.s0)
            values = tf.squeeze(values)
            advs = tf.constant(d_rew, dtype=tf.float32) - values
            policy = tf.nn.softmax(logits)
            xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
                labels=act, logits=logits)
            p_loss = xentropy * tf.stop_gradient(advs)
            v_loss = tf.square(advs)
            e_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
                labels=policy, logits=logits)
            loss = tf.reduce_mean(p_loss + 0.5 * v_loss - 0.01 * e_loss)
        grads = tape.gradient(loss, model.trainable_weights)
        grads, _ = tf.clip_by_global_norm(grads, MAX_GRAD_NORM)
        print("Step: {0}, Len: {1} BR: {2}, TL: {3:.4f}".format(
            roll, len(obs), np.sum(rews), loss))
        for i in range(len(grads)):
            grads[i] = grads[i].numpy()
        queue.put(grads)

def fit_and_save(model, optimizer, input_data, target):
    model.fit(input_data, target, optimizer, num_epochs=500, verbose=50)
    # Specify checkpoint directory
    checkpoint_directory = 'models_checkpoints/SimpleNN/'
    # Create model checkpoint
    checkpoint = tfe.Checkpoint(
        optimizer=optimizer,
        model=model,
        optimizer_step=tf.train.get_or_create_global_step())
    # Save trained model
    checkpoint.save(file_prefix=checkpoint_directory)

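The keyword names handed to tfe.Checkpoint (optimizer, model, optimizer_step) define the object graph on disk, so the restoring side must construct its checkpoint with the same names, as restore_model above does. A round-trip sketch, with SimpleNN, input_data, and target as hypothetical stand-ins:

# Hypothetical round trip: save with fit_and_save, restore into a fresh
# model with restore_model; the kwarg names must agree on both sides.
model = SimpleNN()
optimizer = tf.train.AdamOptimizer()
fit_and_save(model, optimizer, input_data, target)

fresh_model = SimpleNN()               # uninitialized weights
restore_model(fresh_model, optimizer)  # weights overwritten from checkpoint
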
def build_model(self, initializer=tf.zeros):
    self.model = AtariModel(self.obs_spec["screen"][1],
                            self.obs_spec["minimap"][1],
                            possible_action_num)
    # TODO: Training
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    self.root = tfe.Checkpoint(
        optimizer=optimizer,
        model=self.model,
        optimizer_step=tf.train.get_or_create_global_step())

def create_checkpoint(self, name):
    if name:
        checkpoint = tfe.Checkpoint(**self.get_str2weights())
        checkpoint_dir = "./model/" + name
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        try:
            checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        except Exception as e:
            print(e)
        return checkpoint, checkpoint_prefix
    else:
        return None, None

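Because tfe.Checkpoint accepts arbitrary keyword arguments, a whole dict of named variables can be checkpointed at once, which is what get_str2weights enables here. A self-contained sketch with hypothetical rule names:

# Sketch: checkpoint a dict of variables by unpacking it as kwargs.
# Keys must be valid Python identifiers.
weights = {"rule_0": tfe.Variable(tf.zeros([4])),
           "rule_1": tfe.Variable(tf.ones([4]))}
ckpt = tfe.Checkpoint(**weights)
os.makedirs("./model/demo", exist_ok=True)
ckpt.save("./model/demo/ckpt")
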
def main(_):
    pp = pprint.PrettyPrinter()
    pp.pprint(flags.FLAGS.__flags)

    filenames = glob.glob(data_dir)
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    if not os.path.exists(FLAGS.checkpoint_dir):
        os.makedirs(FLAGS.checkpoint_dir)
    if not os.path.exists(FLAGS.sample_dir):
        os.makedirs(FLAGS.sample_dir)

    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(
            FLAGS.generator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'discriminator_optimizer': tf.train.AdamOptimizer(
            FLAGS.discriminator_learning_rate, FLAGS.beta1, FLAGS.beta2),
        'step_counter': tf.train.get_or_create_global_step()
    }

    summary_writer = tf.contrib.summary.create_file_writer(
        FLAGS.summary_dir, flush_millis=1000)
    checkpoint = tfe.Checkpoint(**model_objects)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
        checkpoint.restore(latest_cpkt)

    dataset = tf.data.TFRecordDataset(filenames).map(
        read_and_decode_with_labels)
    dataset = dataset.shuffle(10000).apply(
        tf.contrib.data.batch_and_drop_remainder(FLAGS.batch_size))

    with tf.device(device):
        for epoch in range(FLAGS.epoch):
            start = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset,
                                batch_size=FLAGS.batch_size,
                                log_interval=FLAGS.log_interval,
                                z_dim=FLAGS.z_dim,
                                device=device,
                                epoch=epoch,
                                **model_objects)
            end = time.time()
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(), end - start))

def __init__(self, cfg, net, testset):
    self.cfg = cfg
    self.net = net
    self.testset = testset
    # Restore the model
    self.optimizer = tf.train.MomentumOptimizer(
        learning_rate=self.cfg.LEARNING_RATE, momentum=self.cfg.MOMENTUM)
    self.checkpoint_dir = self.cfg.CKPT_PATH
    self.checkpoint_encoder = os.path.join(self.checkpoint_dir, 'Model')
    self.root1 = tfe.Checkpoint(
        optimizer=self.optimizer,
        model=self.net,
        optimizer_step=tf.train.get_or_create_global_step())
    self.root1.restore(tf.train.latest_checkpoint(self.checkpoint_dir))

def main():
    args = setup_args()
    log_msg(args)

    vocab_table = lookup_ops.index_table_from_file(
        args.vocab, default_value=args.unk_index)
    train_dataset = create_dataset(args.train, vocab_table, args.bs,
                                   args.eos, args.t)
    valid_dataset = create_dataset(args.valid, vocab_table, args.bs,
                                   args.eos, args.t)

    loss_and_grads_fun = tfe.implicit_value_and_gradients(train_loss)
    lm = LanguageModel(int(vocab_table.size()), d=args.nd, h=args.nh,
                       cell=args.cell)
    log_msg('Model built!')

    best_valid_ppl = compute_ppl(lm, valid_dataset)
    log_msg(f'Start ppl: {best_valid_ppl: 0.4f}')

    if args.opt == 'adam':
        opt = tf.train.AdamOptimizer(args.lr)
    else:
        opt = tf.train.GradientDescentOptimizer(args.lr)

    if not os.path.exists(args.save_dir):
        os.mkdir(args.save_dir)
    ckpt_prefix = os.path.join(args.save_dir, args.ckpt_prefix)
    root = tfe.Checkpoint(optimizer=opt,
                          model=lm,
                          optimizer_step=tf.train.get_or_create_global_step())

    for epoch_num in range(args.num_epochs):
        log_msg(f'Epoch: {epoch_num} START')
        batch_loss = []
        for step_num, train_datum in enumerate(train_dataset, start=1):
            loss_value, gradients = loss_and_grads_fun(lm, train_datum)
            batch_loss.append(loss_value)
            if step_num % args.stats_step == 0:
                # Average over the losses accumulated since the last report.
                log_msg(f'Epoch: {epoch_num} Step: {step_num} '
                        f'Avg Loss: {np.average(np.asarray(batch_loss)): 0.4f}')
                batch_loss = []
            if step_num % args.eval_step == 0:
                better, ppl = check_if_ppl_better(best_valid_ppl, lm,
                                                  valid_dataset, root,
                                                  ckpt_prefix, epoch_num,
                                                  step_num)
                if better:
                    best_valid_ppl = ppl
            opt.apply_gradients(clip_gradients(gradients, args.clip_ratio))
        log_msg(f'Epoch: {epoch_num} END')
        better, ppl = check_if_ppl_better(best_valid_ppl, lm, valid_dataset,
                                          root, ckpt_prefix, epoch_num,
                                          step_num=-1)
        if better:
            best_valid_ppl = ppl

def test():
    config = Config("config/config.json")
    env = make_atari(config.env)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    saver = tfe.Checkpoint(model=model.policy)
    saver.restore(tf.train.latest_checkpoint(config.save_dir))
    ob = env.reset()
    s = model.policy.s0
    while True:
        env.render()
        logits, _, s = model.forward([ob], s)
        probs = tf.nn.softmax(logits)
        a = greedy(probs)
        ob, _, done, _ = env.step(a)
        if done:
            ob = env.reset()

def main(_):
    (device, data_format) = ('/gpu:0', 'channels_first')
    if FLAGS.no_gpu or tfe.num_gpus() <= 0:
        (device, data_format) = ('/cpu:0', 'channels_last')
    print('Using device %s, and data format %s.' % (device, data_format))

    # Load the datasets
    data = input_data.read_data_sets(FLAGS.data_dir)
    dataset = (tf.data.Dataset.from_tensor_slices(data.train.images)
               .shuffle(60000).batch(FLAGS.batch_size))

    # Create the models and optimizers.
    model_objects = {
        'generator': Generator(data_format),
        'discriminator': Discriminator(data_format),
        'generator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'discriminator_optimizer': tf.train.AdamOptimizer(FLAGS.lr),
        'step_counter': tf.train.get_or_create_global_step(),
    }

    # Prepare summary writer and checkpoint info
    summary_writer = tf.contrib.summary.create_summary_file_writer(
        FLAGS.output_dir, flush_millis=1000)
    checkpoint_prefix = os.path.join(FLAGS.checkpoint_dir, 'ckpt')
    latest_cpkt = tf.train.latest_checkpoint(FLAGS.checkpoint_dir)
    if latest_cpkt:
        print('Using latest checkpoint at ' + latest_cpkt)
    checkpoint = tfe.Checkpoint(**model_objects)
    # Restore variables on creation if a checkpoint exists.
    checkpoint.restore(latest_cpkt)

    with tf.device(device):
        for _ in range(100):
            start = time.time()
            with summary_writer.as_default():
                train_one_epoch(dataset=dataset,
                                log_interval=FLAGS.log_interval,
                                noise_dim=FLAGS.noise,
                                **model_objects)
            end = time.time()
            checkpoint.save(checkpoint_prefix)
            print('\nTrain time for epoch #%d (step %d): %f' %
                  (checkpoint.save_counter.numpy(),
                   checkpoint.step_counter.numpy(), end - start))

def freeze_model():
    x = tf.random_normal((1, 300, 400, 3))
    model = ConvModule(64, kernel_size=(3, 3))
    adam = tf.train.AdamOptimizer()
    checkpoint_prefix = os.path.join(flags.model_dir, 'ckpt')
    global_step = tf.train.get_or_create_global_step()
    y = model(x)
    print("y:", y.shape)
    checkpoint = tfe.Checkpoint(model=model, optimizer=adam,
                                step_counter=global_step)
    checkpoint.restore(tf.train.latest_checkpoint(flags.model_dir))
    print("Global_step:", global_step)
    checkpoint.save(checkpoint_prefix)

def train_model(train_file, validation_file, validation_interval, width,
                height, batch_size, n_epochs, checkpoint_folder,
                training_device):
    checkpoint_folder = os.path.join(checkpoint_folder, f'{width}x{height}')
    training_generator = LSUNGenerator(train_file)
    transform = LSUNTransform(image_dimensions=(height, width, 3))
    encoder = Encoder()
    decoder = Decoder()
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    checkpointer = tfe.Checkpoint(encoder=encoder, decoder=decoder,
                                  optimizer=optimizer)
    best_loss = 1e10
    for epoch in range(n_epochs):
        iteration = 0
        dataset = tf.data.Dataset.from_generator(
            generator=lambda: training_generator,
            output_types=tf.string).map(transform).batch(batch_size)
        for batch in dataset:
            with tf.device(training_device):
                loss, grads_and_vars = calculate_gradients(batch, encoder,
                                                           decoder)
                optimizer.apply_gradients(grads_and_vars)
            iteration += 1
            training_logger.info(
                f'Epoch = {epoch}, Iteration = {iteration}, Loss = {loss}')
            if iteration % validation_interval == 0:
                validation_logger.info(
                    f'Epoch: {epoch}, Iteration: {iteration}. '
                    f'Beginning validation pass...')
                validation_generator = LSUNGenerator(validation_file)
                validation_dataset = tf.data.Dataset.from_generator(
                    generator=lambda: validation_generator,
                    output_types=tf.string).map(transform).batch(batch_size)
                losses = list()
                for val_batch in validation_dataset:
                    with tf.device(training_device):
                        val_batch = tf.constant(val_batch)
                        loss = evaluate(val_batch, encoder, decoder)
                        losses.append(loss)
                losses = np.array(losses)
                avg_loss = np.mean(losses)
                min_loss = np.min(losses)
                max_loss = np.max(losses)
                std_loss = np.std(losses)
                validation_logger.info(
                    f'avg: {avg_loss}, std: {std_loss}, '
                    f'min: {min_loss}, max: {max_loss}')
                if avg_loss < best_loss:
                    best_loss = avg_loss
                    validation_logger.info(
                        f'Validation loss is best seen so far. '
                        f'Checkpointing to {checkpoint_folder}...')
                    checkpointer.save(checkpoint_folder)

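One caveat about the final line above: Checkpoint.save expects a file prefix, not a directory, so passing checkpoint_folder directly yields files named like <folder>-1.index beside the folder rather than inside it. A sketch of the more conventional call:

# Sketch: join an explicit prefix so checkpoint files land inside the folder.
checkpoint_prefix = os.path.join(checkpoint_folder, 'ckpt')
checkpointer.save(checkpoint_prefix)  # writes ckpt-1.index, ckpt-1.data-*, ...
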
def train(self, steps=300, name=None):
    """
    :param steps:
    :param name:
    :return: the loss history
    """
    str2weights = {str(key): value
                   for key, value in self.rule_weights.items()}
    if name:
        checkpoint = tfe.Checkpoint(**str2weights)
        checkpoint_dir = "./model/" + name
        checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
        try:
            checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
        except Exception as e:
            print(e)

    losses = []
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.5)
    for i in range(steps):
        grads = self.grad()
        optimizer.apply_gradients(
            zip(grads, self.__all_variables()),
            global_step=tf.train.get_or_create_global_step())
        loss_avg = float(self.loss().numpy())
        losses.append(loss_avg)
        print("-" * 20)
        print("step " + str(i) + " loss is " + str(loss_avg))
        if i % 5 == 0:
            self.show_definition()
            valuation_dict = self.valuation2atoms(self.deduction()).items()
            for atom, value in valuation_dict:
                print(str(atom) + ": " + str(value))
            if name:
                checkpoint.save(checkpoint_prefix)
                pd.Series(np.array(losses)).to_csv(name + ".csv")
        print("-" * 20 + "\n")
    return losses

def train():
    mp.set_start_method('spawn', force=True)
    config = Config("config/config.json")
    env = make_atari(config.env)
    # env = gym.make(config.env)
    step = tf.train.get_or_create_global_step()
    optimizer = tf.train.AdamOptimizer(learning_rate=config.lr)
    model = ActorCritic(env.action_space.n, policy=config.policy)
    saver = tfe.Checkpoint(optimizer=optimizer, model=model.policy,
                           optimizer_step=step)
    pool = mp.Pool(processes=config.processes,
                   initializer=initialize_process)
    saver.restore(tf.train.latest_checkpoint(config.save_dir))
    logger = Logger('global')

    # Initialize model.
    model.forward([env.reset()])

    ts = time.time()
    for t in range(config.steps):
        gradients = []
        roll = pool.map(generate_gradients, [t] * config.processes)
        for tup in zip(*roll):
            averaged = np.mean(tup, axis=0)
            gradients.append(tf.constant(averaged, dtype=tf.float32))
        clipped, _ = tf.clip_by_global_norm(gradients, config.max_norm)
        gnorms = [tf.norm(grad) for grad in clipped]
        logger.log_gradients(gnorms)
        logger.log_weights(model.policy.trainable_variables)
        optimizer.apply_gradients(
            zip(clipped, model.policy.trainable_weights), global_step=step)
        saver.save(file_prefix=config.file_prefix)
        print("Epoch took: {}".format(time.time() - ts))
        ts = time.time()

def train(self, steps=6000, name="test"):
    str2weights = {str(key): value
                   for key, value in self.rule_weights.items()}
    checkpoint = tfe.Checkpoint(**str2weights)
    optimizer = tf.train.RMSPropOptimizer(learning_rate=0.5)
    checkpoint_dir = "./model/" + name
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    try:
        checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
    except Exception as e:
        print(e)
    for i in range(steps):
        grads = self.grad()
        optimizer.apply_gradients(
            zip(grads, self.rule_weights.values()),
            global_step=tf.train.get_or_create_global_step())
        loss_avg = self.loss()
        print("-" * 20)
        print("step " + str(i) + " loss is " + str(loss_avg))
        if i % 5 == 0:
            self.show_definition()
            for atom, value in self.valuation2atoms(self.deduction()).items():
                print(str(atom) + ": " + str(value))
            checkpoint.save(checkpoint_prefix)
        print("-" * 20 + "\n")

def call_umm_segmentation(features, pad, contiguous, random_wins):
    """
    Parameters
    ----------
    features : list of features, each of size (128, 201)
    pad : length of padding
    contiguous : number of contiguous segments
    random_wins : [(start, end)] for all the random windows
    """
    model = CRNN.Model(utils.hidden_dim, utils.num_layers, utils.input_dim)

    # load checkpoint
    checkpoint_prefix = os.path.join(utils.model_dir, utils.model_name)
    step_counter = tf.train.get_or_create_global_step()
    checkpoint = tfe.Checkpoint(model=model, step_counter=step_counter)
    if tf.train.checkpoint_exists(checkpoint_prefix):
        checkpoint.restore(checkpoint_prefix)

    norm_feats = normalization(tf.convert_to_tensor(features))
    logit = model(norm_feats, training=False)
    time_segments = compute_timeline(logit, pad, contiguous, random_wins)
    return time_segments

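Restoring from a fixed checkpoint_prefix only works when utils.model_name names one specific numbered checkpoint. A sketch of the more common pattern, which restores whatever is newest in the directory (same checkpoint object as above):

# Sketch: pick up the newest checkpoint; latest_checkpoint returns None
# when the directory holds no checkpoints.
latest = tf.train.latest_checkpoint(utils.model_dir)
if latest is not None:
    checkpoint.restore(latest)
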
def train_or_infer_spinn(embed, word2index, train_data, dev_data, test_data,
                         config):
    """Perform Training or Inference on a SPINN model.

    Args:
      embed: The embedding matrix as a float32 numpy array with shape
        [vocabulary_size, word_vector_len]. word_vector_len is the length of
        a word embedding vector.
      word2index: A `dict` mapping word to word index.
      train_data: An instance of `data.SnliData`, for the train split.
      dev_data: Same as above, for the dev split.
      test_data: Same as above, for the test split.
      config: A configuration object. See the argument to this Python binary
        for details.

    Returns:
      If `config.inference_premise` and `config.inference_hypothesis` are
        not `None`, i.e., inference mode: the logits for the possible labels
        of the SNLI data set, as a `Tensor` of three floats.
      else:
        The trainer object.

    Raises:
      ValueError: if only one of config.inference_premise and
        config.inference_hypothesis is specified.
    """
    # TODO(cais): Refactor this function into separate one for training and
    #   inference.
    use_gpu = tfe.num_gpus() > 0 and not config.force_cpu
    device = "gpu:0" if use_gpu else "cpu:0"
    print("Using device: %s" % device)

    if ((config.inference_premise and not config.inference_hypothesis) or
            (not config.inference_premise and config.inference_hypothesis)):
        raise ValueError(
            "--inference_premise and --inference_hypothesis must be both "
            "specified or both unspecified, but only one is specified.")

    if config.inference_premise:
        # Inference mode.
        inference_sentence_pair = [
            data.encode_sentence(config.inference_premise, word2index),
            data.encode_sentence(config.inference_hypothesis, word2index)]
    else:
        inference_sentence_pair = None

    log_header = (
        "  Time Epoch Iteration Progress    (%Epoch)   Loss   Dev/Loss"
        "     Accuracy  Dev/Accuracy")
    log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} {} "
        "{:12.4f} {}")
    dev_log_template = (
        "{:>6.0f} {:>5.0f} {:>9.0f} {:>5.0f}/{:<5.0f} {:>7.0f}% {:>8.6f} "
        "{:8.6f} {:12.4f} {:12.4f}")

    summary_writer = tf.contrib.summary.create_file_writer(
        config.logdir, flush_millis=10000)

    with tf.device(device), \
         summary_writer.as_default(), \
         tf.contrib.summary.always_record_summaries():
        model = SNLIClassifier(config, embed)
        global_step = tf.train.get_or_create_global_step()
        trainer = SNLIClassifierTrainer(model, config.lr)
        checkpoint = tfe.Checkpoint(trainer=trainer, global_step=global_step)
        checkpoint.restore(tf.train.latest_checkpoint(config.logdir))

        if inference_sentence_pair:
            # Inference mode.
            prem, prem_trans = inference_sentence_pair[0]
            hypo, hypo_trans = inference_sentence_pair[1]
            inference_logits = model(
                tf.constant(prem), tf.constant(prem_trans),
                tf.constant(hypo), tf.constant(hypo_trans), training=False)
            inference_logits = inference_logits[0][1:]
            max_index = tf.argmax(inference_logits)
            print("\nInference logits:")
            for i, (label, logit) in enumerate(
                    zip(data.POSSIBLE_LABELS, inference_logits)):
                winner_tag = " (winner)" if max_index == i else ""
                print("  {0:<16}{1:.6f}{2}".format(label + ":", logit,
                                                   winner_tag))
            return inference_logits

        train_len = train_data.num_batches(config.batch_size)
        start = time.time()
        iterations = 0
        mean_loss = tfe.metrics.Mean()
        accuracy = tfe.metrics.Accuracy()
        print(log_header)
        for epoch in xrange(config.epochs):
            batch_idx = 0
            for label, prem, prem_trans, hypo, hypo_trans in (
                    _get_dataset_iterator(train_data, config.batch_size)):
                if use_gpu:
                    label, prem, hypo = label.gpu(), prem.gpu(), hypo.gpu()
                    # prem_trans and hypo_trans are used for dynamic control
                    # flow and can remain on CPU. Same in
                    # _evaluate_on_dataset().

                iterations += 1
                batch_train_loss, batch_train_logits = trainer.train_batch(
                    label, prem, prem_trans, hypo, hypo_trans)
                batch_size = tf.shape(label)[0]
                mean_loss(batch_train_loss.numpy(),
                          weights=batch_size.gpu() if use_gpu else batch_size)
                accuracy(tf.argmax(batch_train_logits, axis=1), label)

                if iterations % config.save_every == 0:
                    checkpoint.save(os.path.join(config.logdir, "ckpt"))

                if iterations % config.dev_every == 0:
                    dev_loss, dev_frac_correct = _evaluate_on_dataset(
                        dev_data, config.batch_size, trainer, use_gpu)
                    print(dev_log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss.result(), dev_loss,
                        accuracy.result() * 100.0,
                        dev_frac_correct * 100.0))
                    tf.contrib.summary.scalar("dev/loss", dev_loss)
                    tf.contrib.summary.scalar("dev/accuracy",
                                              dev_frac_correct)
                elif iterations % config.log_every == 0:
                    mean_loss_val = mean_loss.result()
                    accuracy_val = accuracy.result()
                    print(log_template.format(
                        time.time() - start,
                        epoch, iterations, 1 + batch_idx, train_len,
                        100.0 * (1 + batch_idx) / train_len,
                        mean_loss_val, " " * 8, accuracy_val * 100.0,
                        " " * 12))
                    tf.contrib.summary.scalar("train/loss", mean_loss_val)
                    tf.contrib.summary.scalar("train/accuracy", accuracy_val)
                    # Reset metrics.
                    mean_loss = tfe.metrics.Mean()
                    accuracy = tfe.metrics.Accuracy()

                batch_idx += 1
            if (epoch + 1) % config.lr_decay_every == 0:
                trainer.decay_learning_rate(config.lr_decay_by)

        test_loss, test_frac_correct = _evaluate_on_dataset(
            test_data, config.batch_size, trainer, use_gpu)
        print("Final test loss: %g; accuracy: %g%%" %
              (test_loss, test_frac_correct * 100.0))

    return trainer

from __future__ import absolute_import, division, print_function

import os

import matplotlib.pyplot as plt
import tensorflow as tf
import tensorflow.contrib.eager as tfe

tf.enable_eager_execution()

from iris import Iris

iris = Iris()
model = iris.model
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)

checkpoint_dir = './iris_model'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
root = tfe.Checkpoint(optimizer=optimizer,
                      model=model,
                      optimizer_step=tf.train.get_or_create_global_step())
root.restore(tf.train.latest_checkpoint(checkpoint_dir))

iris.test()

def main():
    open('output_summary.csv', 'w').close()

    # Constants variables
    NUM_TRAIN_SAMPLES = 72485
    NUM_TEST_SAMPLES = 26528

    # Editable variables
    num_labeled_samples = 5126
    num_validation_samples = 0
    batch_size = 25
    epochs = 200
    max_learning_rate = 0.003
    initial_beta1 = 0.9
    final_beta1 = 0.5
    checkpoint_directory = './checkpoints/PiModel'
    tensorboard_logs_directory = './logs/PiModel'

    # Assign these as tfe.Variable since we will change them across epochs
    learning_rate = tfe.Variable(max_learning_rate)
    beta_1 = tfe.Variable(initial_beta1)
    outputArr = np.array([])

    # Download and save the dataset in TFRecords
    # loader = SvnhLoader('./data', NUM_TRAIN_SAMPLES,
    #                     num_validation_samples, num_labeled_samples)
    loader = FnLoader('./fn_data', NUM_TRAIN_SAMPLES,
                      num_validation_samples, num_labeled_samples)
    loader.download_images_and_generate_tf_record()

    # Generate data loaders
    train_labeled_iterator, train_unlabeled_iterator, validation_iterator, \
        test_iterator = loader.load_dataset(batch_size, epochs)

    batches_per_epoch = int(num_labeled_samples / batch_size)
    batches_per_epoch_val = int(num_validation_samples / batch_size)

    model = PiModel()
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate,
                                       beta1=beta_1, beta2=0.999)
    max_unsupervised_weight = 100 * num_labeled_samples / \
        (NUM_TRAIN_SAMPLES - num_validation_samples)

    best_val_accuracy = 0
    global_step = tf.train.get_or_create_global_step()
    writer = tf.contrib.summary.create_file_writer(tensorboard_logs_directory)
    writer.set_as_default()

    for epoch in range(epochs):
        rampdown_value = ramp_down_function(epoch, epochs)
        rampup_value = ramp_up_function(epoch)

        if epoch == 0:
            unsupervised_weight = 0
        else:
            unsupervised_weight = max_unsupervised_weight * rampup_value

        learning_rate.assign(
            rampup_value * rampdown_value * max_learning_rate)
        beta_1.assign(rampdown_value * initial_beta1 +
                      (1.0 - rampdown_value) * final_beta1)

        epoch_loss_avg = tfe.metrics.Mean()
        epoch_accuracy = tfe.metrics.Accuracy()
        epoch_loss_avg_val = tfe.metrics.Mean()
        epoch_accuracy_val = tfe.metrics.Accuracy()

        for batch_nr in range(batches_per_epoch):
            X_labeled_train, y_labeled_train = \
                train_labeled_iterator.get_next()
            X_unlabeled_train, _ = train_unlabeled_iterator.get_next()

            loss_val, grads = pi_model_gradients(X_labeled_train,
                                                 y_labeled_train,
                                                 X_unlabeled_train, model,
                                                 unsupervised_weight)
            optimizer.apply_gradients(zip(grads, model.variables),
                                      global_step=global_step)

            epoch_loss_avg(loss_val)
            num_test_batches = int(NUM_TEST_SAMPLES / batch_size)
            pred = model(X_labeled_train)
            outputArr = np.array([])
            epoch_accuracy(tf.argmax(pred, 1), tf.argmax(y_labeled_train, 1))

            # On the last batch of the epoch, evaluate on the test iterator.
            if batch_nr == batches_per_epoch - 1:
                for test_batch in range(num_test_batches):
                    X_val, y_val = test_iterator.get_next()
                    y_val_predictions = model(X_val, training=False)
                    y_pred = tf.argmax(y_val_predictions, 1)
                    y_true = tf.argmax(y_val, 1)
                    y_pred_epoch = np.asarray(y_pred)
                    y_true_epoch = np.asarray(y_true)

                    prec_epch = sk.metrics.precision_score(y_true_epoch,
                                                           y_pred_epoch)
                    rec_epch = sk.metrics.recall_score(y_true_epoch,
                                                       y_pred_epoch)
                    f1_epch = sk.metrics.f1_score(y_true_epoch, y_pred_epoch)

                    epoch_loss_avg_val(tf.losses.softmax_cross_entropy(
                        y_val, y_val_predictions))
                    epoch_accuracy_val(tf.argmax(y_val_predictions, 1),
                                       tf.argmax(y_val, 1))

        arrResult = "{:03d}, {:02.6%}, {:02.6%}, {:.4%}, {:.4%}, {:.4%} ".format(
            epoch + 1, epoch_accuracy.result(), epoch_accuracy_val.result(),
            prec_epch, rec_epch, f1_epch)
        out = open('output_summary.csv', 'a+')
        out.write(arrResult + '\n')

        print("Epoch {:03d}/{:03d}: Train Loss: {:9.7f}, "
              "Train Accuracy: {:02.6%}, lr={:.9f}, "
              "unsupervised weight={:5.3f}, beta1={:.9f}".format(
                  epoch + 1, epochs, epoch_loss_avg.result(),
                  epoch_accuracy.result(), learning_rate.numpy(),
                  unsupervised_weight, beta_1.numpy()))
        print(epoch_accuracy_val)

        # Save a checkpoint when accuracy improves (note: epoch_accuracy
        # is the training accuracy, despite the best_val_accuracy name).
        if best_val_accuracy < epoch_accuracy.result():
            best_val_accuracy = epoch_accuracy.result()
            checkpoint = tfe.Checkpoint(optimizer=optimizer,
                                        model=model,
                                        optimizer_step=global_step)
            checkpoint.save(file_prefix=checkpoint_directory)

    # print('\nTrain Ended! Best Validation accuracy = {}\n'.format(
    #     best_val_accuracy))

    # Load the best model
    root = tfe.Checkpoint(optimizer=optimizer,
                          model=model,
                          optimizer_step=tf.train.get_or_create_global_step())
    root.restore(tf.train.latest_checkpoint(checkpoint_directory))

    # Evaluate on the final test set
    test_accuracy = tfe.metrics.Accuracy()
    for test_batch in range(int(num_test_batches)):
        X_test, y_test = test_iterator.get_next()
        y_test_predictions = model(X_test, training=False)
        test_accuracy(tf.argmax(y_test_predictions, 1), tf.argmax(y_test, 1))
        y_pred = tf.argmax(y_test_predictions, 1)
        y_true = tf.argmax(y_test, 1)
        y_pred = np.asarray(y_pred)
        y_true = np.asarray(y_true)
        a = sk.metrics.precision_score(y_true, y_pred)
        b = sk.metrics.recall_score(y_true, y_pred)
        c = sk.metrics.f1_score(y_true, y_pred)

    print("Precision", a)
    print("Recall", b)
    print("f1_score", c)
    print(tf.argmax(y_test_predictions))
    print(tf.argmax(y_test))
    print("Final Test Accuracy: {:.6%}".format(test_accuracy.result()))