def validate(data, model):
    """Restore the saved model and print its mean test accuracy and loss.

    Args:
        data: Dataset object. ``data.test`` supplies ``num_examples`` and
            ``next_batch``; ``data.image_axis_size`` is the side length used
            to reshape every batch to ``[-1, size, size, 1]``.
        model: Model exposing the ``X`` / ``y`` placeholders and the ``loss``
            and ``accuracy`` tensors.
    """
    batch_count = data.test.num_examples // cfg.batch_size
    saver = tf.train.Saver()
    ckpt = u.get_checkpoint_path()

    with tf.Session() as sess:
        saver.restore(sess, ckpt)

        # One (loss, accuracy) pair per evaluated batch.
        batch_metrics = []
        for batch_no in range(1, batch_count + 1):
            images, labels = data.test.next_batch(cfg.batch_size)
            feed = {
                model.X: images.reshape(
                    [-1, data.image_axis_size, data.image_axis_size, 1]),
                model.y: labels,
            }
            batch_loss, batch_acc = sess.run(
                [model.loss, model.accuracy], feed_dict=feed)
            batch_metrics.append((batch_loss, batch_acc))

            # "\r" rewrites the same console line; the padded end clears
            # leftovers from a longer previous message.
            progress = batch_no * 100 / batch_count
            print("\rEvaluating the model: {}/{} ({:.1f}%)".format(
                batch_no, batch_count, progress),
                  end=" " * 10)

        mean_loss = np.mean([pair[0] for pair in batch_metrics])
        mean_acc = np.mean([pair[1] for pair in batch_metrics])
        print("\rFinal test accuracy: {:.4f}% Loss: {:.6f}".format(
            mean_acc * 100, mean_loss))
def evaluate_stats(model_name):
    """Compute and save every statistics report for one trained model.

    Each report is attempted independently: a failure in one is printed and
    the remaining reports still run.

    Args:
        model_name: Name used both to locate the checkpoint (via
            ``get_checkpoint_path``) and as the sub-directory of
            ``STATS_OUTPUT_DIR`` that receives the output.
    """
    checkpoint_path = get_checkpoint_path(model_name)
    output_dir = os.path.join(STATS_OUTPUT_DIR, model_name)
    os.makedirs(output_dir, exist_ok=True)

    net = FinalResnet().to(DEVICE)
    exp = ExperimentStatistics(net,
                               output_dir,
                               checkpoint_path,
                               is_adaptive=ADAPTIVE)

    # (method name, keyword arguments, error-message template) per report.
    # The method is looked up inside the ``try`` so that a missing attribute
    # is reported like any other failure instead of aborting the whole run.
    reports = (
        ('optimizer_stats', {},
         'Error in saving OPTIMIZER stats for {}: {}'),
        ('parent_epoch_losses', {},
         'Error saving PARENT EPOCH statistics for {}: {}'),
        ('iteration_stats', {},
         'Error saving ITERATION statistics for {}: {}'),
        ('network_performance', {
            'metric_names': ['loss'],
            'calc_train': False
        }, 'Error saving NETWORK PERFORMANCE statistics for {}: {}'),
    )

    for method_name, kwargs, error_template in reports:
        try:
            getattr(exp, method_name)(**kwargs)
        # ``Exception`` rather than ``BaseException``: Ctrl-C
        # (KeyboardInterrupt) and SystemExit must still stop the program
        # instead of being reported as a failed statistics run.
        except Exception as e:
            print(error_template.format(checkpoint_path, e))
def make_init_fn(self, chpt_path):
    """Build a callable that initialises model variables from a checkpoint.

    Args:
        chpt_path: Checkpoint to load. When ``None``, the latest checkpoint
            in ``self.get_save_dir()`` is looked up instead.

    Returns:
        A function taking a session and running the assignment op, or
        ``None`` when no checkpoint path was given and none could be found.
    """
    if chpt_path is not None:
        print('Initializing from provided checkpoint: {}'.format(chpt_path))
    else:
        chpt_path = get_checkpoint_path(self.get_save_dir())
        if chpt_path is None:
            print('No checkpoint found for initialization')
            return None
        print('Initializing from previous checkpoint: {}'.format(chpt_path))

    # Everything outside the excluded scopes is a restore candidate ...
    candidates = slim.get_variables_to_restore(exclude=self.exclude_scopes)
    candidate_names = [variable.op.name for variable in candidates]
    print('Variables to restore: {}'.format(candidate_names))

    # ... and is then filtered through remove_missing against the checkpoint.
    restorable = remove_missing(candidates, chpt_path)
    assign_op, feed_dict = slim.assign_from_checkpoint(chpt_path, restorable)
    sys.stdout.flush()

    def init_fn(sess):
        """Run the checkpoint assignment in the given session."""
        sess.run(assign_op, feed_dict)

    return init_fn
class final_model:
    """Single-image, one-token-at-a-time inference graph.

    Everything below runs at class-definition time: the statements build
    TensorFlow ops and store them as class attributes, so the class is used
    as a namespace rather than instantiated.
    """
    # CNN encoder plus the preprocessing function that goes with it.
    encoder, preprocess_for_model = featureExtraction.get_cnn_encoder()
    # Load the trained weights into the session owned by the `train` module.
    train.saver.restore(train.s, utils.get_checkpoint_path())

    # LSTM cell / hidden state kept in variables so that it persists between
    # successive `one_step` runs (batch size is fixed to 1).
    lstm_c = tf.Variable(tf.zeros([1, LSTM_UNITS]), name="cell")
    lstm_h = tf.Variable(tf.zeros([1, LSTM_UNITS]), name="hidden")

    # Input: exactly one IMG_SIZE x IMG_SIZE image with 3 channels.
    input_images = tf.placeholder('float32', [1, IMG_SIZE, IMG_SIZE, 3],
                                  name='images')

    # Image embedding -> bottleneck -> initial LSTM state.
    img_embeds = encoder(input_images)
    # Both the cell and the hidden state start from the same tensor.
    init_c = init_h = decoder.img_embed_bottleneck_to_h0(
        decoder.img_embed_to_bottleneck(img_embeds))
    # Running `init_lstm` writes the image-conditioned state into the
    # variables above.
    init_lstm = tf.assign(lstm_c, init_c), tf.assign(lstm_h, init_h)

    # Input: id of the current token (batch of one).
    current_word = tf.placeholder('int32', [1], name='current_input')
    word_embed = decoder.word_embed(current_word)

    # One LSTM step; index [1] selects the new (c, h) state pair from the
    # cell's return value.
    new_c, new_h = decoder.lstm(word_embed,
                                tf.nn.rnn_cell.LSTMStateTuple(lstm_c,
                                                              lstm_h))[1]

    # Distribution over the next token computed from the new hidden state.
    new_logits = decoder.token_logits(decoder.token_logits_bottleneck(new_h))
    new_probs = tf.nn.softmax(new_logits)

    # Running `one_step` yields the next-token probabilities and stores the
    # new LSTM state back into the variables.
    one_step = new_probs, tf.assign(lstm_c, new_c), tf.assign(lstm_h, new_h)
def train(env, agent):
    """Run the play/learn loop until the window is closed or n_steps is passed.

    Relies on module-level names not defined here: ``model_name``,
    ``session``, ``should_display``, ``training_interval`` and ``n_steps``.

    Args:
        env: Game environment; used for display, reset, screenshots, frame
            history, stepping and score bookkeeping.
        agent: Learning agent; used for acting, storing transitions and
            training.
    """
    file_writer = get_file_writer(model_name=model_name, session=session)
    checkpoint_path = get_checkpoint_path(model_name=model_name)

    running = True
    done = False
    iteration = 0  # loop iterations, incremented on every pass
    n_games = 0  # finished games
    mean_score = 0
    EMA = 0  # exponential moving average of env.snake.total per game
    alpha = 0.005  # weight of the newest game in the EMA

    with session:
        # Iteration count before which no training step is performed.
        training_start = agent.start(checkpoint_path)

        while running:
            iteration += 1

            # Only display while the finished-game count is a multiple of 100.
            if should_display and n_games % 100 == 0:
                env.display()

            if done:  # Game over, start a new game
                n_games += 1
                if n_games == 1:
                    # The first game seeds the moving average.
                    EMA = env.snake.total
                else:
                    EMA = alpha * env.snake.total + (1 - alpha) * EMA
                env.reset()
                mean_score = env.total_rewards / n_games

            for event in pygame.event.get(
            ):  # Stop the program if we quit the game
                if event.type == pygame.QUIT:
                    running = False

            # Observe, act, and store the transition.
            observation = env.screenshot()
            cur_state = env.get_last_frames(observation)
            step = agent.global_step.eval()
            action, epsilon = agent.act(cur_state, step)
            new_state, reward, done = env.step(action)
            agent.remember(cur_state, action, reward, new_state, done)

            # Only train at regular intervals
            if iteration < training_start or iteration % training_interval != 0:
                continue

            # Train the agent
            agent.train(checkpoint_path, file_writer, mean_score)

            if iteration % 500 == 0:
                # "\r" with end="" keeps the progress report on one line.
                print(
                    "\rTraining step {}/{} ({:.1f})%\t Record {:.2f} \t Mean score {:.2f} \t EMA {:.2f} \t epsilon {:.2f}"
                    .format(step, n_steps, step * 100 / n_steps, env.record,
                            mean_score, EMA, epsilon),
                    end="")

            # `step` is the value read before this iteration's training call.
            if step > n_steps:
                break
def test_classifier(self, ckpt_dir):
    """Evaluate every linear classifier head on the test set.

    Args:
        ckpt_dir: Directory whose latest checkpoint (as resolved by
            ``get_checkpoint_path``) is restored before evaluation.

    Returns:
        NumPy array with one accuracy per entry of ``self.model.feats_IDs``:
        the number of correct predictions summed over
        ``self.num_eval_steps`` batches, divided by
        ``self.data_generator.num_samples``.
    """
    print('Restoring from: {}'.format(ckpt_dir))

    g = tf.Graph()
    with g.as_default():
        # Get test batches
        batch_queue = self.get_data_queue()
        imgs_test, labels_test = batch_queue.get_next()
        imgs_test.set_shape([
            self.model.batch_size,
        ] + self.model.im_shape)

        # Get predictions
        predictions = self.model.linear_classifiers(
            imgs_test, self.data_generator.num_classes, training=False)

        # One "number of correct predictions in this batch" op per head.
        # zip() is kept so that the number of heads evaluated stays bounded
        # by len(feats_IDs), exactly as before.
        num_corrects_list = []
        for preds, _ in zip(predictions, self.model.feats_IDs):
            preds_test = tf.argmax(preds, 1)
            correct_preds = tf.equal(preds_test, labels_test)
            # tf.cast replaces the deprecated tf.to_float.
            num_correct = tf.reduce_sum(tf.cast(correct_preds, tf.float32))
            num_corrects_list.append(num_correct)

        init = tf.global_variables_initializer()

        # The context manager closes the session (and frees its resources)
        # even when restoring or evaluating raises.
        with tf.Session() as sess:
            sess.run(init)

            prev_ckpt = get_checkpoint_path(ckpt_dir)
            print('Restoring from previous checkpoint: {}'.format(prev_ckpt))
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, prev_ckpt)

            n_cor_np = np.zeros([len(self.model.feats_IDs)])
            for _ in range(self.num_eval_steps):
                n_cor_np += sess.run(num_corrects_list)

        acc = n_cor_np / self.data_generator.num_samples
        print('Accuracy: {}'.format(acc))

    return acc
def test_gan(self, num_comp=10000, ckpt=None):
    """Compute the Frechet Inception Distance and the Inception Score.

    Args:
        num_comp: Forwarded to ``self.get_activations`` (number of samples
            to compare).
        ckpt: Checkpoint to evaluate. When falsy, the latest checkpoint in
            ``self.get_save_dir()`` is used.

    Returns:
        A two-element list ``[fid, inception_score]`` as returned by
        ``Session.run``.
    """
    if not ckpt:
        ckpt = get_checkpoint_path(self.get_save_dir())

    f_act, r_act, f_log = self.get_activations(num_comp, ckpt)

    g = tf.Graph()
    with g.as_default():
        # Placeholders for FID. Both activation placeholders take the shape
        # of the fake activations.
        a_shape = f_act.shape
        real_acts = tf.placeholder(tf.float32, shape=a_shape)
        fake_acts = tf.placeholder(tf.float32, shape=a_shape)
        l_shape = f_log.shape
        fake_logs = tf.placeholder(tf.float32, shape=l_shape)

        # Compute Frechet Inception Distance.
        fid = tfgan.eval.frechet_classifier_distance_from_activations(
            real_acts, fake_acts)
        i_s = tfgan.eval.classifier_score_from_logits(fake_logs)

    # The context manager releases the session once the metrics are
    # computed; previously it was never closed.
    with tf.Session(graph=g) as sess:
        return sess.run([fid, i_s],
                        feed_dict={
                            real_acts: r_act,
                            fake_acts: f_act,
                            fake_logs: f_log
                        })
def show_images(data, model):
    """Plot the first five test images and their decoder reconstructions.

    The first figure shows the input images titled with their labels
    ("L:"), the second shows the reconstructions titled with the predicted
    class ("P:").

    Args:
        data: Dataset object providing ``test.images``, ``test.labels`` and
            ``image_axis_size``.
        model: Model exposing ``saver``, the ``X`` / ``y`` placeholders and
            the ``caps2_output``, ``decoder_output`` and ``y_pred`` tensors.
    """
    ckpt = u.get_checkpoint_path()
    n_samples = 5
    batch = data.test.images[:n_samples].reshape(
        [-1, data.image_axis_size, data.image_axis_size, 1])

    fetches = [model.caps2_output, model.decoder_output, model.y_pred]
    with tf.Session() as sess:
        model.saver.restore(sess, ckpt)
        # Labels are fed as an empty array; the capsule output is fetched
        # but not used below.
        _, decoded, predicted = sess.run(fetches,
                                         feed_dict={
                                             model.X:
                                             batch,
                                             model.y:
                                             np.array([], dtype=np.int64)
                                         })

    originals = batch.reshape(-1, data.image_axis_size, data.image_axis_size)
    reconstructions = decoded.reshape(
        [-1, data.image_axis_size, data.image_axis_size])
    figure_size = (n_samples * 2, 3)

    # Figure 1: inputs with their labels.
    plt.figure(figsize=figure_size)
    for slot in range(1, n_samples + 1):
        idx = slot - 1
        plt.subplot(1, n_samples, slot)
        plt.imshow(originals[idx], cmap="binary")
        plt.title("L:" + str(data.test.labels[idx]))
        plt.axis("off")
    plt.show()

    # Figure 2: reconstructions with the predicted class.
    plt.figure(figsize=figure_size)
    for slot in range(1, n_samples + 1):
        idx = slot - 1
        plt.subplot(1, n_samples, slot)
        plt.title("P:" + str(predicted[idx]))
        plt.imshow(reconstructions[idx], cmap="binary")
        plt.axis("off")
    plt.show()
def make_init_fn(self, chpt_path):
    """Build a callable that restores the ``restore_scopes`` variables.

    Args:
        chpt_path: Checkpoint to load. When ``None``, the checkpoint is
            looked up under ``LOG_DIR/<ae name>_<dataset name>/``.

    Returns:
        A function taking a session; it prints the checkpoint path and runs
        the assignment op.
    """
    if chpt_path is None:
        ae_run_name = '{}_{}/'.format(self.model.ae.name, self.dataset.name)
        chpt_path = get_checkpoint_path(os.path.join(LOG_DIR, ae_run_name))

    # Candidates come from the configured scopes and are then filtered
    # through remove_missing against the checkpoint.
    candidates = slim.get_variables_to_restore(include=self.restore_scopes)
    candidate_names = [variable.op.name for variable in candidates]
    print('Variables to restore: {}'.format(candidate_names))
    restorable = remove_missing(candidates, chpt_path)

    assign_op, feed_dict = slim.assign_from_checkpoint(chpt_path, restorable)
    sys.stdout.flush()

    def init_fn(sess):
        """Announce the source checkpoint and run the assignment."""
        print('Restoring from: {}'.format(chpt_path))
        sess.run(assign_op, feed_dict)

    return init_fn
def train(experiment_name, pretrained_model_name=None, stats_manager=None):
    """Run an adaptive training experiment, optionally from a checkpoint.

    Args:
        experiment_name: Passed to the experiment as its output directory.
        pretrained_model_name: When given, the checkpoint resolved by
            ``get_checkpoint_path`` is loaded onto ``DEVICE`` and handed to
            the experiment as ``pretrained_data``.
        stats_manager: Statistics collector for the experiment. When
            ``None`` (the default) a fresh ``StatsManager`` is created for
            this call.
    """
    # Created per call: a ``StatsManager()`` default in the signature is
    # evaluated once at definition time, so every call relying on the
    # default would share (and keep accumulating into) one instance.
    if stats_manager is None:
        stats_manager = StatsManager()

    net = FinalResnet()

    if pretrained_model_name is not None:
        checkpoint_path = get_checkpoint_path(pretrained_model_name)
        data = torch.load(checkpoint_path, map_location=DEVICE)
    else:
        data = None

    adver_net = AdverserialNetwork()
    exp = AdaptiveExperiment(net,
                             adver_net,
                             stats_manager,
                             output_dir=experiment_name,
                             perform_validation_during_training=False,
                             pretrained_data=data)
    exp.run(num_epochs=ROOT_CONFIG['num_epochs'],
            plot=lambda e: report_loss(e))
def make_init_fn(self, chpt_path):
    """Build a function that initialises the first discriminator conv layers.

    Args:
        chpt_path: Checkpoint to load. When ``None``, the checkpoint is
            looked up under ``LOG_DIR/<model name>_<dataset name>/``.

    Returns:
        The function produced by ``assign_from_checkpoint_fn``, or ``None``
        when ``self.num_conv2init`` is 0.
    """
    if self.num_conv2init == 0:
        return None

    if chpt_path is None:
        run_dir = '{}_{}/'.format(self.model.name, self.dataset.name)
        chpt_path = get_checkpoint_path(os.path.join(LOG_DIR, run_dir))

    # Collect the variables of discriminator/conv_1 .. conv_<num_conv2init>,
    # always leaving out the fully connected layers.
    selected = []
    for layer_no in range(1, self.num_conv2init + 1):
        selected.extend(
            slim.get_variables_to_restore(
                include=['discriminator/conv_{}'.format(layer_no)],
                exclude=['discriminator/fully_connected']))

    init_fn = assign_from_checkpoint_fn(chpt_path,
                                        selected,
                                        ignore_missing_vars=True)
    selected_names = [variable.op.name for variable in selected]
    print('Variables to restore: {}'.format(selected_names))
    sys.stdout.flush()
    return init_fn
def main():
    """Play challenge matches between every pair of selected checkpoints.

    For each challenge round, each ordered pair of checkpoints (optionally
    including a checkpoint against itself) plays one MCTS-driven game. Wins,
    losses and draws are accumulated in ``results`` and printed after every
    round; the side that moves first alternates between rounds.
    """
    args = parse_args()

    # Validate the requested game mode before loading any configuration.
    valid_modes_list = utils.get_valid_game_modes()
    valid_modes_string = utils.get_valid_game_modes_string()
    if args.mode not in valid_modes_list:
        print('Invalid game mode informed. Please inform a mode with ' +
              '--mode=mode_name, where mode_name is one of the following ' +
              '{%s}' % valid_modes_string)
        sys.exit()

    gconf = utils.get_game_config(args.mode, 'challenge')
    # A positive command-line value overrides the configured round count.
    if args.num_challenges > 0:
        gconf.num_challenges = args.num_challenges

    if args.game_type == 'moku':
        (game_config_string, game_manager_module, game_manager_kwargs,
         game_manager_io_module, game_manager_io_kwargs) = \
            utils.generate_moku_manager_params(
                gconf.drop_mode, gconf.moku_size, gconf.board_size,
                args.gpu_id, gconf.num_res_layers, gconf.num_channels)
    else:
        raise NotImplementedError('Game type %s is not supported.' %
                                  args.game_type)

    train_dir = osp.join('train_files', game_config_string)

    # Normalise checkpoint numbers in place; a negative number is replaced
    # by the last checkpoint number found in train_dir.
    for i in range(len(args.num_iters_ckpt)):
        x = int(args.num_iters_ckpt[i])
        if x < 0:
            x = utils.get_last_checkpoint_number(train_dir)
        args.num_iters_ckpt[i] = x

    ckpt_paths = [
        utils.get_checkpoint_path(train_dir, x) for x in args.num_iters_ckpt
    ]

    # The manager classes are loaded dynamically from (module, class) pairs.
    gmio_module = __import__(game_manager_io_module[0])
    gmio_class = getattr(gmio_module, game_manager_io_module[1])
    game_manager_io = gmio_class(**game_manager_io_kwargs)
    gm_module = __import__(game_manager_module[0])
    gm_class = getattr(gm_module, game_manager_module[1])

    # Index of the side that moves first in the current round.
    ip1 = 0

    # A single simulation budget is broadcast to every checkpoint; otherwise
    # exactly one value per checkpoint is required.
    args.max_simulations_per_move = [
        int(x) for x in args.max_simulations_per_move
    ]
    local_max_simulations_per_move = args.max_simulations_per_move
    if len(local_max_simulations_per_move) == 1:
        local_max_simulations_per_move = \
            local_max_simulations_per_move * len(args.num_iters_ckpt)
    elif len(local_max_simulations_per_move) != len(args.num_iters_ckpt):
        print('Number of arguments in max_simulations_per_move and ' +
              'num_iters_ckpt do not match. See --help for more information.')
        sys.exit()

    # Budgets below 1 fall back to the configured defaults; otherwise the
    # evaluation batch size is derived from the budget (budget / 100 + 1).
    local_eval_batch_size = [0] * len(local_max_simulations_per_move)
    for i in range(len(local_max_simulations_per_move)):
        if local_max_simulations_per_move[i] < 1:
            local_max_simulations_per_move[i] = gconf.max_simulations_per_move
            local_eval_batch_size[i] = gconf.eval_batch_size
        else:
            local_eval_batch_size[i] = \
                int(local_max_simulations_per_move[i] / 100.0) + 1

    print('Running %d challenges for each pair of checkpoints.' %
          gconf.num_challenges)

    # results[side, row ckpt, column ckpt, outcome]: the last axis is
    # 0 = win, 1 = loss, 2 = draw (see the updates after each game).
    results = np.zeros(
        (2, len(args.num_iters_ckpt), len(args.num_iters_ckpt), 3), np.int32)

    for ichallenge in range(gconf.num_challenges):
        # Without self play only pairs with ickpt1 < ickpt2 are played; with
        # self play the diagonal (ickpt1 == ickpt2) is included as well.
        iend_ckpt1 = len(args.num_iters_ckpt) - 1
        if args.include_self_play:
            iend_ckpt1 += 1
        for ickpt1 in range(iend_ckpt1):
            istart_ckpt2 = ickpt1 + 1
            if args.include_self_play:
                istart_ckpt2 -= 1
            for ickpt2 in range(istart_ckpt2, len(args.num_iters_ckpt)):
                # Per-match settings of the two contenders.
                chal_ckpt_nums = [
                    args.num_iters_ckpt[ickpt1], args.num_iters_ckpt[ickpt2]
                ]
                chal_ckpt_paths = [ckpt_paths[ickpt1], ckpt_paths[ickpt2]]
                chal_max_simulations_per_move = [
                    local_max_simulations_per_move[ickpt1],
                    local_max_simulations_per_move[ickpt2]
                ]
                chal_eval_batch_size = [
                    local_eval_batch_size[ickpt1],
                    local_eval_batch_size[ickpt2]
                ]

                print('=====================================================')
                print('Checkpoint %d vs. %d' % tuple(chal_ckpt_nums))
                print('=====================================================')

                # A fresh game manager is built per contender for every
                # match; the shared kwargs dict is mutated to point at the
                # contender's checkpoint.
                game_managers = []
                for i, ckpt in enumerate(chal_ckpt_paths):
                    print('Net %d' % (i + 1))
                    game_manager_kwargs['ckpt_path'] = ckpt
                    game_managers.append(gm_class(**game_manager_kwargs))
                print('=====================================================')
                print()

                state = game_managers[0].initial_state()
                # One search tree per contender.
                mctss = [
                    MCTS(game_managers[i], chal_max_simulations_per_move[i],
                         gconf.cpuct, gconf.virtual_loss, state,
                         gconf.root_noise_weight,
                         gconf.dirichlet_noise_param, chal_eval_batch_size[i],
                         game_manager_kwargs['tf_device'])
                    for i in range(len(game_managers))
                ]

                iplayer = ip1
                iplay = 0
                moves = []
                imove = None
                while not game_managers[iplayer].is_over(
                        state.state[np.newaxis])[0]:
                    # The first num_relaxed_turns plies use temperature 1.0.
                    if iplay < gconf.num_relaxed_turns:
                        turn_temperature = 1.0
                    else:
                        turn_temperature = gconf.move_temperature
                    imc = iplayer % len(mctss)

                    if args.show_middle_game:
                        game_manager_io.print_board(state, imove)

                    stats = mctss[imc].simulate(state,
                                                gconf.max_seconds_per_move)

                    print('Net %d to play:' % (iplayer + 1))
                    if args.show_mcts:
                        print('MCTS stats')
                        game_manager_io.print_stats(stats)
                        print()
                    if args.show_win_prob:
                        with tf.device(game_manager_kwargs['tf_device']):
                            _, value_prior = \
                                mctss[imc].game_manager.predict(
                                    tf.constant(state.state[np.newaxis],
                                                tf.float32))
                        # (value + 1) / 2 turns the predicted value into a
                        # probability; presumably the value lies in
                        # [-1, 1] - TODO confirm.
                        win_prob = (value_prior[0] + 1.0) / 2.0
                        print('Estimated win probability: %.03f\n' % win_prob)
                    if args.show_move_prob:
                        print('Move probabilities:')
                        game_manager_io.print_stats_on_board(stats, 1)
                        print()
                    if args.show_move_prob_temp:
                        print('Move probabilities with temperature ' +
                              '%.1e' % turn_temperature)
                        game_manager_io.print_stats_on_board(
                            stats, turn_temperature)
                        print()

                    imove, _ = mctss[imc].choose_move(turn_temperature)
                    moves.append((imove, iplayer))
                    state = game_managers[iplayer].update_state(state, imove)
                    iplayer = (iplayer + 1) % 2
                    # Every tree advances its root to the move just played.
                    for imc2 in range(len(mctss)):
                        mctss[imc2].update_root(imove, state)
                    iplay += 1

                game_manager_io.print_board(state, imove)
                iwinner = game_managers[iplayer].get_iwinner(
                    state.state[np.newaxis])[0]

                print('Checkpoint %d vs. %d result (match %d):' %
                      tuple(chal_ckpt_nums + [ichallenge + 1]))
                if iwinner < 0:
                    # A negative winner index is treated as a draw.
                    print('DRAW')
                    results[0, ickpt1, ickpt2, 2] += 1
                    results[1, ickpt1, ickpt2, 2] += 1
                elif iwinner == ip1:
                    print('Checkpoint %d won' % args.num_iters_ckpt[ickpt1])
                    results[ip1, ickpt1, ickpt2, 0] += 1
                    results[(ip1 + 1) % 2, ickpt2, ickpt1, 1] += 1
                else:
                    print('Checkpoint %d won' % args.num_iters_ckpt[ickpt2])
                    results[(ip1 + 1) % 2, ickpt2, ickpt1, 0] += 1
                    results[ip1, ickpt1, ickpt2, 1] += 1

        # Cumulative standings are printed after every round.
        print('\nNumber of wins of the players in the rows vs. the ' +
              'players in the columns. Missing results are draws.\n')
        print_results(np.sum(results[:, :, :, 0], axis=0), args.num_iters_ckpt)
        if args.show_results_by_player:
            print('Results when playing as player 1.\n')
            print_results(results[0, :, :, 0], args.num_iters_ckpt)
            print('Results when playing as player 2.\n')
            print_results(results[1, :, :, 0], args.num_iters_ckpt)

        # Alternate the first mover for the next round.
        ip1 = (ip1 + 1) % 2
env.reset() episode += 1 # Increment the number of games played iterations_without_progress = 0 best_total = 0 return games_scores if __name__ == '__main__': args = parser.parse_args() n_games = args.numberOfGames slow_down_factor = args.slowDownFactor model_name = args.modelName checkpoint_path = get_checkpoint_path(model_name=model_name) if os.path.isfile(checkpoint_path + ".index"): # Check to see if the model exists games_scores = make_agent_play_games(n_games, slow_down_factor) mean_score = np.mean(games_scores) std = np.std(games_scores) max_score = np.max(games_scores) print( "Max score {:.2f}\tMean score {:.2f}\tStandard deviation {:.2f} ". format(max_score, mean_score, std)) else: raise ValueError( 'Model file does not exist : a model file is required for testing')
def main():
    """Play one interactive game between the user(s) and/or the network.

    ``args.iuser`` selects who types moves: the code treats the value 2 as
    "every move is typed by a user"; otherwise only the player whose index
    equals ``args.iuser`` types moves and the other side is played by MCTS.
    """
    args = parse_args()

    # Validate the requested game mode before loading any configuration.
    valid_modes_list = utils.get_valid_game_modes()
    valid_modes_string = utils.get_valid_game_modes_string()
    if args.mode not in valid_modes_list:
        print('Invalid game mode informed. Please inform a mode with ' +
              '--mode=mode_name, where mode_name is one of the following ' +
              '{%s}' % valid_modes_string)
        sys.exit()

    gconf = utils.get_game_config(args.mode, 'test')

    if args.game_type == 'moku':
        (game_config_string, game_manager_module, game_manager_kwargs,
         game_manager_io_module, game_manager_io_kwargs) = \
            utils.generate_moku_manager_params(
                gconf.drop_mode, gconf.moku_size, gconf.board_size,
                args.gpu_id, gconf.num_res_layers, gconf.num_channels)
    else:
        raise NotImplementedError('Game type %s is not supported.' %
                                  args.game_type)

    train_dir = osp.join('train_files', game_config_string)
    ckpt_path = utils.get_checkpoint_path(train_dir, args.num_iters_ckpt)
    game_manager_kwargs['ckpt_path'] = ckpt_path

    # The manager classes are loaded dynamically from (module, class) pairs.
    gm_module = __import__(game_manager_module[0])
    gm_class = getattr(gm_module, game_manager_module[1])
    game_manager = gm_class(**game_manager_kwargs)
    gmio_module = __import__(game_manager_io_module[0])
    gmio_class = getattr(gmio_module, game_manager_io_module[1])
    game_manager_io = gmio_class(**game_manager_io_kwargs)

    state = game_manager.initial_state()
    # A single search tree is used for whichever side the computer plays.
    mctss = [
        MCTS(game_manager, gconf.max_simulations_per_move, gconf.cpuct,
             gconf.virtual_loss, state, gconf.root_noise_weight,
             gconf.dirichlet_noise_param, gconf.eval_batch_size,
             game_manager_kwargs['tf_device'])
    ]

    iplayer = 0
    iplay = 0
    moves = []
    last_played_imove = None
    # NOTE(review): `imove` is first assigned inside the loop, so the check
    # after the loop would raise NameError if the initial state were already
    # over - confirm that this cannot happen.
    while not game_manager.is_over(state.state[np.newaxis])[0]:
        imove = None
        # The first num_relaxed_turns plies use temperature 1.0.
        if iplay < gconf.num_relaxed_turns:
            turn_temperature = 1.0
        else:
            turn_temperature = gconf.move_temperature
        imc = iplayer % len(mctss)

        print('===== New turn =====')
        game_manager_io.print_board(state, last_played_imove)

        if args.iuser == 2 or iplayer == args.iuser:
            # User types a move
            imove = game_manager_io.get_input(state)
            if imove == GameManagerIO.IEXIT:
                break

        if imove == GameManagerIO.ICOMPUTER_MOVE or \
                (args.iuser != 2 and iplayer != args.iuser):
            # Computer chooses a move
            stats = mctss[imc].simulate(state, gconf.max_seconds_per_move)
            if args.show_mcts:
                print('MCTS stats')
                game_manager_io.print_stats(stats)
                print()
            # An explicit request for the computer's opinion always shows
            # the win probability and the move probabilities.
            if args.show_win_prob or imove == GameManagerIO.ICOMPUTER_MOVE:
                with tf.device(game_manager_kwargs['tf_device']):
                    _, value_prior = game_manager.predict(
                        tf.constant(state.state[np.newaxis], tf.float32))
                # (value + 1) / 2 turns the predicted value into a
                # probability; presumably the value lies in [-1, 1] -
                # TODO confirm.
                win_prob = (value_prior[0] + 1.0) / 2.0
                print('Estimated win probability: %.03f\n' % win_prob)
            if args.show_move_prob or imove == GameManagerIO.ICOMPUTER_MOVE:
                print('Move probabilities:')
                game_manager_io.print_stats_on_board(stats, 1)
                print()
            if args.show_move_prob_temp:
                print('Move probabilities with temperature ' +
                      '%.1e' % turn_temperature)
                game_manager_io.print_stats_on_board(stats, turn_temperature)
                print()
            if imove == GameManagerIO.ICOMPUTER_MOVE:
                # If user asked for computer prediction,
                # escape before actually choosing a move
                continue
            imove, _ = mctss[imc].choose_move(turn_temperature)

        # Apply the chosen move and hand the turn to the other player.
        moves.append((imove, iplayer))
        last_played_imove = imove
        state = game_manager.update_state(state, last_played_imove)
        iplayer = (iplayer + 1) % 2
        for imc2 in range(len(mctss)):
            mctss[imc2].update_root(last_played_imove, state)
        iplay += 1

    if imove == GameManagerIO.IEXIT:
        print('Game unfinished')
    else:
        game_manager_io.print_board(state, imove)
        # NOTE(review): the result of get_iwinner is used here without
        # indexing although it is called on a batch of one state - confirm
        # it can be compared and formatted as a scalar.
        iwinner = game_manager.get_iwinner(state.state[np.newaxis])
        if iwinner < 0:
            # A negative winner index is treated as a draw.
            print('DRAW')
        else:
            if args.iuser == 2:
                print('Player %d WON.' % (iwinner + 1))
            elif iwinner == args.iuser:
                print('You WON!')
            else:
                print('You LOST!')
def train_model(self, chpt_path=None):
    """Build the multi-GPU training graph and run the training loop.

    Each GPU tower gets its own generator ('g') and discriminator ('d')
    optimizers, while the classifier gradients of all towers are averaged
    and applied once. Training resumes from the latest checkpoint in
    ``self.get_save_dir()`` when one exists; otherwise the function built by
    ``self.make_init_fn(chpt_path)`` is used when available.

    Progress is logged roughly 2000 times, summaries are written roughly 200
    times and checkpoints are saved roughly 40 times over the whole run.

    Args:
        chpt_path: Optional checkpoint used for initialisation when no
            previous checkpoint of this run exists.

    Raises:
        AssertionError: If the classifier loss becomes NaN.
    """
    if chpt_path:
        print('Restoring from: {}'.format(chpt_path))

    # Logging / summary / checkpoint intervals. Clamped to at least 1 so
    # that runs shorter than 2000 / 200 / 40 steps no longer fail with
    # "integer division or modulo by zero".
    log_every = max(1, self.num_train_steps // 2000)
    summary_every = max(1, self.num_train_steps // 200)
    save_every = max(1, self.num_train_steps // 40)

    g = tf.Graph()
    with g.as_default():
        with tf.device('/cpu:0'):
            # Init global step
            self.global_step = tf.train.create_global_step()

            # Init data
            batch_queue = self.get_data_queue()

            # Optimizer for the classifier
            opt_c = self.optimizer_class()

            # Calculate the gradients for each model tower.
            train_ops_g = []
            train_ops_d = []
            tower_grads_c = []
            loss_c = 0.
            loss_g = 0.
            loss_d = 0.

            with tf.variable_scope(tf.get_variable_scope()):
                for i in range(self.num_gpus):
                    with tf.device('/gpu:%d' % i):
                        # LCI parameters are not shared across GPUs
                        opt_g = self.optimizer('g')
                        opt_d = self.optimizer('d')

                        with tf.name_scope('tower_{}'.format(i)) as scope:
                            l_g, l_d, l_c, grad_g, grad_d, grad_c, layers_d = \
                                self.build_model(batch_queue, opt_g, opt_d,
                                                 opt_c, scope, i)
                            loss_c += l_c
                            loss_g += l_g
                            loss_d += l_d

                            # Training ops for LCI
                            train_op_g = opt_g.apply_gradients(grad_g)
                            train_op_d = opt_d.apply_gradients(grad_d)
                            train_ops_d.append(train_op_d)
                            train_ops_g.append(train_op_g)

                            # Aggregate gradients for the transformation
                            # classifier
                            tower_grads_c.append(grad_c)

            # Average gradients for classifier from all GPUs
            grad_c = average_gradients(tower_grads_c)

            # Make summaries (grad_d / grad_g / layers_d are those of the
            # last tower built above).
            self.make_summaries(grad_d + grad_g + grad_c, layers_d)

            # Apply the gradients to adjust the shared variables. Weight
            # decay can optionally skip the 'gamma' and 'beta' variables.
            wd_vars = get_variables_to_train(self.train_scopes)
            if self.excl_gamma_wd:
                wd_vars = [v for v in wd_vars if 'gamma' not in v.op.name]
            if self.excl_beta_wd:
                wd_vars = [v for v in wd_vars if 'beta' not in v.op.name]
            print('WD variables: {}'.format([v.op.name for v in wd_vars]))

            train_op_c = opt_c.apply_gradients(grad_c,
                                               global_step=self.global_step,
                                               decay_var_list=wd_vars)

            # Group all updates to into a single train op.
            train_op = control_flow_ops.with_dependencies(
                [train_op_c] + train_ops_d + train_ops_g,
                loss_d + loss_g + loss_c)

            # Create a saver.
            saver = tf.train.Saver(tf.global_variables())
            init_fn = self.make_init_fn(chpt_path)

            # Build the summary operation from the last tower summaries.
            summary_op = tf.summary.merge(self.summaries)

            # Build an initialization operation to run below.
            init = tf.global_variables_initializer()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False),
                              graph=g)
            sess.run(init)

            # A checkpoint of this very run takes precedence over the
            # externally supplied initialisation.
            prev_ckpt = get_checkpoint_path(self.get_save_dir())
            if prev_ckpt:
                print('Restoring from previous checkpoint: {}'.format(
                    prev_ckpt))
                saver.restore(sess, prev_ckpt)
            elif init_fn:
                init_fn(sess)

            summary_writer = tf.summary.FileWriter(self.get_save_dir(),
                                                   sess.graph)
            init_step = sess.run(self.global_step)
            print('Start training at step: {}'.format(init_step))

            for step in range(init_step, self.num_train_steps):
                start_time = time.time()
                _, loss_value = sess.run([train_op, loss_c])
                duration = time.time() - start_time

                assert not np.isnan(
                    loss_value), 'Model diverged with loss = NaN'

                if step % log_every == 0:
                    num_examples_per_step = self.model.batch_size
                    examples_per_sec = num_examples_per_step / duration
                    sec_per_batch = duration
                    print(
                        '{}: step {}/{}, loss = {} ({} examples/sec; {} sec/batch)'
                        .format(datetime.now(), step, self.num_train_steps,
                                loss_value, examples_per_sec, sec_per_batch))
                    sys.stdout.flush()

                if step % summary_every == 0:
                    print('Writing summaries...')
                    summary_str = sess.run(summary_op)
                    summary_writer.add_summary(summary_str, step)

                # Save the model checkpoint periodically.
                if step % save_every == 0 or (step +
                                              1) == self.num_train_steps:
                    checkpoint_path = os.path.join(self.get_save_dir(),
                                                   'model.ckpt')
                    print('Saving checkpoint to: {}'.format(checkpoint_path))
                    saver.save(sess, checkpoint_path, global_step=step)
def train(restore_checkpoint, n_epochs, n_iterations_per_epoch,
          n_iterations_validation, data, model):
    """Train the model, validating after every epoch and keeping the best.

    The checkpoint is only overwritten when the mean validation loss of an
    epoch is lower than the best one seen so far in this call.

    Args:
        restore_checkpoint: Not read by this function; restoring is
            controlled by ``cfg.use_checkpoint``.
            NOTE(review): confirm whether this flag was meant to be honoured.
        n_epochs: Number of training epochs.
        n_iterations_per_epoch: Training batches per epoch.
        n_iterations_validation: Validation batches per epoch.
        data: Dataset object providing ``train`` / ``validation`` splits and
            ``image_axis_size``.
        model: Model exposing the ``X``, ``y`` and ``mask_with_labels``
            placeholders and the ``training_op``, ``loss`` and ``accuracy``
            ops.
    """
    with tf.Session() as sess:
        # np.inf instead of np.infty: the alias was removed in NumPy 2.0.
        best_loss_val = np.inf
        saver = tf.train.Saver()
        checkpoint_path = u.get_checkpoint_path()
        init = tf.global_variables_initializer()

        if cfg.use_checkpoint and tf.train.checkpoint_exists(checkpoint_path):
            saver.restore(sess, checkpoint_path)
        else:
            init.run()

        for epoch in range(n_epochs):
            # Training pass; tqdm renders the per-epoch progress bar.
            progress = tqdm.trange(1, n_iterations_per_epoch + 1)
            for _ in progress:
                X_batch, y_batch = data.train.next_batch(cfg.batch_size)
                # Run the training operation (the loss is fetched as before,
                # but its value is not used).
                sess.run(
                    [model.training_op, model.loss],
                    feed_dict={
                        model.X:
                        X_batch.reshape([
                            -1, data.image_axis_size, data.image_axis_size, 1
                        ]),
                        model.y:
                        y_batch,
                        model.mask_with_labels:
                        True
                    })

            # At the end of each epoch,
            # measure the validation loss and accuracy:
            loss_vals = []
            acc_vals = []
            for iteration in range(1, n_iterations_validation + 1):
                X_batch, y_batch = data.validation.next_batch(cfg.batch_size)
                loss_val, acc_val = sess.run(
                    [model.loss, model.accuracy],
                    feed_dict={
                        model.X:
                        X_batch.reshape([
                            -1, data.image_axis_size, data.image_axis_size, 1
                        ]),
                        model.y:
                        y_batch
                    })
                loss_vals.append(loss_val)
                acc_vals.append(acc_val)
                print("\rEvaluating the model: {}/{} ({:.1f}%)".format(
                    iteration, n_iterations_validation,
                    iteration * 100 / n_iterations_validation),
                      end=" " * 10)

            loss_val = np.mean(loss_vals)
            acc_val = np.mean(acc_vals)
            improved = loss_val < best_loss_val
            print("\rEpoch: {} Val accuracy: {:.4f}% Loss: {:.6f}{}".format(
                epoch + 1, acc_val * 100, loss_val,
                " (improved)" if improved else ""))

            # And save the model if it improved:
            if improved:
                saver.save(sess, checkpoint_path)
                best_loss_val = loss_val
def test_classifier_multicrop(self, ckpt_dir):
    """Evaluate the linear classifier heads with ten-view test augmentation.

    Every test image is turned into five crops (centre and the four
    corners) plus their horizontally flipped copies. The softmax outputs of
    all views are averaged before taking the arg-max.

    Args:
        ckpt_dir: Directory whose latest checkpoint (as resolved by
            ``get_checkpoint_path``) is restored before evaluation.

    Returns:
        NumPy array with one accuracy per entry of ``self.model.feats_IDs``:
        the number of correct predictions summed over
        ``self.num_eval_steps`` batches, divided by
        ``self.data_generator.num_samples``.
    """
    print('Restoring from: {}'.format(ckpt_dir))

    g = tf.Graph()
    with g.as_default():
        # Get test batches
        batch_queue = self.get_data_queue_multicrop()
        imgs_test, labels_test = batch_queue.get_next()
        imgs_test.set_shape((self.model.batch_size, ) +
                            self.pre_processor.src_shape + (3, ))
        print('imgs_test: {}'.format(imgs_test.get_shape().as_list()))

        src_h, src_w = (self.pre_processor.src_shape[0],
                        self.pre_processor.src_shape[1])
        crop_h, crop_w = (self.pre_processor.target_shape[0],
                          self.pre_processor.target_shape[1])

        # Offsets of the centre crop. The column offset is computed from the
        # width axis; it used to reuse the row offset, which is only correct
        # for square source and target shapes.
        top = int((src_h - crop_h) / 2)
        left = int((src_w - crop_w) / 2)

        # Extract crops: centre, upper-left, upper-right, bottom-left,
        # bottom-right.
        imgs_rcrop = [
            imgs_test[:, top:top + crop_h, left:left + crop_w, :],
            imgs_test[:, :crop_h, :crop_w, :],
            imgs_test[:, :crop_h, -crop_w:, :],
            imgs_test[:, -crop_h:, :crop_w, :],
            imgs_test[:, -crop_h:, -crop_w:, :],
        ]
        imgs_rcrop_stack = tf.concat(imgs_rcrop, 0)

        # Add flipped crops (axis 2 is the width axis).
        imgs_rcrop_stack = tf.concat(
            [imgs_rcrop_stack,
             tf.reverse(imgs_rcrop_stack, [2])], 0)
        # Views per image: every crop plus its flipped copy (10 here).
        num_views = 2 * len(imgs_rcrop)

        preds_rcrop_stack = self.model.linear_classifiers(
            imgs_rcrop_stack, self.data_generator.num_classes, training=False)

        # One "number of correct predictions in this batch" op per head.
        num_corrects_list = []
        for preds_stack, _ in zip(preds_rcrop_stack, self.model.feats_IDs):
            # Regroup the stacked batch into [view, batch, class] and
            # average the per-view class probabilities.
            stack_preds = tf.stack(tf.split(preds_stack, num_views))
            stack_preds = tf.nn.softmax(stack_preds, axis=-1)
            preds = tf.reduce_mean(stack_preds, 0)
            preds_test = tf.argmax(preds, 1)
            correct_preds = tf.equal(preds_test, labels_test)
            num_correct = tf.reduce_sum(tf.cast(correct_preds, tf.float32))
            num_corrects_list.append(num_correct)

        init = tf.global_variables_initializer()

        # The context manager closes the session (and frees its resources)
        # even when restoring or evaluating raises.
        with tf.Session() as sess:
            sess.run(init)

            prev_ckpt = get_checkpoint_path(ckpt_dir)
            print('Restoring from previous checkpoint: {}'.format(prev_ckpt))
            saver = tf.train.Saver(tf.global_variables())
            saver.restore(sess, prev_ckpt)

            n_cor_np = np.zeros([len(self.model.feats_IDs)])
            for _ in range(self.num_eval_steps):
                n_cor_np += sess.run(num_corrects_list)

        acc = n_cor_np / self.data_generator.num_samples
        print('Accuracy: {}'.format(acc))

    return acc
def test_classifier_multi_crop(self, ckpt_dir):
    """Evaluate the video classifier one sample at a time over several crops.

    The crops of each sample (optionally extended with flipped copies) are
    classified individually; their softmax outputs are averaged before
    taking the arg-max.

    Side effect: sets ``self.batch_size`` to 1.

    Args:
        ckpt_dir: Directory whose latest checkpoint (as resolved by
            ``get_checkpoint_path``) is restored before evaluation.

    Returns:
        A one-element list containing the overall accuracy.
    """
    print('Restoring from: {}'.format(ckpt_dir))
    self.batch_size = 1

    # Progress is reported about ten times. Clamped to at least 1 so that
    # datasets with fewer than 10 samples no longer fail with "integer
    # division or modulo by zero".
    report_every = max(1, self.dataset.num_samples // 10)

    g = tf.Graph()
    with g.as_default():
        tf.compat.v1.random.set_random_seed(123)

        # Get test batches
        batch_queue = self.get_data_queue_multi_crop()
        vids_test, labels_test, ex_test = batch_queue.get_next()
        vids_test = tf.reshape(vids_test,
                               (-1, ) + self.pre_processor.out_shape)

        # Center crop the videos
        vids_test = self.pre_processor.center_crop(vids_test)
        if self.flip_aug:
            vids_test = tf.concat(
                [vids_test, tf.reverse(vids_test, [-2])], 0)

        # Get predictions: each crop is passed through the model as its own
        # batch of one.
        preds_test = tf.map_fn(
            lambda x: self.model.model(tf.expand_dims(x, 0),
                                       self.dataset.num_classes,
                                       training=False), vids_test)
        preds_test = tf.reduce_mean(tf.nn.softmax(preds_test),
                                    0,
                                    keep_dims=False)
        print('Preds test: {}'.format(preds_test.get_shape().as_list()))
        preds_test = tf.argmax(preds_test, 1)

        init = tf.compat.v1.global_variables_initializer()

        # The context manager closes the session (and frees its resources)
        # even when restoring or evaluating raises.
        with tf.compat.v1.Session() as sess:
            sess.run(init)

            prev_ckpt = get_checkpoint_path(ckpt_dir)
            print('Restoring from previous checkpoint: {}'.format(prev_ckpt))
            saver = tf.train.Saver(tf.compat.v1.global_variables())
            saver.restore(sess, prev_ckpt)

            preds_list = []
            labels_list = []
            for step in range(self.dataset.num_samples):
                preds_np, labels_np = sess.run([preds_test, labels_test])
                preds_list.append(preds_np)
                labels_list.append(labels_np)

                if step % report_every == 0:
                    # NOTE(review): concatenated predictions are compared
                    # with the raw list of label arrays; this presumably
                    # relies on each label being a scalar - confirm the
                    # shapes line up element-wise.
                    acc = np.mean(
                        np.concatenate(preds_list, 0) == labels_list)
                    print('Evaluation step {}/{} Mini-Batch Acc: {}'.format(
                        step, self.dataset.num_samples, acc))

        print('Len preds: {}'.format(len(labels_list)))
        acc = np.mean(np.concatenate(preds_list, 0) == labels_list)
        print('Accuracy: {}'.format(acc))
        print('preds: {}'.format(np.concatenate(preds_list, 0).shape))

    return [acc]
def train_model(self, chpt_path):
    """Build the multi-tower training graph and run the training loop.

    One model tower is built per GPU (``self.num_gpus``); the tower
    gradients are averaged and applied together with an exponential
    moving average update of the trainable variables. If a checkpoint
    exists in ``self.get_save_dir()`` training resumes from it; otherwise
    the function returned by ``self.make_init_fn(chpt_path)`` is used to
    initialise the variables when it is not ``None``.

    Every 50 steps throughput is printed, every 500 steps summaries are
    written, and every 5000 steps (and on the final step) a checkpoint is
    saved to ``<save_dir>/model.ckpt``.

    Args:
        chpt_path: Checkpoint path forwarded to ``self.make_init_fn``.

    Raises:
        AssertionError: If the loss becomes NaN.
    """
    print('Restoring from: {}'.format(chpt_path))
    g = tf.Graph()
    with g.as_default():
        with tf.device('/cpu:0'):
            # Init global step
            self.global_step = slim.create_global_step()

            batch_queue = self.get_data_queue()
            opt = self.optimizer()

            # Calculate the gradients for each model tower.
            tower_grads = []
            loss = None
            layers = None
            with tf.variable_scope(tf.get_variable_scope()):
                for i in range(self.num_gpus):
                    with tf.device('/gpu:%d' % i):
                        with tf.name_scope('tower_{}'.format(i)) as scope:
                            loss, grads, layers = self.build_model(
                                batch_queue, i, opt, scope)
                            tower_grads.append(grads)
            grad = self.average_gradients(tower_grads)

            # Make summaries (uses the layers of the last tower built).
            self.make_summaries(grad, layers)

            # Apply the gradients to adjust the shared variables.
            apply_gradient_op = opt.apply_gradients(
                grad, global_step=self.global_step)

            # Track the moving averages of all trainable variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                self.moving_avgs_decay, self.global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())

            # Group all updates into a single train op.
            apply_gradient_op = tf.group(apply_gradient_op,
                                         variables_averages_op)
            train_op = control_flow_ops.with_dependencies(
                [apply_gradient_op], loss)

            # Create a saver.
            saver = tf.train.Saver(tf.global_variables())
            init_fn = self.make_init_fn(chpt_path)

            # Build the summary operation from the collected summaries.
            summary_op = tf.summary.merge(self.summaries)

            # Build an initialization operation to run below.
            init = tf.global_variables_initializer()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False),
                              graph=g)
            sess.run(init)

            # A checkpoint in the save directory takes precedence over the
            # externally supplied initialisation.
            prev_ckpt = get_checkpoint_path(self.get_save_dir())
            if prev_ckpt:
                print('Restoring from previous checkpoint: {}'.format(
                    prev_ckpt))
                saver.restore(sess, prev_ckpt)
            elif init_fn:
                init_fn(sess)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)

            summary_writer = tf.summary.FileWriter(self.get_save_dir(),
                                                   sess.graph)
            init_step = sess.run(self.global_step)
            print('Start training at step: {}'.format(init_step))

            # Wrap the session for debugging exactly once. Doing this
            # inside the loop would nest a new wrapper around the previous
            # one (and re-register the filter) on every training step.
            if self.debug:
                sess = tf_debug.LocalCLIDebugWrapperSession(sess)
                sess.add_tensor_filter("has_inf_or_nan",
                                       tf_debug.has_inf_or_nan)

            checkpoint_path = os.path.join(self.get_save_dir(), 'model.ckpt')

            try:
                for step in range(init_step, self.num_train_steps):
                    start_time = time.time()
                    _, loss_value = sess.run([train_op, loss])
                    duration = time.time() - start_time

                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'

                    if step % 50 == 0:
                        num_examples_per_step = (self.model.batch_size *
                                                 self.num_gpus)
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = duration / self.num_gpus
                        print(
                            '{}: step {}/{}, loss = {} ({} examples/sec; {} sec/batch)'
                            .format(datetime.now(), step,
                                    self.num_train_steps, loss_value,
                                    examples_per_sec, sec_per_batch))
                        sys.stdout.flush()

                    if step % 500 == 0:
                        print('Writing summaries...')
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save the model checkpoint periodically.
                    if step % 5000 == 0 or (step + 1) == self.num_train_steps:
                        print('Saving checkpoint to: {}'.format(
                            checkpoint_path))
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                # Flush pending summary events even if training aborts.
                summary_writer.close()
from Preprocessor import Preprocessor
from train.SDNetTrainer import SDNetTrainer
from datasets.STL10 import STL10
from models.SDNet import SDNet
from utils import get_checkpoint_path

# Input shape shared by the model and the preprocessor.
target_shape = [96, 96, 3]

# Run the cross-validated finetuning once per fold (folds 0..9). Model,
# dataset, preprocessor and trainer are rebuilt from scratch for each fold.
for fold in range(10):
    model = SDNet(
        num_layers=4,
        batch_size=200,
        target_shape=target_shape,
        pool5=False,
    )
    data = STL10()
    preprocessor = Preprocessor(target_shape=target_shape)
    trainer = SDNetTrainer(
        model=model,
        dataset=data,
        pre_processor=preprocessor,
        num_epochs=400,
        tag='baseline',
        lr_policy='linear',
        optimizer='adam',
    )

    # Look up the checkpoint in the trainer's save directory and finetune
    # from it on the current fold.
    chpt_path = get_checkpoint_path(trainer.get_save_dir())
    trainer.finetune_cv(
        chpt_path,
        num_conv2train=5,
        num_conv2init=5,
        fold=fold,
    )
def train_model(self, chpt_path):
    """Build the generator/discriminator graph and run adversarial training.

    Each training step runs ``self.n_disc_steps`` discriminator updates
    followed by one generator update. Both ``apply_gradients`` calls
    increment ``self.global_step``, so the global step advances by
    ``1 + self.n_disc_steps`` per training step; the resume step is
    derived from the restored global step accordingly.

    If a checkpoint exists in ``self.get_save_dir()`` training resumes
    from it; otherwise the function returned by
    ``self.make_init_fn(chpt_path)`` is used when it is not ``None``.

    Progress is printed about 2000 times, summaries are written about 200
    times and checkpoints are saved about 40 times over
    ``self.num_train_steps`` (plus a checkpoint on the final step).

    Args:
        chpt_path: Checkpoint path forwarded to ``self.make_init_fn``.

    Raises:
        AssertionError: If the generator loss becomes NaN.
    """
    print('Restoring from: {}'.format(chpt_path))
    g = tf.Graph()
    with g.as_default():
        with tf.device('/cpu:0'):
            tf.random.set_random_seed(123)

            # Init global step
            self.global_step = tf.train.create_global_step()

            batch_queue = self.get_train_data_queue()
            opt_d = self.optimizer(self.opt_d)
            opt_g = self.optimizer(self.opt_g)

            # Build generator and discriminator on the first GPU.
            with tf.variable_scope(tf.get_variable_scope()):
                with tf.device('/gpu:%d' % 0):
                    with tf.name_scope('gen') as scope:
                        loss_g, grad_g, layers_g = self.build_generator(
                            batch_queue, opt_g, scope)
                    with tf.name_scope('disc') as scope:
                        loss_d, grad_d, layers_d = self.build_discriminator(
                            batch_queue, opt_d, scope)

            # Make summaries
            self.make_summaries(grad_d + grad_g, layers_d)

            # Apply the gradients to adjust the shared variables.
            apply_gradient_op_d = opt_d.apply_gradients(
                grad_d, global_step=self.global_step)
            apply_gradient_op_g = opt_g.apply_gradients(
                grad_g, global_step=self.global_step)

            # Track the moving averages of all trainable variables.
            variable_averages = tf.train.ExponentialMovingAverage(
                self.moving_avgs_decay, self.global_step)
            variables_averages_op = variable_averages.apply(
                tf.trainable_variables())

            # Group all updates into a single train op per network.
            apply_gradient_op_d = tf.group(apply_gradient_op_d,
                                           variables_averages_op)
            apply_gradient_op_g = tf.group(apply_gradient_op_g,
                                           variables_averages_op)
            train_op_d = control_flow_ops.with_dependencies(
                [apply_gradient_op_d], loss_d)
            train_op_g = control_flow_ops.with_dependencies(
                [apply_gradient_op_g], loss_g)

            # Create a saver.
            saver = tf.train.Saver(tf.global_variables())
            init_fn = self.make_init_fn(chpt_path)

            # Build the summary operation from the collected summaries.
            summary_op = tf.summary.merge(self.summaries)

            # Build an initialization operation to run below.
            init = tf.global_variables_initializer()

            # Start running operations on the Graph.
            sess = tf.Session(config=tf.ConfigProto(
                allow_soft_placement=True, log_device_placement=False),
                              graph=g)
            sess.run(init)

            # A checkpoint in the save directory takes precedence over the
            # externally supplied initialisation.
            prev_ckpt = get_checkpoint_path(self.get_save_dir())
            if prev_ckpt:
                print('Restoring from previous checkpoint: {}'.format(
                    prev_ckpt))
                saver.restore(sess, prev_ckpt)
            elif init_fn:
                init_fn(sess)

            # Start the queue runners.
            tf.train.start_queue_runners(sess=sess)

            summary_writer = tf.summary.FileWriter(self.get_save_dir(),
                                                   sess.graph)

            # Convert the global step back into a training-loop step.
            # Floor division keeps this an integer; true division would
            # produce a float that `range` rejects on Python 3.
            init_step = int(sess.run(self.global_step))
            init_step //= (1 + self.n_disc_steps)
            print('Start training at step: {}'.format(init_step))

            # Integer reporting intervals, clamped to at least 1 so short
            # runs neither divide by zero nor use fractional intervals.
            log_every = max(1, self.num_train_steps // 2000)
            summary_every = max(1, self.num_train_steps // 200)
            save_every = max(1, self.num_train_steps // 40)

            checkpoint_path = os.path.join(self.get_save_dir(), 'model.ckpt')

            try:
                for step in range(init_step, self.num_train_steps):
                    start_time = time.time()

                    # Discriminator updates first, then one generator
                    # update; only the generator loss is reported below.
                    for _ in range(self.n_disc_steps):
                        sess.run([train_op_d, loss_d])
                    _, loss_value = sess.run([train_op_g, loss_g])

                    duration = time.time() - start_time

                    assert not np.isnan(
                        loss_value), 'Model diverged with loss = NaN'

                    if step % log_every == 0:
                        num_examples_per_step = self.model.batch_size
                        examples_per_sec = num_examples_per_step / duration
                        sec_per_batch = duration
                        print(
                            '{}: step {}/{}, loss = {} ({} examples/sec; {} sec/batch)'
                            .format(datetime.now(), step,
                                    self.num_train_steps, loss_value,
                                    examples_per_sec, sec_per_batch))
                        sys.stdout.flush()

                    if step % summary_every == 0:
                        print('Writing summaries...')
                        summary_str = sess.run(summary_op)
                        summary_writer.add_summary(summary_str, step)

                    # Save the model checkpoint periodically.
                    if (step % save_every == 0
                            or (step + 1) == self.num_train_steps):
                        print('Saving checkpoint to: {}'.format(
                            checkpoint_path))
                        saver.save(sess, checkpoint_path, global_step=step)
            finally:
                # Flush pending summary events even if training aborts.
                summary_writer.close()