def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="DQN"):
    """Train the DQN model on a goal-based (dict observation) environment.

    The policy is fed the concatenation of the ``observation`` and
    ``desired_goal`` entries of the environment observation.  Transitions are
    accumulated per episode and handed to the replay buffer as a single list
    once the episode terminates.

    :param total_timesteps: (int) number of environment steps to run
    :param callback: (callable) optional; called as ``callback(locals(), globals())``
        at the end of every episode
    :param seed: (int) seed forwarded to ``self._setup_learn``
    :param log_interval: (int) dump the tabular log every ``log_interval`` episodes;
        ``None`` disables it
    :param tb_log_name: (str) run name handed to ``TensorboardWriter``
    :return: ``self``
    """
    # NOTE: local variable names are part of the callback contract because the
    # callback receives ``locals()``; do not rename them casually.
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        # Create the replay buffer
        if self.prioritized_replay:
            self.replay_buffer = PrioritizedReplayBuffer(self.buffer_size, alpha=self.prioritized_replay_alpha)
            # Fall back to annealing beta over the whole run when no explicit
            # horizon is configured; otherwise honour the configured horizon.
            if self.prioritized_replay_beta_iters is None:
                prioritized_replay_beta_iters = total_timesteps
            else:
                prioritized_replay_beta_iters = self.prioritized_replay_beta_iters
            self.beta_schedule = LinearSchedule(prioritized_replay_beta_iters,
                                                initial_p=self.prioritized_replay_beta0,
                                                final_p=1.0)
        else:
            self.replay_buffer = ReplayBuffer(self.buffer_size, hindsight=self.hindsight)
            self.beta_schedule = None

        # Create the schedule for exploration starting from 1.
        self.exploration = LinearSchedule(schedule_timesteps=int(self.exploration_fraction * total_timesteps),
                                          initial_p=1.0,
                                          final_p=self.exploration_final_eps)

        episode_rewards = [0.0]
        episode_trans = []
        episode_replays = []
        # Sliding window over the outcome of the last 100 episodes.
        episode_success = [0] * 100
        full_obs = self.env.reset()
        part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))
        reset = True
        self.episode_reward = np.zeros((1,))

        for step in range(total_timesteps):
            # Take action and update exploration to the newest value
            kwargs = {}
            if not self.param_noise:
                update_eps = self.exploration.value(step)
                update_param_noise_threshold = 0.
            else:
                update_eps = 0.
                # Compute the threshold such that the KL divergence between perturbed and non-perturbed
                # policy is comparable to eps-greedy exploration with eps = exploration.value(t).
                # See Appendix C.1 in Parameter Space Noise for Exploration, Plappert et al., 2017
                # for detailed explanation.
                update_param_noise_threshold = \
                    -np.log(1. - self.exploration.value(step) +
                            self.exploration.value(step) / float(self.env.action_space.n))
                kwargs['reset'] = reset
                kwargs['update_param_noise_threshold'] = update_param_noise_threshold
                kwargs['update_param_noise_scale'] = True
            with self.sess.as_default():
                action = self.act(np.array(part_obs)[None], update_eps=update_eps, **kwargs)[0]
            env_action = action
            reset = False
            new_obs, rew, done, _ = self.env.step(env_action)

            # Buffer the transition; the whole episode is stored in the replay
            # buffer at once when it ends (see below).
            episode_replays.append((full_obs, action, rew, new_obs, float(done)))
            episode_trans.append((full_obs, action, rew, new_obs))
            full_obs = new_obs
            part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))

            if writer is not None:
                ep_rew = np.array([rew]).reshape((1, -1))
                ep_done = np.array([done]).reshape((1, -1))
                self.episode_reward = total_episode_reward_logger(self.episode_reward, ep_rew, ep_done, writer,
                                                                  step)

            episode_rewards[-1] += rew
            if done:
                # An episode counts as a success when the final achieved goal
                # equals the desired goal.
                if np.array_equal(full_obs['achieved_goal'], full_obs['desired_goal']):
                    episode_success.append(1.)
                else:
                    episode_success.append(0.)
                episode_success = episode_success[1:]
                if not isinstance(self.env, VecEnv):
                    full_obs = self.env.reset()
                    part_obs = np.concatenate((full_obs['observation'], full_obs['desired_goal']))
                self.replay_buffer.add(episode_replays)
                if callback is not None:
                    callback(locals(), globals())
                episode_rewards.append(0.0)
                episode_trans = []
                episode_replays = []
                reset = True

            if step > self.learning_starts and step % self.train_freq == 0:
                # Minimize the error in Bellman's equation on a batch sampled from replay buffer.
                if self.prioritized_replay:
                    experience = self.replay_buffer.sample(self.batch_size, beta=self.beta_schedule.value(step))
                    (obses_t, actions, rewards, obses_tp1, dones, weights, batch_idxes) = experience
                else:
                    obses_t, actions, rewards, obses_tp1, dones = self.replay_buffer.sample(self.batch_size)
                    weights, batch_idxes = np.ones_like(rewards), None

                if writer is not None:
                    # run loss backprop with summary, but once every 100 steps save the metadata
                    # (memory, compute time, ...)
                    if (1 + step) % 100 == 0:
                        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                        run_metadata = tf.RunMetadata()
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess,
                                                              options=run_options, run_metadata=run_metadata)
                        writer.add_run_metadata(run_metadata, 'step%d' % step)
                    else:
                        summary, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1,
                                                              dones, weights, sess=self.sess)
                    writer.add_summary(summary, step)
                else:
                    _, td_errors = self._train_step(obses_t, actions, rewards, obses_tp1, obses_tp1, dones,
                                                    weights, sess=self.sess)

                if self.prioritized_replay:
                    new_priorities = np.abs(td_errors) + self.prioritized_replay_eps
                    self.replay_buffer.update_priorities(batch_idxes, new_priorities)

                # Metric
                obses_beg, obses_step, obses_fin, dist = self.replay_buffer.mtr_sample(self.batch_size)
                self.mtr_train(obses_beg, obses_step, obses_fin, dist)

            if step > self.learning_starts and step % self.target_network_update_freq == 0:
                # Update target network periodically.
                self.update_target(sess=self.sess)

            # The last entry of episode_rewards belongs to the running episode,
            # hence the [-101:-1] window over finished episodes only.
            if len(episode_rewards[-101:-1]) == 0:
                mean_100ep_reward = -np.inf
            else:
                mean_100ep_reward = round(float(np.mean(episode_rewards[-101:-1])), 1)

            num_episodes = len(episode_rewards)
            if self.verbose >= 1 and done and log_interval is not None and len(episode_rewards) % log_interval == 0:
                logger.record_tabular("steps", step)
                logger.record_tabular("episodes", num_episodes)
                logger.record_tabular("mean 100 episode reward", mean_100ep_reward)
                logger.record_tabular("100 episode success", np.mean(episode_success))
                logger.record_tabular("% time spent exploring", int(100 * self.exploration.value(step)))
                logger.dump_tabular()

    return self
config_proto.graph_options.rewrite_options.dependency_optimization = ( rewriter_config_pb2.RewriterConfig.OFF) config_proto.graph_options.rewrite_options.layout_optimizer = ( rewriter_config_pb2.RewriterConfig.OFF) sess = tf.Session(config=config_proto) sess.run(tf.global_variables_initializer()) X_, Y_ = sess.run([X, Y]) X_Y_ = X_ + Y_ _X_Y = _X + _Y tot_time = 0 for i in range(10): print(i) run_metadata = tf.RunMetadata() run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE, output_partition_graphs=True) st = time.time() sess.run(Z1 + Z2 + Z3, {_i: i_ for _i, i_ in zip(_X_Y, X_Y_)}, options=run_options, run_metadata=run_metadata) tot_time += time.time() - st if i >= 2: jsonObj = MessageToJson(run_metadata) with open('%s/metadata_%d.json' % (logPath, i), 'w') as outfile: json.dump(jsonObj, outfile) trace = timeline.Timeline(step_stats=run_metadata.step_stats)
def train(self, rnn_config, l_data_config, train_config, info_config, run):
    """Train the RNN over one or more sessions and return the metric results.

    A "session" is one TensorFlow graph/session pair.  In ``'classic'`` mode a
    single session is used; in ``'inc_lengths'`` mode one session is run per
    entry of ``train_config['mode']['in_seq_len']`` and the trainable weights
    are carried from one session to the next through a temporary checkpoint.

    Args:
        rnn_config: RNN configuration; stored on ``self`` and passed to
            ``set_rnn_config``.
        l_data_config: Dataset configuration used to load data and build the model.
        train_config: Training configuration (mode, learning rate, tau,
            algorithm, pretraining, task id, ...).
        info_config: Reporting configuration (timer, tensorboard, profiling,
            weight saving, cell access, ...).
        run: Run identifier forwarded to ``self.save_weight_probs``.

    Returns:
        ``self.rnn.t_metrics.result_dict`` of the last session.

    Raises:
        Exception: If ``train_config['mode']['name']`` is neither
            ``'inc_lengths'`` nor ``'classic'``.
    """
    self.rnn_config = rnn_config
    self.info_config = info_config
    self.train_config = train_config
    set_rnn_config(rnn_config)
    set_info_config(info_config)

    self.timer = Timer(info_config['timer']['enabled'])
    print_config(rnn_config, train_config, l_data_config)
    temp_model_path = '../models/temp' + info_config['filename'] + '_' + str(train_config['task_id'])
    pretrained_model_path = '../tr_models/' + str(train_config['pretraining']['path'])

    if train_config['mode']['name'] == 'inc_lengths':
        n_sessions = len(train_config['mode']['in_seq_len'])
    elif train_config['mode']['name'] == 'classic':
        n_sessions = 1
    else:
        raise Exception('training mode not understood')

    self.timer.start()
    set_train_config(train_config)

    # Sessions refer to training with different architectures. If one RNN is used throughout the training
    # process then only one session is created. Training with incremental sequence lengths for example
    # requires multiple RNNs, one for each sequence length. Evaluation datasets (validation and test) are
    # always evaluated on a fixed RNN, only the RNN structure used for the training set varies.
    # current_epoch stores the total amount of epochs and epoch the epoch within a session.
    current_epoch = 0
    tau = self.train_config['tau']
    learning_rate = self.train_config['learning_rate']
    best_weight_probs_dict = None

    # (info_config['tensorboard'] flag, attribute of self.rnn holding the summary op).
    # The attribute is only looked up when its flag is enabled.
    summary_groups = (('weights', 'weight_summaries'),
                      ('gradients', 'gradient_summaries'),
                      ('results', 't_metric_summaries'),
                      ('acts', 'act_summaries'))

    for session_idx in range(n_sessions):
        tf.reset_default_graph()
        # min_error is read for the (currently disabled) early-stopping criterion.
        if self.train_config['mode']['name'] == 'inc_lengths':
            max_epochs = self.train_config['mode']['max_epochs'][session_idx]
            min_error = self.train_config['mode']['min_errors'][session_idx]
            self.create_modificated_model(l_data_config, session_idx)
        elif self.train_config['mode']['name'] == 'classic':
            self.data_dict = load_dataset(l_data_config)
            l_data = LabeledData(l_data_config, self.data_dict)
            self.create_rnn(l_data, l_data_config)
            max_epochs = self.train_config['mode']['max_epochs']
            min_error = self.train_config['mode']['min_error']

        self.timer.restart('Graph creation')

        # Saver is used for restoring weights for new session if more than one is used for training
        model_saver = tf.train.Saver(var_list=tf.trainable_variables())
        with tf.Session() as sess:
            if info_config['profiling']['enabled']:
                options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            else:
                options = tf.RunOptions(trace_level=tf.RunOptions.NO_TRACE)
            run_metadata = tf.RunMetadata()
            writer = tf.summary.FileWriter(
                info_config['tensorboard']['path'] + str(self.train_config['task_id']))
            # One writer is opened per session, so it is closed per session
            # as well (also when training aborts with an exception).
            try:
                sess.run(tf.global_variables_initializer())

                if session_idx != 0:
                    model_saver.restore(sess, temp_model_path)
                elif self.train_config['pretraining']['enabled'] == True:
                    self.optimistic_restore(sess, pretrained_model_path)
                sess.run(self.rnn.init_op)

                self.timer.restart('Intialization')

                # Loading datasets into GPU (via tf.Variables)
                for key in self.data_dict:
                    sess.run(self.l_data.data[key]['load'],
                             feed_dict={
                                 self.l_data.data[key]['x_ph']: self.data_dict[key]['x'],
                                 self.l_data.data[key]['y_ph']: self.data_dict[key]['y']
                             })

                self.timer.restart('Loading data')

                # NOTE(review): traces accumulate over all epochs of a session and
                # every epoch re-writes all of them; confirm whether only the
                # current epoch's traces are wanted.
                traces = list()

                for epoch in range(max_epochs):
                    if self.info_config['gradient']['evaluate']:
                        self.save_gradient_variance(sess, epoch, tau)
                        quit()

                    # Evaluate performance on the different datasets and print some results on console
                    if current_epoch % info_config['calc_performance_every'] == 0:
                        self.rnn.t_metrics.retrieve_results(sess, current_epoch, tau)
                        self.rnn.t_metrics.print(session_idx)

                    # Parentheses matter: `current_epoch + 1 % n` would evaluate
                    # `1 % n` first and the periodic save would never trigger.
                    if (current_epoch + 1) % info_config['save_weights']['save_every'] == 0:
                        self.save_weight_probs(info_config['save_weights']['path'], current_epoch, run,
                                               sess.run(self.rnn.get_weights_op))

                    if info_config['save_weights']['save_best']:
                        if self.rnn.t_metrics.best_va['is_current']:
                            best_weight_probs_dict = sess.run(self.rnn.get_weights_op)

                    self.timer.restart('Metrics')

                    # Optionally store tensorboard summaries
                    if info_config['tensorboard']['enabled'] \
                            and current_epoch % info_config['tensorboard']['period'] == 0:
                        for flag, op_name in summary_groups:
                            if info_config['tensorboard'][flag]:
                                summary = sess.run(getattr(self.rnn, op_name),
                                                   feed_dict={
                                                       self.rnn.tau: (tau, ),
                                                       self.l_data.batch_idx: 0,
                                                       self.rnn.is_training: False
                                                   })
                                writer.add_summary(summary, current_epoch)

                    self.timer.restart('Tensorboard')

                    # Train for one full epoch. First shuffle to create new minibatches from the given data
                    # and then do a training step for each minibatch.
                    # Also anneal learning rate and tau if necessary
                    if (current_epoch + 1) % self.train_config['learning_rate_tau'] == 0:
                        learning_rate /= 2

                    sess.run(self.l_data.data['tr']['shuffle'])

                    algorithm = self.train_config['algorithm']
                    uses_c_ar = 'c_ar' in algorithm or 'c_arm' in algorithm
                    averages_gradients = uses_c_ar or 'log_der' in algorithm

                    if uses_c_ar:
                        sess.run(self.rnn.assign_learning_rate,
                                 feed_dict={self.rnn.learning_rate: learning_rate})

                    for minibatch_idx in range(self.l_data.data['tr']['n_minibatches']):
                        if averages_gradients:
                            # Average the gradient estimate over several samples
                            # before applying it.
                            n_iterations = self.train_config['carm_iterations']
                            grads = []
                            for _ in range(n_iterations):
                                sess.run(self.rnn.c_arm_sample_op)
                                gradients = sess.run(self.rnn.gradients,
                                                     feed_dict={
                                                         self.l_data.batch_idx: minibatch_idx,
                                                         self.rnn.is_training: True
                                                     })
                                if len(grads) == 0:
                                    for j in range(len(gradients)):
                                        grads.append(gradients[j][0])
                                else:
                                    # Index-based on purpose: rebinding a loop
                                    # variable would not update scalar entries.
                                    for j in range(len(grads)):
                                        if grads[j] is not None:
                                            grads[j] += gradients[j][0]
                            for j in range(len(grads)):
                                grads[j] /= n_iterations
                            sess.run(self.rnn.train_b_op,
                                     feed_dict={
                                         gradient_ph: grad
                                         for gradient_ph, grad in zip(self.rnn.gradient_ph, grads)
                                     })
                        else:
                            sess.run(self.rnn.train_b_op,
                                     feed_dict={
                                         self.rnn.learning_rate: learning_rate,
                                         self.rnn.tau: (tau, ),
                                         self.l_data.batch_idx: minibatch_idx,
                                         self.rnn.is_training: True
                                     },
                                     options=options,
                                     run_metadata=run_metadata)

                            # Only this run call fills run_metadata, so traces
                            # are collected here.
                            if info_config['profiling']['enabled']:
                                traces.append(
                                    timeline.Timeline(run_metadata.step_stats).generate_chrome_trace_format())

                    current_epoch += 1
                    self.timer.restart('Training')

                    # Optionally store profiling results of this epoch in files
                    if info_config['profiling']['enabled']:
                        for trace_idx, trace in enumerate(traces):
                            path = info_config['profiling']['path'] + '_' + str(current_epoch) + '_' + str(
                                trace_idx)
                            with open(path + 'training.json', 'w') as f:
                                f.write(trace)

                    # TODO: Clean the cell access code
                    if info_config['cell_access']:
                        ca_1, ca_2 = sess.run([
                            self.rnn.layers[0].cell_access_mat,
                            self.rnn.layers[1].cell_access_mat
                        ], feed_dict={self.l_data.batch_idx: 0})
                        np.save(file='../nr/ca_1_' + str(self.train_config['task_id']), arr=ca_1)
                        np.save(file='../nr/ca_2_' + str(self.train_config['task_id']), arr=ca_2)

                # Checkpoint the weights so that a following session can restore them.
                model_saver.save(sess, temp_model_path)
            finally:
                writer.close()

    if info_config['save_weights']['save_best']:
        self.save_weight_probs(self.info_config['save_weights']['path'], 'best', run,
                               best_weight_probs_dict)
    return self.rnn.t_metrics.result_dict
def main():
    """Build the model graph and run it in test or training mode.

    Reads its configuration from the module-level argument namespace ``a``.
    In test mode the display fetches are evaluated ``test_count / batch_size``
    times and the resulting images are saved.  Otherwise the training loop
    runs, periodically recording summaries, traces, display images, progress
    and checkpoints according to the ``*_freq`` options.

    Raises:
        Exception: If test mode is requested without a checkpoint.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = '4'

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    testing = a.mode == "test"
    if testing and a.checkpoint is None:
        raise Exception("checkpoint required for test mode")

    # Echo the options and keep a copy next to the outputs.
    for option_name, option_value in a._get_kwargs():
        print(option_name, "=", option_value)

    options_path = os.path.join(a.output_dir, "options.json")
    with open(options_path, "w") as options_file:
        options_file.write(json.dumps(vars(a), sort_keys=True, indent=4))

    examples = load_examples()
    model = create_model(examples.inputs1, examples.inputs2, examples.inputs3,
                         examples.inputs4, examples.targets, examples.inputs5)

    with tf.name_scope("images"):
        display_fetches = {"targets": examples.targets, "outputs": model.outputs}

    # Summaries
    with tf.name_scope("inputs1_summary"):
        tf.summary.image("inputs1", examples.inputs1)
    with tf.name_scope("inputs2_summary"):
        tf.summary.image("inputs2", examples.inputs2)

    tf.summary.scalar("DUnet_loss_L1", model.DUnet_loss_L1)

    for variable in tf.trainable_variables():
        tf.summary.histogram(variable.op.name + "/values", variable)

    with tf.name_scope("parameter_count"):
        per_variable_sizes = [tf.reduce_prod(tf.shape(variable)) for variable in tf.trainable_variables()]
        parameter_count = tf.reduce_sum(per_variable_sizes)

    saver = tf.train.Saver(max_to_keep=1)

    wants_logdir = a.trace_freq > 0 or a.summary_freq > 0
    supervisor = tf.train.Supervisor(logdir=a.output_dir if wants_logdir else None,
                                     save_summaries_secs=0, saver=None)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    with supervisor.managed_session(config=session_config) as sess:
        print("parameter_count = ", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            saver.restore(sess, tf.train.latest_checkpoint(a.checkpoint))

        # An explicit step budget wins over an epoch budget; without either
        # the loop is effectively unbounded.
        if a.max_steps is not None:
            max_steps = a.max_steps
        elif a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        else:
            max_steps = 2 ** 32

        if testing:
            max_steps = int(a.test_count / a.batch_size)
            for batch_index in range(max_steps):
                results = sess.run(display_fetches)
                print(results["outputs"].shape)
                save_images(results, batch_index)
            return

        def should(freq, step):
            """Tell whether an action with period ``freq`` is due at ``step``."""
            if freq <= 0:
                return False
            return (step + 1) % freq == 0 or step == max_steps - 1

        start = time.time()
        for step in range(max_steps):
            trace_due = should(a.trace_freq, step)
            progress_due = should(a.progress_freq, step)
            summary_due = should(a.summary_freq, step)
            display_due = should(a.display_freq, step)
            save_due = should(a.save_freq, step)

            if trace_due:
                options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                run_metadata = tf.RunMetadata()
            else:
                options = None
                run_metadata = None

            fetches = {"train": model.train, "global_step": supervisor.global_step}
            if progress_due:
                fetches["DUnet_loss_L1"] = model.DUnet_loss_L1
            if summary_due:
                fetches["summary"] = supervisor.summary_op
            if display_due:
                fetches["display"] = display_fetches

            results = sess.run(fetches, options=options, run_metadata=run_metadata)
            global_step = results["global_step"]

            if summary_due:
                print("recording summary")
                supervisor.summary_writer.add_summary(results["summary"], global_step)

            if display_due:
                print("saving display images")
                save_images(results["display"], step=global_step)

            if trace_due:
                print("recording trace")
                supervisor.summary_writer.add_run_metadata(run_metadata, "step_%d" % global_step)

            if progress_due:
                # global_step will have the correct step count if we resume from a checkpoint
                train_epoch = math.ceil(global_step / examples.steps_per_epoch)
                train_step = (global_step - 1) % examples.steps_per_epoch + 1
                rate = (step + 1) * a.batch_size / (time.time() - start)
                remaining = (max_steps - step) * a.batch_size / rate
                print("progress epoch %d step %d image/sec %0.1f remaining %dm" % (
                    train_epoch, train_step, rate, remaining / 60))
                print("DUnet_loss_L1", results["DUnet_loss_L1"])

            if save_due:
                print("saving model")
                saver.save(sess, os.path.join(a.output_dir, "model"), global_step=supervisor.global_step)

            if supervisor.should_stop():
                break
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="PPO1"):
    """Run the PPO1 training loop.

    Each iteration pulls one trajectory segment from the generator, computes
    advantages and lambda-returns, runs ``self.optim_epochs`` optimisation
    epochs over the segment and logs loss/episode statistics.

    :param total_timesteps: (int) stop once this many timesteps were collected;
        a falsy value disables this stopping criterion
    :param callback: (callable) optional; called as ``callback(locals(), globals())``
        at the start of every iteration, training stops if it returns ``False``
    :param seed: (int) seed forwarded to ``self._setup_learn``
    :param log_interval: (int) not used by this method
    :param tb_log_name: (str) run name handed to ``TensorboardWriter``
    :return: ``self``
    """
    # NOTE: local variable names are part of the callback contract because the
    # callback receives ``locals()``; do not rename them casually.
    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name) as writer:
        self._setup_learn(seed)

        assert issubclass(self.policy, ActorCriticPolicy), "Error: the input policy for the PPO1 model must be " \
                                                           "an instance of common.policies.ActorCriticPolicy."

        with self.sess.as_default():
            self.adam.sync()

            # Prepare for rollouts
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_actorbatch)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()

            # rolling buffer for episode lengths
            lenbuffer = deque(maxlen=100)
            # rolling buffer for episode rewards
            rewbuffer = deque(maxlen=100)

            self.episode_reward = np.zeros((self.n_envs,))

            while True:
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) == False:
                        break

                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                if self.schedule == 'constant':
                    cur_lrmult = 1.0
                elif self.schedule == 'linear':
                    cur_lrmult = max(1.0 - float(timesteps_so_far) / total_timesteps, 0)
                else:
                    raise NotImplementedError

                logger.log("********** Iteration %i ************" % iters_so_far)
                logger.logkv("update_no", iters_so_far)

                seg = next(seg_gen)
                add_vtarg_and_adv(seg, self.gamma, self.lam)

                obs_ph, action_ph, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]

                # true_rew is the reward without discount
                if writer is not None:
                    self.episode_reward = total_episode_reward_logger(self.episode_reward,
                                                                      seg["true_rew"].reshape((self.n_envs, -1)),
                                                                      seg["dones"].reshape((self.n_envs, -1)),
                                                                      writer, timesteps_so_far)

                # predicted value function before update
                vpredbefore = seg["vpred"]

                # standardized advantage function estimate; a constant-advantage
                # batch has zero spread and is only centred, because dividing by
                # a zero std would turn the whole batch into NaN
                atarg_std = atarg.std()
                atarg = atarg - atarg.mean()
                if atarg_std > 0:
                    atarg = atarg / atarg_std

                dataset = Dataset(dict(ob=obs_ph, ac=action_ph, atarg=atarg, vtarg=tdlamret),
                                  shuffle=not issubclass(self.policy, LstmPolicy))
                optim_batchsize = self.optim_batchsize or obs_ph.shape[0]

                # set old parameter values to new parameter values
                self.assign_old_eq_new(sess=self.sess)
                logger.log("Optimizing...")
                logger.log(fmt_row(13, self.loss_names))

                # Here we do a bunch of optimization epochs over the data
                for k in range(self.optim_epochs):
                    # list of tuples, each of which gives the loss for a minibatch
                    losses = []
                    for i, batch in enumerate(dataset.iterate_once(optim_batchsize)):
                        steps = (timesteps_so_far +
                                 k * optim_batchsize +
                                 int(i * (optim_batchsize / len(dataset.data_map))))
                        if writer is not None:
                            # run loss backprop with summary, but once every 10 runs save the metadata
                            # (memory, compute time, ...)
                            if (1 + k) % 10 == 0:
                                run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                                run_metadata = tf.RunMetadata()
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess,
                                                                             options=run_options,
                                                                             run_metadata=run_metadata)
                                writer.add_run_metadata(run_metadata, 'step%d' % steps)
                            else:
                                summary, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                             batch["atarg"], batch["vtarg"],
                                                                             cur_lrmult, sess=self.sess)
                            writer.add_summary(summary, steps)
                        else:
                            _, grad, *newlosses = self.lossandgrad(batch["ob"], batch["ob"], batch["ac"],
                                                                   batch["atarg"], batch["vtarg"], cur_lrmult,
                                                                   sess=self.sess)

                        self.adam.update(grad, self.optim_stepsize * cur_lrmult)
                        losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(losses, axis=0)))

                logger.log("Evaluating losses...")
                losses = []
                for batch in dataset.iterate_once(optim_batchsize):
                    newlosses = self.compute_losses(batch["ob"], batch["ob"], batch["ac"], batch["atarg"],
                                                    batch["vtarg"], cur_lrmult, sess=self.sess)
                    losses.append(newlosses)
                mean_losses, _, _ = mpi_moments(losses, axis=0)
                logger.log(fmt_row(13, mean_losses))
                for (loss_val, name) in zipsame(mean_losses, self.loss_names):
                    logger.record_tabular("loss_" + name, loss_val)
                logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret))

                # local values
                lrlocal = (seg["ep_lens"], seg["ep_rets"])

                # list of tuples
                listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal)
                lens, rews = map(flatten_lists, zip(*listoflrpairs))
                lenbuffer.extend(lens)
                rewbuffer.extend(rews)
                logger.record_tabular("EpLenMean", np.mean(lenbuffer))
                logger.record_tabular("EpRewMean", np.mean(rewbuffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                timesteps_so_far += MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                iters_so_far += 1
                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", timesteps_so_far)
                logger.record_tabular("TimeElapsed", time.time() - t_start)
                if self.verbose >= 1 and MPI.COMM_WORLD.Get_rank() == 0:
                    logger.dump_tabular()

    return self
def train_model(self, sess, max_iters):
    """Network training loop.

    Builds the RPN and R-CNN classification / box-regression losses on top of
    the outputs of ``self.net``, minimises their sum with a momentum optimizer
    under a staircase exponentially decayed learning rate, and snapshots the
    model every ``cfg.TRAIN.SNAPSHOT_ITERS`` iterations as well as after the
    last iteration.

    Args:
        sess: TensorFlow session used to initialise variables and run training.
        max_iters: Number of training iterations to run.
    """
    data_layer = get_data_layer(self.roidb, self.imdb.num_classes)

    # RPN
    # classification loss (anchors labelled -1 are excluded)
    rpn_cls_score = tf.reshape(self.net.get_output('rpn_cls_score_reshape'), [-1, 2])
    rpn_label = tf.reshape(self.net.get_output('rpn-data')[0], [-1])
    rpn_keep = tf.where(tf.not_equal(rpn_label, -1))
    rpn_cls_score = tf.reshape(tf.gather(rpn_cls_score, rpn_keep), [-1, 2])
    rpn_label = tf.reshape(tf.gather(rpn_label, rpn_keep), [-1])
    rpn_cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=rpn_cls_score, labels=rpn_label))

    # bounding box regression L1 loss
    rpn_bbox_pred = self.net.get_output('rpn_bbox_pred')
    rpn_bbox_targets = tf.transpose(self.net.get_output('rpn-data')[1], [0, 2, 3, 1])
    rpn_bbox_inside_weights = tf.transpose(self.net.get_output('rpn-data')[2], [0, 2, 3, 1])
    rpn_bbox_outside_weights = tf.transpose(self.net.get_output('rpn-data')[3], [0, 2, 3, 1])

    rpn_smooth_l1 = self._modified_smooth_l1(3.0, rpn_bbox_pred, rpn_bbox_targets,
                                             rpn_bbox_inside_weights, rpn_bbox_outside_weights)
    rpn_loss_box = tf.reduce_mean(tf.reduce_sum(rpn_smooth_l1, reduction_indices=[1, 2, 3]))

    # R-CNN
    # classification loss
    cls_score = self.net.get_output('cls_score')
    label = tf.reshape(self.net.get_output('roi-data')[1], [-1])
    cross_entropy = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(logits=cls_score, labels=label))

    # bounding box regression L1 loss
    bbox_pred = self.net.get_output('bbox_pred')
    bbox_targets = self.net.get_output('roi-data')[2]
    bbox_inside_weights = self.net.get_output('roi-data')[3]
    bbox_outside_weights = self.net.get_output('roi-data')[4]

    smooth_l1 = self._modified_smooth_l1(1.0, bbox_pred, bbox_targets,
                                         bbox_inside_weights, bbox_outside_weights)
    loss_box = tf.reduce_mean(tf.reduce_sum(smooth_l1, reduction_indices=[1]))

    # final loss
    loss = cross_entropy + loss_box + rpn_cross_entropy + rpn_loss_box

    # optimizer and learning rate
    global_step = tf.Variable(0, trainable=False)
    lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE, global_step,
                                    cfg.TRAIN.STEPSIZE, 0.1, staircase=True)
    momentum = cfg.TRAIN.MOMENTUM
    train_op = tf.train.MomentumOptimizer(lr, momentum).minimize(loss, global_step=global_step)

    # initialize variables
    sess.run(tf.global_variables_initializer())
    if self.pretrained_model is not None:
        print(('Loading pretrained model '
               'weights from {:s}').format(self.pretrained_model))
        self.net.load(self.pretrained_model, sess, self.saver, True)

    last_snapshot_iter = -1
    # Pre-set so that the final-snapshot check below is well defined (and a
    # no-op) when max_iters is 0.
    step = -1
    timer = Timer()
    for step in range(max_iters):
        # get one batch
        blobs = data_layer.forward()

        # Make one SGD update
        feed_dict = {
            self.net.data: blobs['data'],
            self.net.im_info: blobs['im_info'],
            self.net.keep_prob: 0.5,
            self.net.gt_boxes: blobs['gt_boxes'],
        }

        run_options = None
        run_metadata = None
        if cfg.TRAIN.DEBUG_TIMELINE:
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

        timer.tic()
        rpn_loss_cls_value, rpn_loss_box_value, loss_cls_value, loss_box_value, _ = sess.run(
            [rpn_cross_entropy, rpn_loss_box, cross_entropy, loss_box, train_op],
            feed_dict=feed_dict,
            options=run_options,
            run_metadata=run_metadata)
        timer.toc()

        if cfg.TRAIN.DEBUG_TIMELINE:
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            # Millisecond timestamp keeps the per-iteration trace files distinct;
            # int() works on both Python 2 and 3 (long() exists on 2 only).
            trace_path = str(int(time.time() * 1000)) + '-train-timeline.ctf.json'
            with open(trace_path, 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format(show_memory=False))

        if (step + 1) % (cfg.TRAIN.DISPLAY) == 0:
            total_loss_value = rpn_loss_cls_value + rpn_loss_box_value + loss_cls_value + loss_box_value
            # Evaluate the learning rate through the session we were given
            # rather than relying on an implicit default session.
            print('iter: %d / %d, total loss: %.4f, rpn_loss_cls: %.4f, rpn_loss_box: %.4f, loss_cls: %.4f, loss_box: %.4f, lr: %f' %
                  (step + 1, max_iters, total_loss_value, rpn_loss_cls_value, rpn_loss_box_value,
                   loss_cls_value, loss_box_value, sess.run(lr)))
            print('speed: {:.3f}s / iter'.format(timer.average_time))

        if (step + 1) % cfg.TRAIN.SNAPSHOT_ITERS == 0:
            last_snapshot_iter = step
            self.snapshot(sess, step)

    # Make sure the final state is snapshotted exactly once.
    if last_snapshot_iter != step:
        self.snapshot(sess, step)
def word2vec_basic(log_dir): """Example of building, training and visualizing a word2vec model.""" # Create the directory for TensorBoard variables if there is not. if not os.path.exists(log_dir): os.makedirs(log_dir) # Step 1: Download the data. url = 'http://mattmahoney.net/dc/' # pylint: disable=redefined-outer-name def maybe_download(filename, expected_bytes): """Download a file if not present, and make sure it's the right size.""" local_filename = os.path.join(gettempdir(), filename) if not os.path.exists(local_filename): local_filename, _ = urllib.request.urlretrieve( url + filename, local_filename) statinfo = os.stat(local_filename) if statinfo.st_size == expected_bytes: print('Found and verified', filename) else: print(statinfo.st_size) raise Exception('Failed to verify ' + local_filename + '. Can you get to it with a browser?') return local_filename filename = maybe_download('text8.zip', 31344016) # Read the data into a list of strings. def read_data(filename): """Extract the first file enclosed in a zip file as a list of words.""" with zipfile.ZipFile(filename) as f: data = tf.compat.as_str(f.read(f.namelist()[0])).split() return data vocabulary = read_data(filename) print('Data size', len(vocabulary)) # Step 2: Build the dictionary and replace rare words with UNK token. vocabulary_size = 50000 def build_dataset(words, n_words): """Process raw inputs into a dataset.""" count = [['UNK', -1]] count.extend(collections.Counter(words).most_common(n_words - 1)) dictionary = {} for word, _ in count: dictionary[word] = len(dictionary) data = [] unk_count = 0 for word in words: index = dictionary.get(word, 0) if index == 0: # dictionary['UNK'] unk_count += 1 data.append(index) count[0][1] = unk_count reversed_dictionary = dict(zip(dictionary.values(), dictionary.keys())) return data, count, dictionary, reversed_dictionary # Filling 4 global variables: # data - list of codes (integers from 0 to vocabulary_size-1). 
# This is the original text but words are replaced by their codes # count - map of words(strings) to count of occurrences # dictionary - map of words(strings) to their codes(integers) # reverse_dictionary - maps codes(integers) to words(strings) data, count, unused_dictionary, reverse_dictionary = build_dataset( vocabulary, vocabulary_size) del vocabulary # Hint to reduce memory. print('Most common words (+UNK)', count[:5]) print('Sample data', data[:10], [reverse_dictionary[i] for i in data[:10]]) # Step 3: Function to generate a training batch for the skip-gram model. def generate_batch(batch_size, num_skips, skip_window): global data_index assert batch_size % num_skips == 0 assert num_skips <= 2 * skip_window batch = np.ndarray(shape=(batch_size), dtype=np.int32) labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32) span = 2 * skip_window + 1 # [ skip_window target skip_window ] buffer = collections.deque(maxlen=span) # pylint: disable=redefined-builtin if data_index + span > len(data): data_index = 0 buffer.extend(data[data_index:data_index + span]) data_index += span for i in range(batch_size // num_skips): context_words = [w for w in range(span) if w != skip_window] words_to_use = random.sample(context_words, num_skips) for j, context_word in enumerate(words_to_use): batch[i * num_skips + j] = buffer[skip_window] labels[i * num_skips + j, 0] = buffer[context_word] if data_index == len(data): buffer.extend(data[0:span]) data_index = span else: buffer.append(data[data_index]) data_index += 1 # Backtrack a little bit to avoid skipping words in the end of a batch data_index = (data_index + len(data) - span) % len(data) return batch, labels batch, labels = generate_batch(batch_size=8, num_skips=2, skip_window=1) for i in range(8): print(batch[i], reverse_dictionary[batch[i]], '->', labels[i, 0], reverse_dictionary[labels[i, 0]]) # Step 4: Build and train a skip-gram model. batch_size = 128 embedding_size = 128 # Dimension of the embedding vector. 
skip_window = 1  # How many words to consider left and right.
num_skips = 2  # How many times to reuse an input to generate a label.
num_sampled = 64  # Number of negative examples to sample.

# We pick a random validation set to sample nearest neighbors. Here we limit
# the validation samples to the words that have a low numeric ID, which by
# construction are also the most frequent. These 3 variables are used only for
# displaying model accuracy, they don't affect calculation.
valid_size = 16  # Random set of words to evaluate similarity on.
valid_window = 100  # Only pick dev samples in the head of the distribution.
valid_examples = np.random.choice(valid_window, valid_size, replace=False)

# Everything below is built in a dedicated graph so that the session opened in
# Step 5 only sees these ops.
graph = tf.Graph()

with graph.as_default():

    # Input data.
    with tf.name_scope('inputs'):
        train_inputs = tf.placeholder(tf.int32, shape=[batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)

    # Ops and variables pinned to the CPU because of missing GPU implementation
    with tf.device('/cpu:0'):
        # Look up embeddings for inputs.
        with tf.name_scope('embeddings'):
            embeddings = tf.Variable(
                tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))
            embed = tf.nn.embedding_lookup(embeddings, train_inputs)

        # Construct the variables for the NCE loss
        with tf.name_scope('weights'):
            nce_weights = tf.Variable(
                tf.truncated_normal([vocabulary_size, embedding_size],
                                    stddev=1.0 / math.sqrt(embedding_size)))
        with tf.name_scope('biases'):
            nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

    # Compute the average NCE loss for the batch.
    # tf.nce_loss automatically draws a new sample of the negative labels each
    # time we evaluate the loss.
    # Explanation of the meaning of NCE loss:
    #   http://mccormickml.com/2016/04/19/word2vec-tutorial-the-skip-gram-model/
    with tf.name_scope('loss'):
        loss = tf.reduce_mean(
            tf.nn.nce_loss(weights=nce_weights,
                           biases=nce_biases,
                           labels=train_labels,
                           inputs=embed,
                           num_sampled=num_sampled,
                           num_classes=vocabulary_size))

    # Add the loss value as a scalar to summary.
    tf.summary.scalar('loss', loss)

    # Construct the SGD optimizer using a learning rate of 1.0.
    with tf.name_scope('optimizer'):
        optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)

    # Compute the cosine similarity between minibatch examples and all
    # embeddings.
    # Rows are scaled to unit L2 norm first, so the matmul below yields cosines.
    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keepdims=True))
    normalized_embeddings = embeddings / norm
    valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings,
                                              valid_dataset)
    similarity = tf.matmul(valid_embeddings,
                           normalized_embeddings,
                           transpose_b=True)

    # Merge all summaries.
    merged = tf.summary.merge_all()

    # Add variable initializer.
    init = tf.global_variables_initializer()

    # Create a saver.
    saver = tf.train.Saver()

# Step 5: Begin training.
num_steps = 100001

with tf.Session(graph=graph) as session:
    # Open a writer to write summaries.
    writer = tf.summary.FileWriter(log_dir, session.graph)

    # We must initialize all variables before we use them.
    init.run()
    print('Initialized')

    average_loss = 0
    for step in xrange(num_steps):
        batch_inputs, batch_labels = generate_batch(
            batch_size, num_skips, skip_window)
        feed_dict = {
            train_inputs: batch_inputs,
            train_labels: batch_labels
        }

        # Define metadata variable.
        run_metadata = tf.RunMetadata()

        # We perform one update step by evaluating the optimizer op (including it
        # in the list of returned values for session.run()
        # Also, evaluate the merged op to get all summaries from the returned
        # "summary" variable. Feed metadata variable to session for visualizing
        # the graph in TensorBoard.
        # NOTE(review): no `options=` argument is supplied alongside
        # run_metadata here - confirm this is enough for the tracing wanted.
        _, summary, loss_val = session.run([optimizer, merged, loss],
                                           feed_dict=feed_dict,
                                           run_metadata=run_metadata)
        average_loss += loss_val

        # Add returned summaries to writer in each step.
        writer.add_summary(summary, step)
        # Add metadata to visualize the graph for the last run.
        if step == (num_steps - 1):
            writer.add_run_metadata(run_metadata, 'step%d' % step)

        if step % 2000 == 0:
            # At step 0 only one batch has been accumulated, so no division.
            if step > 0:
                average_loss /= 2000
            # The average loss is an estimate of the loss over the last 2000
            # batches.
            print('Average loss at step ', step, ': ', average_loss)
            average_loss = 0

        # Note that this is expensive (~20% slowdown if computed every 500 steps)
        if step % 10000 == 0:
            sim = similarity.eval()
            for i in xrange(valid_size):
                valid_word = reverse_dictionary[valid_examples[i]]
                top_k = 8  # number of nearest neighbors
                # Sort by descending similarity; index 0 is skipped (the word
                # itself has the highest cosine with its own embedding).
                nearest = (-sim[i, :]).argsort()[1:top_k + 1]
                log_str = 'Nearest to %s:' % valid_word
                for k in xrange(top_k):
                    close_word = reverse_dictionary[nearest[k]]
                    log_str = '%s %s,' % (log_str, close_word)
                print(log_str)
    # Snapshot of the unit-normalised embedding matrix after training; used by
    # the visualisation step below.
    final_embeddings = normalized_embeddings.eval()

    # Write corresponding labels for the embeddings.
    # One word per line, in code order (line i is the word with code i).
    with open(log_dir + '/metadata.tsv', 'w') as f:
        for i in xrange(vocabulary_size):
            f.write(reverse_dictionary[i] + '\n')

    # Save the model for checkpoints.
    saver.save(session, os.path.join(log_dir, 'model.ckpt'))

    # Create a configuration for visualizing embeddings with the labels in
    # TensorBoard.
    config = projector.ProjectorConfig()
    embedding_conf = config.embeddings.add()
    embedding_conf.tensor_name = embeddings.name
    embedding_conf.metadata_path = os.path.join(log_dir, 'metadata.tsv')
    projector.visualize_embeddings(writer, config)

writer.close()

# Step 6: Visualize the embeddings.


# pylint: disable=missing-docstring
# Function to draw visualization of distance between embeddings.
def plot_with_labels(low_dim_embs, labels, filename):
    """Scatter-plot 2-D embeddings, annotate each point, and save the figure.

    Args:
        low_dim_embs: array whose row ``i`` is the (x, y) position of
            ``labels[i]``; must have at least ``len(labels)`` rows.
        labels: text annotations, one per plotted point.
        filename: path passed to ``plt.savefig``.
    """
    n_points = low_dim_embs.shape[0]
    assert n_points >= len(labels), 'More labels than embeddings'

    plt.figure(figsize=(18, 18))  # in inches
    for row, text in enumerate(labels):
        px, py = low_dim_embs[row, :]
        plt.scatter(px, py)
        plt.annotate(text,
                     xy=(px, py),
                     xytext=(5, 2),
                     textcoords='offset points',
                     ha='right',
                     va='bottom')
    plt.savefig(filename)


# The plotting dependencies are optional: if they are missing, print a hint
# instead of failing the whole script.
try:
    # pylint: disable=g-import-not-at-top
    from sklearn.manifold import TSNE
    import matplotlib.pyplot as plt

    tsne = TSNE(perplexity=30,
                n_components=2,
                init='pca',
                n_iter=5000,
                method='exact')
    # Only the first `plot_only` rows of the embedding matrix are projected.
    plot_only = 500
    low_dim_embs = tsne.fit_transform(final_embeddings[:plot_only, :])
    labels = [reverse_dictionary[i] for i in xrange(plot_only)]
    plot_with_labels(low_dim_embs, labels,
                     os.path.join(gettempdir(), 'tsne.png'))
except ImportError as ex:
    print(
        'Please install sklearn, matplotlib, and scipy to show embeddings.'
    )
    print(ex)
def run(self, sess):
    """Train ``self.model`` in ``sess`` with periodic checkpoints and validation.

    Does nothing when ``self.init`` is falsy.  Otherwise builds an Adam
    training op on ``self.model.loss``, optionally restores the latest
    checkpoint, runs the training loop, saves a final checkpoint and closes
    both the training summary writer and ``sess``.

    Args:
        sess: the TensorFlow session to train in; it is closed before
            returning.

    Raises:
        NotImplementedError: if ``self.args.lr_scheduler`` is set; only a
            constant learning rate is implemented.
    """
    if not self.init:
        return

    train_data = data_parser(self.args)
    self.model.setup_training(sess)

    # Fail early (and with a properly formatted message) for schedulers;
    # the original passed the format string and its argument separately.
    if self.args.lr_scheduler is not None:
        raise NotImplementedError(
            'Learning rate scheduler type [{}] is not implemented'.format(
                self.args.lr_scheduler))
    learning_rate = tf.constant(self.args.learning_rate, dtype=tf.float16)

    opt = tf.train.AdamOptimizer(learning_rate)
    trainG = opt.minimize(self.model.loss)  # like hed
    saver = tf.train.Saver(max_to_keep=7)
    sess.run(tf.global_variables_initializer())

    # here to recovery previous training
    if self.args.use_previous_trained:
        # Both dataset cases resolve to the same
        # <checkpoint_dir>/<model>_<train_dataset>/train directory.
        if self.args.dataset_name.lower() != 'biped':
            # using biped pretrained to use in other dataset
            model_path = os.path.join(
                self.args.checkpoint_dir,
                self.args.model_name + '_' + self.args.train_dataset,
                'train')
        else:
            model_path = os.path.join(
                self.args.checkpoint_dir,
                self.args.model_name + '_' + self.args.train_dataset)
            model_path = os.path.join(model_path, 'train')

        if not os.path.exists(model_path) or len(
                os.listdir(model_path)) == 0:
            ini = 0
            maxi = self.args.max_iterations + 1
            print_warning(
                'There is not previous trained data for the current model... and'
            )
            print_warning(
                '*** The training process is starting from scratch ***')
        else:
            # restoring using the last checkpoint
            assert (
                len(os.listdir(model_path)) != 0
            ), 'There is not previous trained data for the current model...'
            last_ckpt = tf.train.latest_checkpoint(model_path)
            saver.restore(sess, last_ckpt)
            # Resumed runs continue the step count after max_iterations.
            ini = self.args.max_iterations
            maxi = ini + self.args.max_iterations + 1  # check
            print_info(
                '--> Previous model restored successfully: {}'.format(
                    last_ckpt))
    else:
        print_warning(
            '*** The training process is starting from scratch ***')
        ini = 0
        maxi = ini + self.args.max_iterations

    # Lowest training loss seen so far (1000. is just a large start value).
    prev_loss = 1000.

    # directories for checkpoints
    checkpoint_dir = os.path.join(
        self.args.checkpoint_dir,
        self.args.model_name + '_' + self.args.train_dataset,
        self.args.model_state)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)
    ckpt_prefix = os.path.join(checkpoint_dir, self.args.model_name)

    fig = plt.figure()
    for idx in range(ini, maxi):
        x_batch, y_batch, _ = get_training_batch(self.args, train_data)
        # NOTE(review): this RunMetadata is never passed to sess.run, so
        # the add_run_metadata call below records an unfilled proto.
        run_metadata = tf.RunMetadata()

        _, summary, loss, pred_maps = sess.run([
            trainG, self.model.merged_summary, self.model.loss,
            self.model.predictions
        ],
                                               feed_dict={
                                                   self.model.images: x_batch,
                                                   self.model.edgemaps: y_batch
                                               })
        if idx % 5 == 0:
            self.model.train_writer.add_run_metadata(
                run_metadata, 'step{:06}'.format(idx))
            self.model.train_writer.add_summary(summary, idx)
            print(time.ctime(), '[{}/{}]'.format(idx, maxi),
                  ' TRAINING loss: %.5f' % loss,
                  'prev_loss: %.5f' % prev_loss)

        # saving trained parameters
        if prev_loss > loss:
            saver.save(sess, ckpt_prefix, global_step=idx)
            prev_loss = loss
            print("Weights saved in the lowest loss", idx, " Current Loss",
                  prev_loss)

        if idx % self.args.save_interval == 0:
            saver.save(sess, ckpt_prefix, global_step=idx)
            prev_loss = loss
            print("Weights saved in the interval", idx, " Current Loss",
                  prev_loss)

        # ********* for validation **********
        if (idx + 1) % self.args.val_interval == 0:
            pause_show = 0.01
            # Visualise sample index 2 of the batch: image, ground truth,
            # then that sample's slice of every prediction map.
            imgs_list = [x_batch[2][:, :, 0:3], y_batch[2]]
            for i in range(len(pred_maps)):
                imgs_list.append(pred_maps[i][2, ...])
            vis_imgs = visualize_result(imgs_list, self.args)
            fig.suptitle("Iterac:" + str(idx + 1) + " Loss:" +
                         '%.5f' % loss + " training")
            fig.add_subplot(1, 1, 1)
            plt.imshow(np.uint8(vis_imgs))

            print("Evaluation in progress...")
            plt.draw()
            plt.pause(pause_show)

            im, em, _ = get_validation_batch(self.args, train_data)
            summary, error, _ = sess.run([
                self.model.merged_summary, self.model.error,
                self.model.fuse_output
            ],
                                         feed_dict={
                                             self.model.images: im,
                                             self.model.edgemaps: em
                                         })
            if error <= 0.08:
                saver.save(sess, ckpt_prefix, global_step=idx)
                prev_loss = loss
                print(
                    "Parameters saved in the validation stage when its error is <=0.08::",
                    error)

            self.model.val_writer.add_summary(summary, idx)
            print_info(('[{}/{}]'.format(idx, self.args.max_iterations),
                        'VALIDATION error: %0.5f' % error,
                        'pError: %.5f' % prev_loss))
            # Recreate the figure periodically instead of reusing one
            # figure for the whole run.
            if (idx + 1) % (self.args.val_interval * 150) == 0:
                print('updating visualisation')
                plt.close()
                fig = plt.figure()

    saver.save(sess, ckpt_prefix, global_step=idx)
    print("Final Weights saved", idx, " Current Loss", loss)
    self.model.train_writer.close()
    sess.close()
def main(_):
    """Train a softmax regression on MNIST, optionally with XLA JIT enabled.

    Runs 1000 SGD steps; the last step is executed with full tracing and its
    timeline is written to ``timeline.ctf.json`` (Chrome trace format).  The
    test-set accuracy is printed at the end.

    Args:
        _: unused positional argument.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)

    # Create the model
    x = tf.placeholder(tf.float32, [None, 784])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, w) + b

    # Define loss and optimizer
    y_ = tf.placeholder(tf.float32, [None, 10])

    # The raw formulation of cross-entropy,
    #
    #   tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(tf.nn.softmax(y)),
    #                                 reduction_indices=[1]))
    #
    # can be numerically unstable.
    #
    # So here we use tf.nn.softmax_cross_entropy_with_logits on the raw
    # outputs of 'y', and then average across the batch.
    cross_entropy = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y))
    train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)

    config = tf.ConfigProto()
    jit_level = 0
    if FLAGS.xla:
        # Turns on XLA JIT compilation.
        jit_level = tf.OptimizerOptions.ON_1

    config.graph_options.optimizer_options.global_jit_level = jit_level
    run_metadata = tf.RunMetadata()
    sess = tf.Session(config=config)
    tf.global_variables_initializer().run(session=sess)

    # Train
    train_loops = 1000
    for i in range(train_loops):
        batch_xs, batch_ys = mnist.train.next_batch(100)

        # Create a timeline for the last loop and export to json to view with
        # chrome://tracing/.
        if i == train_loops - 1:
            sess.run(
                train_step,
                feed_dict={
                    x: batch_xs,
                    y_: batch_ys
                },
                options=tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE),
                run_metadata=run_metadata)
            trace = timeline.Timeline(step_stats=run_metadata.step_stats)
            # Use a context manager so the trace file is flushed and closed.
            with open('timeline.ctf.json', 'w') as trace_file:
                trace_file.write(trace.generate_chrome_trace_format())
        else:
            sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})

    # Test trained model
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print(
        sess.run(accuracy,
                 feed_dict={
                     x: mnist.test.images,
                     y_: mnist.test.labels
                 }))
    sess.close()
def model(X_train,
          Y_train,
          X_test,
          Y_test,
          learning_rate=0.003,
          num_epochs=50,
          minibatch_size=64,
          print_cost=True):
    """Build, train and evaluate the network on the given data.

    Args:
        X_train: training inputs; ``X_train.shape`` is unpacked as
            ``(m, n_H0, n_W0, n_C0)``.
        Y_train: training labels; ``Y_train.shape[1]`` is the label width.
        X_test: test inputs.
        Y_test: test labels.
        learning_rate: learning rate given to the Adam optimizer.
        num_epochs: number of passes over the training set.
        minibatch_size: size requested from ``random_mini_batches``.
        print_cost: when true, print the cost after every epoch and record
            it for the cost plot.

    Returns:
        ``(train_accuracy, test_accuracy, parameters)``.
    """
    ops.reset_default_graph(
    )  # to be able to rerun the model without overwriting tf variables
    tf.set_random_seed(1)  # to keep results consistent (tensorflow seed)
    seed = 3  # to keep results consistent (numpy seed)
    (m, n_H0, n_W0, n_C0) = X_train.shape
    n_y = Y_train.shape[1]
    costs = []  # To keep track of the cost

    # Create Placeholders of the correct shape
    with tf.name_scope("input"):
        X, Y = create_placeholders(n_H0, n_W0, n_C0, n_y)

    # Log one input image (sample index 5 of the fed batch) to TensorBoard.
    with tf.name_scope('input_image'):
        image_in = tf.reshape(X, [-1, 64, 64, 3])
        tf.summary.image('input_image', [image_in[5, :, :, :]], 5)

    # Initialize parameters
    parameters = initialize_parameters()

    # Forward propagation: Build the forward propagation in the tensorflow graph
    Z3 = forward_propagation(X, parameters)

    # Cost function: Add cost function to tensorflow graph
    cost = compute_cost(Z3, Y)

    # Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer that minimizes the cost.
    with tf.name_scope("optimizer"):
        optimizer = tf.train.AdamOptimizer(
            learning_rate=learning_rate).minimize(cost)

    # Initialize all the variables globally
    init = tf.global_variables_initializer()
    merged = tf.summary.merge_all()
    writer = tf.summary.FileWriter("/anaconda3/dnn/con4",
                                   tf.get_default_graph())

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:

        # Run the initialization
        sess.run(init)

        # Do the training loop
        for epoch in range(num_epochs):

            minibatch_cost = 0.
            num_minibatches = int(
                m / minibatch_size
            )  # number of minibatches of size minibatch_size in the train set
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train,
                                              minibatch_size, seed)

            for batch_no, minibatch in enumerate(minibatches, start=1):
                # Select a minibatch
                (minibatch_X, minibatch_Y) = minibatch
                feed = {X: minibatch_X, Y: minibatch_Y}

                if batch_no == 10:
                    # Trace the 10th batch of each epoch for TensorBoard.
                    run_options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()
                    _, temp_cost = sess.run([optimizer, cost],
                                            feed_dict=feed,
                                            options=run_options,
                                            run_metadata=run_metadata)
                    writer.add_run_metadata(run_metadata, str(epoch))
                elif batch_no == 20:
                    # The 20th batch only evaluates summaries and cost (no
                    # optimizer step, as before).  The fetched cost is now
                    # kept so the epoch average does not reuse the stale
                    # value from batch 19.
                    summary, temp_cost = sess.run([merged, cost],
                                                  feed_dict=feed)
                    writer.add_summary(summary, epoch)
                else:
                    # IMPORTANT: The line that runs the graph on a minibatch.
                    _, temp_cost = sess.run([optimizer, cost], feed_dict=feed)

                # No tf.summary.scalar here: creating a summary op per batch
                # only grew the graph and was never fetched, because
                # `merged` was built before the loop.
                minibatch_cost += temp_cost / num_minibatches

            # Print and record the cost every epoch
            if print_cost:
                print("Cost after epoch %i: %f" % (epoch, minibatch_cost))
                costs.append(minibatch_cost)

        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per tens)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()

        # Calculate the correct predictions
        predict_op = tf.argmax(Z3, 1)
        correct_prediction = tf.equal(predict_op, tf.argmax(Y, 1))

        # Calculate accuracy on the test set
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        train_accuracy = accuracy.eval({X: X_train, Y: Y_train})
        test_accuracy = accuracy.eval({X: X_test, Y: Y_test})
        print("Train Accuracy:", train_accuracy)
        print("Test Accuracy:", test_accuracy)
        writer.close()

        return train_accuracy, test_accuracy, parameters
def timeGraph(gdef,
              batch_size=128,
              image_folder='images',
              latencyMS=30,
              powerCapW=50,
              result_file="ResultsLog.txt"):
    """Run inference over a folder of images while steering batch size and GPU clock.

    Every image matched by ``image_folder + '/*.*'`` is processed exactly
    once.  Roughly every 3 seconds the peak of the module-level
    ``powerReading`` list is compared with ``powerCapW``: below 80% of the cap
    the batch size is raised by bisection (or the clock step is raised once
    the largest batch size is reached); above the cap it is lowered the same
    way.  One CSV row per batch is written to ``result_file``.

    Args:
        gdef: GraphDef to import; its "input" is mapped to the dataset
            iterator and "MobilenetV1/Predictions/Softmax" is fetched.
        batch_size: unused; kept for interface compatibility.
        image_folder: directory that is globbed for input images.
        latencyMS: unused; kept for interface compatibility.
        powerCapW: power cap the controller tries to stay under.
        result_file: path of the CSV log that is (over)written.

    Returns:
        ``(timings, True, val[0], None)`` where ``timings`` holds the
        wall-clock seconds of each batch and ``val`` is the last batch's
        output.
    """

    def set_application_clock(level):
        # Single place for the nvidia-smi call that used to be repeated inline.
        os.system(
            "echo sudo_password | sudo -S nvidia-smi --applications-clocks=3615,"
            + str(level))

    tf.logging.info("Starting execution")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
    tf.reset_default_graph()
    g = tf.Graph()
    outlist = []
    with g.as_default():
        imageString = tf.placeholder(tf.string, name='imageString')
        imagenstack = tf.stack(imageString)
        batch_size_dynamic = tf.placeholder(tf.int64,
                                            shape=(),
                                            name='batch_size_dynamic')
        dataset = tf.data.Dataset.from_tensor_slices(imagenstack)
        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size_dynamic)
        dataset = dataset.repeat()
        iterator = dataset.make_initializable_iterator()
        next_element = iterator.get_next()
        out = tf.import_graph_def(
            graph_def=gdef,
            input_map={"input": next_element},
            return_elements=["MobilenetV1/Predictions/Softmax"])
        out = out[0].outputs[0]
        outlist.append(out)

    timings = []

    with tf.Session(graph=g,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        processed_image = 0
        StringofImage = list(glob.glob(image_folder + '/*.*'))
        imageCounter = len(StringofImage)

        # Candidate batch sizes; the *BS variables below are indices into it.
        listBatchSizeSteps = list(range(1, 256))
        MAXINDEX = len(listBatchSizeSteps) - 1
        MININDEX = 0
        minBS = MININDEX
        maxBS = MAXINDEX
        initialBS = 0
        BestBS = MININDEX
        ThroughputPerSecond = [-1] * N
        newbatchSize = listBatchSizeSteps[initialBS]

        listDVFSSteps = [544, 632, 734, 835, 949, 1063, 1189, 1303, 1430, 1531]
        minDVFS = 0
        maxDVFS = 9
        initialDVFS = 5
        currentDVFS = 5
        DVFSLevel = listDVFSSteps[initialDVFS]

        resultsFile = open(result_file, 'w')
        # try/finally so the log is flushed and closed even if a batch fails.
        try:
            resultsFile.write(
                "time stamp, Througput (image/sec), power, batch size, minBS, maxBS, BestBS, power cap, DVFS\n"
            )
            set_application_clock(DVFSLevel)

            two_second_start = time.time()
            while True:
                if (time.time() - two_second_start) > 3:
                    two_second_start = time.time()
                    # NOTE(review): max() raises ValueError if powerReading
                    # is empty at this point - confirm the producer always
                    # appends within the interval.
                    peak_power = max(powerReading)
                    # The samples are consumed by every decision below.
                    powerReading[:] = []

                    # if power cap is meeting, do not do anything. we use a
                    # margin (0.8 * cap) to avoid lots of fluctuations
                    if 0.8 * powerCapW <= peak_power <= powerCapW:
                        pass
                    elif peak_power < (0.8 * powerCapW):
                        # Power less than power cap
                        if BestBS == MAXINDEX:
                            print("Max BS, no further improvement")
                            if currentDVFS != maxDVFS:
                                currentDVFS = currentDVFS + 1  # increase DVFS by one step
                                DVFSLevel = listDVFSSteps[currentDVFS]
                                set_application_clock(DVFSLevel)
                        else:
                            if BestBS == maxBS:
                                # Upper bound reached: reopen the search
                                # range up to the global maximum.
                                minBS = maxBS
                                maxBS = MAXINDEX
                            else:
                                minBS = BestBS
                            BestBS = int(math.ceil((minBS + maxBS) / float(2)))
                            newbatchSize = listBatchSizeSteps[BestBS]
                    else:
                        # Power More than Power Cap
                        if BestBS == MININDEX:
                            print("Min BS, No Possible Solution")
                            if currentDVFS != minDVFS:
                                currentDVFS = currentDVFS - 1  # decrease DVFS by one step
                                DVFSLevel = listDVFSSteps[currentDVFS]
                                set_application_clock(DVFSLevel)
                        else:
                            if BestBS == minBS:
                                # It is stuck in a loop where there is no
                                # escape. Restart everything from beginning.
                                maxBS = minBS
                                minBS = MININDEX
                            else:
                                maxBS = BestBS
                            BestBS = int(math.floor((minBS + maxBS) / float(2)))
                            newbatchSize = listBatchSizeSteps[BestBS]

                # For last run to make sure that the batch size is not greater
                # than the remaining number of images
                if (processed_image + newbatchSize) > imageCounter:
                    print("Entered If for processed_image")
                    newbatchSize = imageCounter - processed_image

                # Take the next `newbatchSize` file names off the front.
                StringImage_2 = StringofImage[:newbatchSize]
                del StringofImage[:newbatchSize]

                feed = {
                    batch_size_dynamic: newbatchSize,
                    imageString: StringImage_2
                }
                tstart = time.time()
                sess.run(iterator.initializer, feed_dict=feed)
                val = sess.run(outlist, feed_dict=feed)
                timings.append(time.time() - tstart)

                printLables = 0  # SET TO ONE FOR LABELS TO BE PRINTED
                if printLables == 1:
                    # Append mode creates the file when it does not exist.
                    # The original left the open() commented out, so this
                    # path failed with NameError when enabled.
                    top_indices = topX(val[0], f.topN)[1]
                    with open('resultLables.txt', 'a') as highscore:
                        for index1 in range(0, len(top_indices)):
                            highscore.write(
                                str(getLabels(labels, top_indices[index1])))
                            highscore.write("\n")

                # first convert the time to milisecond (*1000), then divide by
                # the number of processed image (newbatchsize)
                ThroughputPerSecond.append(
                    1000 / ((timings[-1] * 1000) / newbatchSize))

                # Give the power sampler a moment if it has no reading yet.
                if len(powerReading) == 0:
                    time.sleep(1)
                resultsFile.write(
                    str(time.time()) + "," + str(ThroughputPerSecond[-1]) +
                    "," + str(max(powerReading)) + "," + str(newbatchSize) +
                    "," + str(minBS) + "," + str(maxBS) + "," + str(BestBS) +
                    "," + str(powerCapW) + "," + str(DVFSLevel) + "\n")

                processed_image = processed_image + newbatchSize
                if processed_image == imageCounter:
                    break
        finally:
            resultsFile.close()
        sess.close()
    tf.logging.info("Timing loop done!")
    return timings, True, val[0], None
def main(_):
    """Train a softmax regression on MNIST with the NPU optimizer enabled.

    Runs 1000 SGD steps, traces the final step into
    ``/tmp/timeline.ctf.json`` and prints the accuracy on the test split.

    Args:
        _: unused positional argument.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir)

    # Linear model: logits = x . w + b
    x = tf.placeholder(tf.float32, [None, 784])
    w = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y = tf.matmul(x, w) + b

    # Integer class labels.
    y_ = tf.placeholder(tf.int64, [None])

    # A hand-written cross-entropy can be numerically unstable, so the loss is
    # computed from the raw logits with sparse_softmax_cross_entropy, which
    # also averages across the batch.
    cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_, logits=y)
    sgd = tf.train.GradientDescentOptimizer(0.5)
    train_step = sgd.minimize(cross_entropy)

    # Session config: register the NPU optimizer and enable offline training.
    config = tf.ConfigProto()
    custom_op = config.graph_options.rewrite_options.custom_optimizers.add()
    custom_op.name = "NpuOptimizer"
    custom_op.parameter_map["use_off_line"].b = True

    run_metadata = tf.RunMetadata()
    sess = tf.compat.v1.Session(config=config)
    tf.global_variables_initializer().run(session=sess)

    # Train
    train_loops = 1000
    final_step = train_loops - 1
    for step in range(train_loops):
        images, targets = mnist.train.next_batch(100)
        batch_feed = {x: images, y_: targets}

        if step != final_step:
            sess.run(train_step, feed_dict=batch_feed)
            continue

        # Last step: run with full tracing and export the timeline as json
        # to view with chrome://tracing/.
        trace_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        sess.run(train_step,
                 feed_dict=batch_feed,
                 options=trace_options,
                 run_metadata=run_metadata)
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('/tmp/timeline.ctf.json', 'w') as trace_file:
            trace_file.write(trace.generate_chrome_trace_format())

    # Test trained model
    is_correct = tf.equal(tf.argmax(y, 1), y_)
    accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
    test_feed = {x: mnist.test.images, y_: mnist.test.labels}
    print(sess.run(accuracy, feed_dict=test_feed))
    sess.close()
def train():
    """Train a two-layer MNIST classifier, logging summaries and exporting weights.

    Copies the four dataset files found under ``VH_INPUTS_DIR`` into the
    working directory, builds the network with TensorBoard summaries, trains
    for ``FLAGS.max_steps`` steps (evaluating on the test split every 10th
    step and printing the accuracy as JSON), and finally writes every layer's
    weights and biases as CSV files under ``VH_OUTPUTS_DIR``.
    """
    # Import input data
    inputs_root = os.getenv('VH_INPUTS_DIR', '/tmp/tensorflow/mnist/inputs')
    input_names = (
        'training-set-images',
        'training-set-labels',
        'test-set-images',
        'test-set-labels',
    )
    data_set_files = [
        get_first_file(os.path.join(inputs_root, name))
        for name in input_names
    ]

    train_dir = os.getcwd()
    for src_path in data_set_files:
        copy2(src_path, train_dir)
    mnist = input_data.read_data_sets(train_dir, fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()

    # Create a multilayer model.

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    # We can't initialize these variables to 0 - the network will get stuck.
    def make_weights(shape):
        """Return a weight variable drawn from a truncated normal."""
        return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

    def make_biases(shape):
        """Return a bias variable filled with 0.1."""
        return tf.Variable(tf.constant(0.1, shape=shape))

    def summarize(var):
        """Attach mean/stddev/max/min/histogram summaries to ``var``."""
        with tf.name_scope('summaries'):
            mean = tf.reduce_mean(var)
            tf.summary.scalar('mean', mean)
            with tf.name_scope('stddev'):
                squared_dev = tf.square(var - mean)
                stddev = tf.sqrt(tf.reduce_mean(squared_dev))
            tf.summary.scalar('stddev', stddev)
            tf.summary.scalar('max', tf.reduce_max(var))
            tf.summary.scalar('min', tf.reduce_min(var))
            tf.summary.histogram('histogram', var)

    # Variables of every layer, collected for the CSV export at the end.
    all_weights = []
    all_biases = []

    def dense_layer(input_tensor, input_dim, output_dim, layer_name,
                    act=tf.nn.relu):
        """Fully connected layer: matmul, bias add, then ``act``.

        Groups its ops under ``layer_name`` and adds summary ops for the
        weights, biases, pre-activations and activations.
        """
        # Adding a name scope ensures logical grouping of the layers in the graph.
        with tf.name_scope(layer_name):
            with tf.name_scope('weights'):
                layer_w = make_weights([input_dim, output_dim])
                summarize(layer_w)
                all_weights.append(layer_w)
            with tf.name_scope('biases'):
                layer_b = make_biases([output_dim])
                summarize(layer_b)
                all_biases.append(layer_b)
            with tf.name_scope('Wx_plus_b'):
                pre_act = tf.matmul(input_tensor, layer_w) + layer_b
                tf.summary.histogram('pre_activations', pre_act)
            layer_out = act(pre_act, name='activation')
            tf.summary.histogram('activations', layer_out)
            return layer_out

    hidden1 = dense_layer(x, 784, 500, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    y = dense_layer(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        with tf.name_scope('total'):
            cross_entropy = tf.losses.sparse_softmax_cross_entropy(labels=y_,
                                                                   logits=y)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        adam = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_step = adam.minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), y_)
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out to
    # /tmp/tensorflow/mnist/logs/mnist_with_summaries (by default)
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()

    def make_feed(use_train_split):
        """Return the feed_dict for a training batch or the full test split."""
        if use_train_split or FLAGS.fake_data:
            xs, ys = mnist.train.next_batch(100, fake_data=FLAGS.fake_data)
            keep = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            keep = 1.0
        return {x: xs, y_: ys, keep_prob: keep}

    # Every 10th step: measure test-set accuracy and write test summaries.
    # All other steps: run train_step on training data and add training
    # summaries; steps ending in 99 are additionally traced.
    for step in range(FLAGS.max_steps):
        if step % 10 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=make_feed(False))
            test_writer.add_summary(summary, step)
            print(json.dumps({'step': step, 'accuracy': acc.item()}))
            continue

        if step % 100 == 99:
            # Record execution stats
            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()
            summary, _ = sess.run([merged, train_step],
                                  feed_dict=make_feed(True),
                                  options=run_options,
                                  run_metadata=run_metadata)
            train_writer.add_run_metadata(run_metadata, 'step%03d' % step)
            train_writer.add_summary(summary, step)
            print('Adding run metadata for', step)
        else:
            # Record a summary
            summary, _ = sess.run([merged, train_step],
                                  feed_dict=make_feed(True))
            train_writer.add_summary(summary, step)

    # Final test-set accuracy after the last training step.
    _, acc = sess.run([merged, accuracy], feed_dict=make_feed(False))
    print(json.dumps({'step': FLAGS.max_steps, 'accuracy': acc.item()}))

    train_writer.close()
    test_writer.close()

    # Saving weights and biases as outputs of the task.
    outputs_dir = os.getenv('VH_OUTPUTS_DIR', '/tmp/tensorflow/mnist/outputs')

    def dump_csv(name_template, variables):
        """Evaluate each variable and write it to its own CSV file."""
        for layer_no, variable in enumerate(variables):
            target = os.path.join(outputs_dir, name_template.format(layer_no))
            np.savetxt(target, variable.eval(), delimiter=",")

    dump_csv('layer-{}-weights.csv', all_weights)
    dump_csv('layer-{}-biases.csv', all_biases)
def train():
    """Train (or fine-tune) the MobileNet-based classifier for 20 epochs.

    Batches are pulled from a prefetch queue and fed back through the ``x``
    and ``y`` placeholders.  Summaries are written whenever
    ``step % 100 == 99`` (with a full trace when ``step % 10000 == 9999``),
    and a checkpoint is saved after every epoch.  With ``args.fine_tune`` the
    run starts from the last checkpoint in ``neuguen_model`` and writes to
    ``neuguen_model_fine_tune``; otherwise it starts from the pretrained
    MobileNet weights and writes to ``neuguen_model``.
    """
    dataset, testset = data_provider.config_to_slim_dataset(
        config=TRAINING_CONFIG, dataset_dir="./")

    # training data
    prefetch_queue = data_provider.slim_dataset_to_prefetch_queue(
        dataset, BATCH_SIZE)
    face_batch, label_batch = prefetch_queue.dequeue()
    face_batch = tf.cast(face_batch, tf.float32)
    tf.summary.image("face", face_batch[0:16], max_outputs=16)

    x = tf.placeholder(tf.uint8, shape=(None, 224, 224, 3))
    y = tf.placeholder(tf.int64, shape=(None, 1))

    if args.fine_tune:
        logit, trainable, total_reg_losses, _ = model_build.build_mobilenet_v1_debug(
            x, mobilenet_training=True, neuguen_training=True)
        print("fine tune")
    else:
        logit, trainable, total_reg_losses, _ = model_build.build_mobilenet_v1_debug(
            x)

    tf.summary.scalar("regularization_loss", tf.reduce_sum(total_reg_losses))
    loss = model_build.build_loss(logit, y)
    tf.summary.scalar("cross_entropy_loss", loss)
    loss = loss + tf.reduce_sum(total_reg_losses)
    tf.summary.scalar("total_loss", loss)

    global_step = tf.train.create_global_step()
    train_op = model_build.build_train_op(loss, trainable, global_step)
    for var in tf.global_variables():
        tf.summary.histogram(var.op.name, var)

    correct_prediction = tf.equal(tf.squeeze(y), tf.argmax(logit, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("batch_accuracy", accuracy)
    confusion_matrix_op = tf.confusion_matrix(tf.squeeze(y),
                                              tf.argmax(logit, 1))

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    neuguen_saver = tf.train.Saver(max_to_keep=10)
    merge_summary = tf.summary.merge_all()

    save_path_fine_tune = "neuguen_model_fine_tune"
    if not os.path.exists(save_path_fine_tune):
        os.makedirs(save_path_fine_tune)
    save_path = "neuguen_model"
    if not os.path.exists(save_path):
        os.makedirs(save_path)
    # Summaries and checkpoints of this run both go to the same directory.
    output_dir = save_path_fine_tune if args.fine_tune else save_path

    # Ops evaluated on every training step; the summary op is prepended only
    # on the steps that log.
    train_fetches = [loss, accuracy, confusion_matrix_op, train_op]

    with tf.Session(config=session_config) as session:
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(coord=coord)
        summary_writer = None
        try:
            session.run(tf.global_variables_initializer())
            summary_writer = tf.summary.FileWriter(output_dir, session.graph)
            if args.fine_tune:
                model_build.restore_last_checkpoint(session, save_path)
            else:
                model_build.restore_pretrained_mobilenet(session)

            for j in xrange(20):
                confusion_matrix = np.array([[0., 0.], [0., 0.]])
                accuracy_avg = 0.0
                for i in xrange(
                        int(TRAINING_CONFIG["training_size"] / BATCH_SIZE)):
                    faces, labels, step = session.run(
                        [face_batch, label_batch, global_step])
                    feed = {x: faces, y: labels}

                    if step % 100 == 99:
                        # Logging step; additionally trace every 10000th.
                        trace = step % 10000 == 9999
                        run_kwargs = {}
                        if trace:
                            run_metadata = tf.RunMetadata()
                            run_kwargs = {
                                "options":
                                tf.RunOptions(
                                    trace_level=tf.RunOptions.FULL_TRACE),
                                "run_metadata":
                                run_metadata,
                            }
                        summary, loss_value, accuracy_value, confusion, _ = session.run(
                            [merge_summary] + train_fetches,
                            feed_dict=feed,
                            **run_kwargs)
                        summary_writer.add_summary(summary, step)
                        if trace:
                            summary_writer.add_run_metadata(
                                run_metadata, "step{0}".format(step))
                    else:
                        loss_value, accuracy_value, confusion, _ = session.run(
                            train_fetches, feed_dict=feed)

                    confusion_matrix = confusion_matrix + confusion
                    # Incremental mean of the batch accuracy over this epoch.
                    accuracy_avg = accuracy_avg + (accuracy_value -
                                                   accuracy_avg) / (i + 1)
                    sys.stdout.write(
                        "\r{0}--{1} training accuracy(ma):{2}    ".format(
                            j, i, accuracy_avg))
                    sys.stdout.flush()

                print("")
                print(confusion_matrix)
                neuguen_saver.save(session,
                                   os.path.join(output_dir, "neuguen.ckpt"),
                                   global_step=global_step)
        finally:
            # Always stop the queue-runner threads and flush the event file,
            # including when training is interrupted by an exception.
            print("thread.join")
            coord.request_stop()
            coord.join(threads)
            if summary_writer is not None:
                summary_writer.close()
def main():
    """Build the multi-GPU TSN (InceptionV2) training graph and train it.

    Steps performed:

    1. Parse the arguments and merge the optional config file into ``cfg``.
    2. Build one training tower per GPU listed in ``cfg.GPUS``, average the
       tower gradients and apply them with a momentum optimizer.
    3. Build a validation tower on GPU 0 that shares the training variables.
    4. Restore the pretrained InceptionV2 weights (everything except the
       ``Conv2d_1c_1x1`` layer and the optimizer's ``Momentum`` slots).
    5. Run ``cfg.TRAIN.MAX_ITE`` training steps; every 100 steps evaluate 849
       validation batches and save a checkpoint.
    """
    # ------------- parse arguments -------------#
    args = _parse_args()
    if args.cfg_file is not None:
        cfg_from_file(args.cfg_file)  # read the file and merge it into cfg
    pprint.pprint(cfg)

    # ------------- task related configuration -------------#
    os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
    # The variable name was previously misspelled ('CUDA_VISBLE_DEVICES'),
    # so the GPU selection in cfg.GPUS was never seen by CUDA.
    os.environ['CUDA_VISIBLE_DEVICES'] = cfg.GPUS
    tf.logging.set_verbosity(tf.logging.INFO)  # set the log level

    # ------------- build the computation graph -------------#
    with tf.device('/cpu:0'):  # bookkeeping ops are placed on the CPU
        global_step = tf.get_variable('global_step', [],
                                      dtype=None,
                                      initializer=tf.constant_initializer(0),
                                      trainable=False)
        lr = tf.train.exponential_decay(cfg.TRAIN.LEARNING_RATE_BASE,
                                        global_step,
                                        cfg.TRAIN.DECAY_STEP,
                                        cfg.TRAIN.DECAY_RATE,
                                        staircase=True)  # learning rate
        tf.summary.scalar('learnrate', lr)
        opt = tf.train.MomentumOptimizer(lr, cfg.TRAIN.MOMENTUM)  # optimizer
        num_gpus = len(cfg.GPUS.split(','))

        # Build the datasets and fetch their iterators.
        reader.set_param(
            cfg.INPUT.DATA_DIR,
            cfg.INPUT.MODALITY,  # flow is read slightly differently from rgb
            cfg.VALID.SPLIT_PATH,
            cfg.VALID.BATCH_SIZE,
            num_segments=cfg.INPUT.NUM_SEGMENTS,
            new_length=cfg.INPUT.NEW_LENGTH,
            train_split_path=cfg.INPUT.SPLIT_PATH,
            train_batch_size=cfg.TRAIN.BATCH_SIZE,
            isTraining=True)
        ite_train, ite_valid = reader.get_dataset_iter()
        tsn_batch, label_batch = ite_train.get_next()
        tsn_batch_splits = tf.split(tsn_batch,
                                    num_or_size_splits=num_gpus,
                                    axis=0)
        label_batch_splits = tf.split(label_batch,
                                      num_or_size_splits=num_gpus,
                                      axis=0)
        tsn_valid_batch, label_valid_batch = ite_valid.get_next()

        # Channel count of one input frame for the configured modality.
        if cfg.INPUT.MODALITY == 'rgb':
            num_channels = 3
        elif cfg.INPUT.MODALITY == 'flow':
            num_channels = 2
        else:
            raise ValueError("modality must be one of rgb or flow")

        # Floor division keeps the reshape dimensions integral even when
        # true division is in effect.
        clips_per_gpu = cfg.TRAIN.BATCH_SIZE // num_gpus
        frames_per_clip = cfg.INPUT.NUM_SEGMENTS * cfg.INPUT.NEW_LENGTH

        # Training towers, one per GPU (run in parallel).
        tower_grads = []
        # See https://github.com/tensorflow/tensorflow/issues/6220
        with tf.variable_scope(tf.get_variable_scope()):
            for i in range(num_gpus):
                with tf.device('/gpu:%d' % i), tf.name_scope('GPU_%d' % i):
                    # Per-tower data, flattened to
                    # (clips * frames_per_clip) x h x w x channels.
                    tsn_batch_split = tsn_batch_splits[i]
                    label_batch_split = label_batch_splits[i]
                    tsn_batch_split = tf.reshape(tsn_batch_split, [
                        clips_per_gpu * frames_per_clip, 224, 224,
                        num_channels
                    ])

                    # Build the network and run the forward pass.
                    with slim.arg_scope(inception_v2_arg_scope()):
                        logits, _ = inception_v2(
                            tsn_batch_split,
                            num_classes=cfg.NUM_CLASSES,
                            is_training=True,
                            dropout_keep_prob=cfg.TRAIN.DROPOUT_KEEP_PROB,
                            min_depth=16,
                            depth_multiplier=1.0,
                            prediction_fn=slim.softmax,
                            spatial_squeeze=True,
                            reuse=None,
                            scope='InceptionV2',
                            global_pool=False)
                    # Later towers (and the validation tower) share weights.
                    tf.get_variable_scope().reuse_variables()

                    # TSN consensus: average the logits of all frames
                    # sampled from the same clip.
                    logits = tf.reshape(
                        logits, [clips_per_gpu, frames_per_clip, -1])
                    logits = tf.reduce_mean(logits, 1)

                    # Accuracy on the current training batch.
                    prediction = tf.nn.softmax(logits)
                    acc_batch = tf.reduce_mean(
                        tf.cast(
                            tf.equal(tf.argmax(prediction, 1),
                                     tf.argmax(label_batch_split, 1)),
                            tf.float32))
                    tf.summary.scalar('acc_on_batch', acc_batch)

                    # Loss. The weight variables are added to the
                    # tf.GraphKeys.WEIGHTS collection first; this must happen
                    # before the regularized loss is built.
                    for variable in tf.global_variables():
                        if variable.name.find('weights') > 0:
                            tf.add_to_collection(tf.GraphKeys.WEIGHTS,
                                                 variable)
                    loss = tsn_loss(logits,
                                    label_batch_split,
                                    regularization=True)
                    tf.summary.scalar('loss', loss)

                    # Gradients as a list of (gradient, variable) pairs.
                    grads_and_vars = opt.compute_gradients(
                        loss, var_list=tf.trainable_variables())
                    tower_grads.append(grads_and_vars)

        grads_and_vars = average_gradients(tower_grads)  # mean over GPUs
        train_step = opt.apply_gradients(grads_and_vars,
                                         global_step=global_step)

        # Validation tower (single GPU).
        # See https://github.com/tensorflow/tensorflow/issues/6220
        with tf.variable_scope(tf.get_variable_scope()):
            with tf.device('/gpu:0'), tf.name_scope('VALID'):
                tf.get_variable_scope().reuse_variables()
                tsn_valid_batch = tf.reshape(
                    tsn_valid_batch,
                    [cfg.VALID.BATCH_SIZE * 25, 224, 224, num_channels])
                with slim.arg_scope(inception_v2_arg_scope()):
                    logits_valid, _ = inception_v2(
                        tsn_valid_batch,
                        num_classes=cfg.NUM_CLASSES,
                        is_training=False,
                        dropout_keep_prob=cfg.TRAIN.DROPOUT_KEEP_PROB,
                        min_depth=16,
                        depth_multiplier=1.0,
                        prediction_fn=slim.softmax,
                        spatial_squeeze=True,
                        reuse=None,
                        scope='InceptionV2',
                        global_pool=False)
                # Average over the 25 frames evaluated per validation clip.
                logits_valid = tf.reshape(logits_valid,
                                          [cfg.VALID.BATCH_SIZE, 25, -1])
                logits_valid = tf.reduce_mean(logits_valid, 1)

                # Accuracy on one validation batch.
                prediction_valid = tf.nn.softmax(logits_valid)
                acc_valid_batch = tf.reduce_mean(
                    tf.cast(
                        tf.equal(tf.argmax(prediction_valid, 1),
                                 tf.argmax(label_valid_batch, 1)),
                        tf.float32))

        merged = tf.summary.merge_all()

    # Saver: every InceptionV2 variable except the final 1x1 logits
    # convolution ('Conv2d_1c_1x1') and the optimizer's Momentum slots.
    model_variables_map = {}
    for variable in tf.global_variables():
        if (variable.name.split('/')[0] == 'InceptionV2'
                and variable.name.find('Conv2d_1c_1x1') == -1
                and variable.name.find('Momentum') == -1):
            model_variables_map[variable.name.replace(':0', '')] = variable
    print('####################################################')
    for variable_name in model_variables_map.keys():
        print(variable_name)
    print('#####################################################')
    saver_model = tf.train.Saver(var_list=model_variables_map,
                                 max_to_keep=20)

    # ------------- start the session -------------#
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.8)
    config = tf.ConfigProto(gpu_options=gpu_options,
                            allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        joint_writer = tf.summary.FileWriter(cfg.SUMMARY_DIR, sess.graph)
        summary_writer = tf.summary.FileWriter(cfg.SUMMARY_DIR, sess.graph)

        # Initialize all variables, then overwrite with pretrained weights.
        tf.global_variables_initializer().run()
        saver_model.restore(sess, cfg.TRAIN.PRETRAINED_MODEL_NAME)
        sess.graph.finalize()

        start_time = time.time()
        for i in range(cfg.TRAIN.MAX_ITE):
            _, learnrate, loss_value, step, summary = sess.run(
                [train_step, lr, loss, global_step, merged],
                options=run_options,
                run_metadata=run_metadata)
            if i == 0:
                # Restart the clock after the first (warm-up) step.
                start_time = time.time()
            if i % 10 == 0:
                if i >= 1:
                    end_time = time.time()
                    avg_time = (end_time - start_time) / float(i + 1)
                    print("Average time consumed per step is %0.2f secs." %
                          avg_time)
                print(
                    "After %d training step(s), learning rate is %g, loss on training batch is %g."
                    % (step, learnrate, loss_value))

            # Periodically validate and save the model.
            if i % 100 == 0:
                print('#############################################')
                print('valid and save model')
                accs = []
                num = 0
                for j in range(849):
                    num += 1
                    acc = sess.run(acc_valid_batch)
                    accs.append(acc)
                    print(num)
                acc_valid = np.mean(np.array(accs))
                print('accuracy on validation set is %0.4f' % acc_valid)
                print('saving model...')
                saver_model.save(sess,
                                 cfg.TRAIN.SAVED_MODEL_PATTERN,
                                 global_step=global_step)
                print('successfully saved !')
                print('#############################################')

            joint_writer.add_run_metadata(run_metadata, 'step%03d' % i)
            summary_writer.add_summary(summary, i)

        # Flush and release both event writers.
        joint_writer.close()
        summary_writer.close()
def train(model, x_train, y_train, x_validation, y_validation, epochs_list, name,
          batch_size=64, learning_rate=1e-3, lr_decay_ratio=0.1,
          data_augmentation=True):
    """Compile `model` and fit it with a stepwise-decayed learning rate.

    The model is trained with Nesterov SGD (momentum 0.9) on a categorical
    cross-entropy loss for ``epochs_list[-1]`` epochs. The learning rate
    starts at `learning_rate` and is multiplied by `lr_decay_ratio` once for
    every leading entry of `epochs_list` that the current epoch has reached.
    Checkpoints are written with the prefix `name`.

    When `data_augmentation` is true, training batches come from an
    ImageDataGenerator (horizontal flips and 12.5% shifts); otherwise the
    arrays are fed directly. The module-level flags `profiling` and
    `mem_stat` enable a Chrome timeline dump and memory statistics.
    """
    optimizer = keras.optimizers.SGD(lr=learning_rate, momentum=0.9, nesterov=True)

    # Arguments shared by both compile variants; profiling adds the
    # session-run tracing options on top.
    compile_kwargs = {
        'loss': 'categorical_crossentropy',
        'optimizer': optimizer,
        'metrics': ['accuracy'],
    }
    run_options = None
    run_metadata = None
    if profiling:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
        compile_kwargs['options'] = run_options
        compile_kwargs['run_metadata'] = run_metadata
    model.compile(**compile_kwargs)

    checkpoint_pattern = name + 'model-ep{epoch:04d}-loss{loss:.3f}-acc{acc:.3f}-val_loss{val_loss:.3f}-val_acc{val_acc:.3f}.h5'
    checkpoint = ModelCheckpoint(checkpoint_pattern, verbose=1,
                                 save_best_only=True, period=10)

    def schedule(epoch):
        # Apply one decay per boundary already reached; boundaries are
        # scanned in order and the scan stops at the first one not reached.
        rate = learning_rate
        for boundary in epochs_list:
            if epoch < boundary:
                break
            rate *= lr_decay_ratio
        return rate

    lr_scheduler = LearningRateScheduler(schedule, verbose=1)
    callbacks = [lr_scheduler, checkpoint]
    total_epochs = epochs_list[-1]
    validation = (x_validation, y_validation)

    if data_augmentation:
        print('Using real-time data augmentation.')
        # Preprocessing and real-time augmentation of the training images.
        datagen = ImageDataGenerator(
            horizontal_flip=True,
            width_shift_range=0.125,
            height_shift_range=0.125,
            fill_mode='constant')
        datagen.fit(x_train)

        # Train on the batches produced by datagen.flow().
        batches = datagen.flow(x_train, y_train, batch_size=batch_size)
        model.fit_generator(batches,
                            steps_per_epoch=x_train.shape[0] // batch_size,
                            epochs=total_epochs,
                            callbacks=callbacks,
                            validation_data=validation)
    else:
        print('Not using data augmentation.')
        model.fit(x_train, y_train,
                  batch_size=batch_size,
                  epochs=total_epochs,
                  validation_data=validation,
                  callbacks=callbacks,
                  shuffle=True)

    if mem_stat:
        sess = K.get_session()
        # Peak usage first, then current usage.
        print(sess.run(tf.contrib.memory_stats.MaxBytesInUse()))
        print(sess.run(tf.contrib.memory_stats.BytesInUse()))

    if profiling:
        trace = timeline.Timeline(step_stats=run_metadata.step_stats)
        with open('timeline.densenet.json', 'w') as f:
            f.write(trace.generate_chrome_trace_format(show_memory=True))
def main():
    """Train the WaveNet model described by the command-line arguments.

    Validates the directories, loads the network hyper-parameters from
    ``args.wavenet_params``, builds the reader, network, loss and optimizer,
    optionally restores a checkpoint from ``restore_from`` and then trains
    until ``args.num_steps``. A checkpoint is written every
    ``args.checkpoint_every`` steps.

    Whatever ends the loop (completion, Ctrl-C or an error), the last
    unsaved step is checkpointed, the summary writer is closed and the
    reader threads are stopped.
    """
    args = get_arguments()

    try:
        directories = validate_directories(args)
    except ValueError as e:
        print("Some arguments are wrong:")
        print(str(e))
        return

    logdir = directories['logdir']
    restore_from = directories['restore_from']

    # Even if we restored the model, we will treat it as new training
    # if the trained model is written into an arbitrary location.
    is_overwritten_training = logdir != restore_from

    with open(args.wavenet_params, 'r') as f:
        wavenet_params = json.load(f)

    # Create coordinator.
    coord = tf.train.Coordinator()

    # Load the input features through the reader queue.
    with tf.name_scope('create_inputs'):
        # Allow silence trimming to be skipped by specifying a threshold near
        # zero.
        silence_threshold = args.silence_threshold if args.silence_threshold > \
            EPSILON else None
        gc_enabled = args.gc_channels is not None
        reader = AudioReader(
            audio_dir=args.data_dir,
            coord=coord,
            sample_rate=wavenet_params["sample_rate"],
            gc_enabled=gc_enabled,
            receptive_field=WaveNetModel.calculate_receptive_field(
                wavenet_params["filter_width"],
                wavenet_params["dilations"],
                wavenet_params["scalar_input"],
                wavenet_params["initial_filter_width"]),
            sample_size=args.sample_size,
            mfsc_dim=wavenet_params["MFSC_channels"],
            ap_dim=wavenet_params["AP_channels"],
            F0_dim=wavenet_params["F0_channels"],
            phone_dim=wavenet_params["phones_channels"],
            phone_pos_dim=wavenet_params["phone_pos_channels"],
            silence_threshold=silence_threshold)
        ap_mfsc_batch, lc_batch = reader.dequeue(args.batch_size)
        if gc_enabled:
            gc_id_batch = reader.dequeue_gc(args.batch_size)
        else:
            gc_id_batch = None

    # Create network.
    net = WaveNetModel(
        batch_size=args.batch_size,
        dilations=wavenet_params["dilations"],
        filter_width=wavenet_params["filter_width"],
        residual_channels=wavenet_params["residual_channels"],
        dilation_channels=wavenet_params["dilation_channels"],
        skip_channels=wavenet_params["skip_channels"],
        use_biases=wavenet_params["use_biases"],
        scalar_input=wavenet_params["scalar_input"],
        initial_filter_width=wavenet_params["initial_filter_width"],
        histograms=args.histograms,
        global_condition_channels=args.gc_channels,
        global_condition_cardinality=reader.gc_category_cardinality,
        MFSC_channels=wavenet_params["MFSC_channels"],
        AP_channels=wavenet_params["AP_channels"],
        F0_channels=wavenet_params["F0_channels"],
        phone_channels=wavenet_params["phones_channels"],
        phone_pos_channels=wavenet_params["phone_pos_channels"])

    # A strength of exactly 0 disables L2 regularization.
    if args.l2_regularization_strength == 0:
        args.l2_regularization_strength = None

    loss = net.loss(
        input_batch=ap_mfsc_batch,
        lc_batch=lc_batch,
        global_condition_batch=gc_id_batch,
        l2_regularization_strength=args.l2_regularization_strength)
    optimizer = optimizer_factory[args.optimizer](
        learning_rate=args.learning_rate,
        momentum=args.momentum)
    trainable = tf.trainable_variables()
    optim = optimizer.minimize(loss, var_list=trainable)

    # Set up logging for TensorBoard.
    writer = tf.summary.FileWriter(logdir)
    writer.add_graph(tf.get_default_graph())
    run_metadata = tf.RunMetadata()
    summaries = tf.summary.merge_all()

    # Set up session
    sess = tf.Session(config=tf.ConfigProto(log_device_placement=False))
    init = tf.global_variables_initializer()
    sess.run(init)

    # Saver for storing checkpoints of the model.
    saver = tf.train.Saver(var_list=tf.trainable_variables(),
                           max_to_keep=args.max_checkpoints)

    try:
        saved_global_step = load(saver, sess, restore_from)
        if is_overwritten_training or saved_global_step is None:
            # The first training step will be saved_global_step + 1,
            # therefore we put -1 here for new or overwritten trainings.
            saved_global_step = -1
    except:  # noqa: E722 -- the error is reported and re-raised unchanged
        print("Something went wrong while restoring checkpoint. "
              "We will terminate training to avoid accidentally overwriting "
              "the previous model.")
        raise

    threads = tf.train.start_queue_runners(sess=sess, coord=coord)
    reader.start_threads(sess)

    # `step` stays None when the loop body never runs (e.g. the restored
    # step is already >= args.num_steps).
    step = None
    last_saved_step = saved_global_step
    try:
        for step in range(saved_global_step + 1, args.num_steps):
            start_time = time.time()
            if args.store_metadata and step % 50 == 0:
                # Slow run that stores extra information for debugging.
                print('Storing metadata')
                run_options = tf.RunOptions(
                    trace_level=tf.RunOptions.FULL_TRACE)
                summary, loss_value, _ = sess.run(
                    [summaries, loss, optim],
                    options=run_options,
                    run_metadata=run_metadata)
                writer.add_summary(summary, step)
                writer.add_run_metadata(run_metadata,
                                        'step_{:04d}'.format(step))
                tl = timeline.Timeline(run_metadata.step_stats)
                timeline_path = os.path.join(logdir, 'timeline.trace')
                with open(timeline_path, 'w') as f:
                    f.write(tl.generate_chrome_trace_format(show_memory=True))
            else:
                summary, loss_value, _ = sess.run([summaries, loss, optim])
                writer.add_summary(summary, step)

            duration = time.time() - start_time
            if step % 10 == 0:
                print('step {:d} - loss = {:.3f}, ({:.3f} sec/step)'.format(
                    step, loss_value, duration))

            if step % args.checkpoint_every == 0:
                save(saver, sess, logdir, step)
                last_saved_step = step

    except KeyboardInterrupt:
        # Introduce a line break after ^C is displayed so save message
        # is on its own line.
        print()
    finally:
        # Always persist the latest progress and shut the input pipeline
        # down. The None check avoids comparing None with an int when no
        # step was executed.
        if step is not None and step > last_saved_step:
            save(saver, sess, logdir, step)
        writer.close()
        coord.request_stop()
        coord.join(threads)
def main():
    """Entry point: export, test or train the image-to-image model.

    Behaviour is driven by the module-level argument namespace ``a``:

    * Seeds TensorFlow, NumPy and ``random`` (drawing a seed if none given)
      and makes sure ``a.output_dir`` exists.
    * In "test"/"export" mode a checkpoint is required; ``which_direction``,
      ``ngf`` and ``ndf`` are reloaded from its ``options.json`` and
      scaling/flipping are disabled.
    * "export" builds a standalone generator graph (base64 PNG in, base64
      PNG/JPEG out), restores the checkpoint and writes ``export.meta`` plus
      the weights, then returns.
    * Otherwise the full model is built with summaries; "test" runs at most
      one pass over the examples and writes the images and index, any other
      mode runs the training loop with periodic trace/summary/display/
      progress/save actions.
    """
    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode in ("test", "export"):
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # Only these options are taken over from the checkpoint.
        restorable = {"which_direction", "ngf", "ndf"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            stored_options = json.loads(f.read())
        for key, val in stored_options.items():
            if key in restorable:
                print("loaded", key, "=", val)
                setattr(a, key, val)
        # Scaling and flipping are training-time augmentations only.
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # Export the generator to a meta graph that can be imported later
        # for standalone generation.
        encoded_input = tf.placeholder(tf.string, shape=[1])
        png_bytes = tf.decode_base64(encoded_input[0])
        decoded = tf.image.decode_png(png_bytes)
        # Drop the alpha channel if there is one.
        decoded = decoded[:, :, :3]
        decoded = tf.image.convert_image_dtype(decoded, dtype=tf.float32)
        decoded.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(decoded, axis=0)

        with tf.variable_scope("generator"):
            batch_output = deprocess(
                create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output,
                                                    dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        encoded_output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        # Tensor names are stored as JSON in graph collections.
        key = tf.placeholder(tf.string, shape=[1])
        input_names = {"key": key.name, "input": encoded_input.name}
        tf.add_to_collection("inputs", json.dumps(input_names))
        output_names = {
            "key": tf.identity(key).name,
            "output": encoded_output.name,
        }
        tf.add_to_collection("outputs", json.dumps(output_names))

        init_op = tf.global_variables_initializer()
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()

        export_config = tf.ConfigProto()
        export_config.gpu_options.allow_growth = True
        with tf.Session(config=export_config) as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(
                filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess,
                              os.path.join(a.output_dir, "export"),
                              write_meta_graph=False)
        return

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    net1 = vgg16.Vgg16()
    net2 = vgg16.Vgg16()
    model = create_model(examples.inputs, examples.targets, net1, net2)

    # Undo the preprocessing on everything that is displayed or written out.
    inputs = deprocess(examples.inputs)
    targets = deprocess(examples.targets)
    outputs = deprocess(model.outputs)

    def convert(image):
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            target_size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(
                image, size=target_size,
                method=tf.image.ResizeMethod.BICUBIC)
        return tf.image.convert_image_dtype(image,
                                            dtype=tf.uint8,
                                            saturate=True)

    # Convert back to uint8 images that can be saved or shown.
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)
    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)
    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        input_pngs = tf.map_fn(tf.image.encode_png, converted_inputs,
                               dtype=tf.string, name="input_pngs")
        target_pngs = tf.map_fn(tf.image.encode_png, converted_targets,
                                dtype=tf.string, name="target_pngs")
        output_pngs = tf.map_fn(tf.image.encode_png, converted_outputs,
                                dtype=tf.string, name="output_pngs")
        display_fetches = {
            "paths": examples.paths,
            "inputs": input_pngs,
            "targets": target_pngs,
            "outputs": output_pngs,
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)
    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)
    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)
    with tf.name_scope("predict_real_summary"):
        tf.summary.image(
            "predict_real",
            tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))
    with tf.name_scope("predict_fake_summary"):
        tf.summary.image(
            "predict_fake",
            tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)
    tf.summary.scalar("generator_loss_tv", model.gen_loss_tv)
    tf.summary.scalar("generator_loss_f", model.gen_loss_f)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)
    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum(
            [tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    saver = tf.train.Saver(max_to_keep=1)

    # The supervisor only gets a log directory when something is logged.
    if a.trace_freq > 0 or a.summary_freq > 0:
        logdir = a.output_dir
    else:
        logdir = None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)

    session_config = tf.ConfigProto()
    session_config.gpu_options.allow_growth = True
    with sv.managed_session(config=session_config) as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            saver.restore(sess, checkpoint)

        # An explicit step limit takes precedence over an epoch limit.
        max_steps = 2**32
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        if a.max_steps is not None:
            max_steps = a.max_steps

        if a.mode == "test":
            # testing
            # at most, process the test data once
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)

            print("wrote index at", index_path)
        else:
            # training
            start = time.time()

            def should(freq):
                # True every `freq` steps and on the very last step; reads
                # the loop variable `step` at call time.
                return freq > 0 and ((step + 1) % freq == 0
                                     or step == max_steps - 1)

            for step in range(max_steps):
                do_trace = should(a.trace_freq)
                do_progress = should(a.progress_freq)
                do_summary = should(a.summary_freq)
                do_display = should(a.display_freq)

                options = None
                run_metadata = None
                if do_trace:
                    options = tf.RunOptions(
                        trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }
                if do_progress:
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1
                    fetches["gen_loss_tv"] = model.gen_loss_tv
                    fetches["gen_loss_f"] = model.gen_loss_f
                if do_summary:
                    fetches["summary"] = sv.summary_op
                if do_display:
                    fetches["display"] = display_fetches

                results = sess.run(fetches,
                                   options=options,
                                   run_metadata=run_metadata)
                current_step = results["global_step"]

                if do_summary:
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"],
                                                  current_step)

                if do_display:
                    print("saving display images")
                    filesets = save_images(results["display"],
                                           step=current_step)
                    append_index(filesets, step=True)

                if do_trace:
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(
                        run_metadata, "step_%d" % current_step)

                if do_progress:
                    # global_step will have the correct step count if we
                    # resume from a checkpoint
                    train_epoch = math.ceil(current_step /
                                            examples.steps_per_epoch)
                    train_step = (current_step -
                                  1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print(
                        "progress  epoch %d  step %d  image/sec %0.1f  remaining %dm"
                        % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])
                    print("gen_loss_tv", results["gen_loss_tv"])
                    print("gen_loss_f", results["gen_loss_f"])

                if should(a.save_freq):
                    print("saving model")
                    saver.save(sess,
                               os.path.join(a.output_dir, "model"),
                               global_step=sv.global_step)

                if sv.should_stop():
                    break
def main(_): tic = time.time() tf.logging.set_verbosity(tf.logging.INFO) if not FLAGS.dataset_dir: raise ValueError('You must supply the dataset directory with --dataset_dir') # init net_name_scope_pruned = FLAGS.net_name_scope_pruned net_name_scope_checkpoint = FLAGS.net_name_scope_checkpoint block_names = valid_block_names kept_percentages_dict = get_kept_percentages_dict_from_path(FLAGS.checkpoint_path) kept_percentages = sorted(map(float, FLAGS.kept_percentages.split(','))) # check networks with the kps are pre-trained. for kp in kept_percentages: if kp not in kept_percentages_dict: raise Error('kept_percentage='+str(kp)+' not in folder:'+ FLAGS.checkpoint_path) num_options = len(kept_percentages) num_units = len(block_names) print('num_options=%d, num_blocks=%d' %(num_options, num_units)) print('HG: total number of configurations=%d' %(num_options**num_units)) if FLAGS.configuration_type =='sample': configs = get_sampled_configurations(num_units, num_options, FLAGS.total_num_configurations) elif FLAGS.configuration_type == 'special': configs = get_special_configurations(num_units, num_options) num_configurations = len(configs) #Getting MPI rank integer # comm = MPI.COMM_WORLD # rank = comm.Get_rank() # if rank >= num_configurations: # print("ERROR: rank(%d) > num_configurations(%d)" %(rank, num_configurations)) # return rank = 0 FLAGS.configuration_index = FLAGS.start_configuration_index + rank config = configs[FLAGS.configuration_index] print('HG: kept_percentages=%s, num_configs=%d, start_config_index=%d, rank=%d, config_index=%d' \ %(str(kept_percentages), num_configurations, FLAGS.start_configuration_index, rank, FLAGS.configuration_index)) # prepare for training with the specific config kept_percentage = config_to_kept_percentage_sequence(config, block_names, kept_percentages) prune_info = kept_percentage_sequence_to_prune_info(kept_percentage, block_names) print('HG: prune_info:') pprint(prune_info) # prepare file system results_dir = 
os.path.join(FLAGS.train_dir, "id"+str(FLAGS.configuration_index)) #+'_'+str(FLAGS.max_number_of_steps)) train_dir = os.path.join(results_dir, 'train') if (not FLAGS.continue_training) or (not tf.train.latest_checkpoint(train_dir)): print('Start a new training') prepare_file_system(train_dir) else: print('Continue training') def write_detailed_info(info): with open(os.path.join(train_dir, 'train_details.txt'), 'a') as f: f.write(info+'\n') info = 'train_dir: '+ train_dir+'\n' info += 'options:'+str(kept_percentages)+'\n' info += 'configuration: '+ str(config)+'\n' info += 'kept_percentage: ' + str(kept_percentage) print(info) write_detailed_info(info) with tf.Graph().as_default(): deploy_config = model_deploy.DeploymentConfig( num_clones=FLAGS.num_clones, clone_on_cpu=FLAGS.clone_on_cpu, replica_id=FLAGS.task, num_replicas=FLAGS.worker_replicas, num_ps_tasks=FLAGS.num_ps_tasks) ###################### # Select the dataset # ###################### dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.train_dataset_name, FLAGS.dataset_dir) test_dataset = dataset_factory.get_dataset( FLAGS.dataset_name, FLAGS.test_dataset_name, FLAGS.dataset_dir) batch_queue = train_inputs(dataset, deploy_config, FLAGS) test_images, test_labels = test_inputs(test_dataset, deploy_config, FLAGS) images, labels = batch_queue.dequeue() ###################### # Select the network# ###################### network_fn_pruned = nets_factory.get_network_fn_pruned( FLAGS.model_name, num_classes=(dataset.num_classes - FLAGS.labels_offset), weight_decay=FLAGS.weight_decay) #################### # Define the model # #################### logits_train,_ = network_fn_pruned(images, prune_info = prune_info, is_training=True, is_local_train=False, reuse_variables=False, scope = net_name_scope_pruned) logits_eval, _ = network_fn_pruned(test_images, prune_info = prune_info, is_training=False, is_local_train=False, reuse_variables=True, scope = net_name_scope_pruned) cross_entropy = 
add_cross_entropy(logits_train, labels) correct_prediction = add_correct_prediction(logits_eval, test_labels) ############################# # Specify the loss functions # ############################# collection_name = 'subgraph_losses' tf.add_to_collection(collection_name, cross_entropy) # get regularization loss regularization_losses = get_regularization_losses_within_scopes() print_list('regularization_losses', regularization_losses) # total loss and its summary total_loss = tf.add_n(tf.get_collection(collection_name), name='total_loss') for l in tf.get_collection(collection_name)+[total_loss]: tf.summary.scalar(l.op.name+'/summary', l) ######################################### # Configure the optimization procedure. # ######################################### with tf.device(deploy_config.variables_device()): global_step = tf.Variable(0, trainable=False, name='global_step') with tf.device(deploy_config.optimizer_device()): learning_rate = configure_learning_rate(dataset.num_samples, global_step, FLAGS) optimizer = configure_optimizer(learning_rate, FLAGS) tf.summary.scalar('learning_rate', learning_rate) ############################# # Add train operation # ############################# variables_to_train = get_trainable_variables_within_scopes() train_op = add_train_op(optimizer, total_loss, global_step, var_list=variables_to_train) print_list("variables_to_train", variables_to_train) # Gather update_ops: the updates for the batch_norm variables created by network_fn_pruned. 
update_ops = get_update_ops_within_scopes() print_list("update_ops", update_ops) update_ops.append(train_op) update_op = tf.group(*update_ops) with tf.control_dependencies([update_op]): train_tensor = tf.identity(total_loss, name='train_op') # add summary op summary_op = tf.summary.merge_all() print("HG: trainable_variables=", len(tf.trainable_variables())) print("HG: model_variables=", len(tf.model_variables())) print("HG: global_variables=", len(tf.global_variables())) # print_list('model_variables but not trainable variables', list(set(tf.model_variables()).difference(tf.trainable_variables()))) # print_list('global_variables but not model variables', list(set(tf.global_variables()).difference(tf.model_variables()))) # get train scopes for each kept_percentage block_names_dict = {} for block_name, block_kept_percentage in zip(block_names, kept_percentage): if block_kept_percentage not in block_names_dict: block_names_dict[block_kept_percentage] = [] block_names_dict[block_kept_percentage].append(block_name) #print_list("train_scopes", train_scopes) print('HG: block_names_dict:') pprint(block_names_dict) sess_config = tf.ConfigProto(intra_op_parallelism_threads=16, inter_op_parallelism_threads=16) with tf.Session(config=sess_config) as sess: ########################### # prepare for filewritter # ########################### train_writer = tf.summary.FileWriter(train_dir, sess.graph) # if restart the training or there is no checkpoint in the train_dir if (not FLAGS.continue_training) or (not tf.train.latest_checkpoint(train_dir)): ################################################# # Restore pruned model variable values. 
# ################################################# all_variables_to_train = [] for block_kept_percentage, block_name in block_names_dict.items(): print('HG: kept_percentage', block_kept_percentage) checkpoint_path = os.path.join( FLAGS.checkpoint_path, kept_percentages_dict[block_kept_percentage][0], 'train') # 'model.ckpt-'+str(FLAGS.local_train_steps)) variables_to_train = {re.sub(net_name_scope_pruned, net_name_scope_pruned+"_p"+str(block_kept_percentage), v.op.name): v for v in get_model_variables_with_block_names(net_name_scope_pruned, block_name)} print_list("restore pruned model variables", variables_to_train.values()) load_checkpoint(sess, checkpoint_path, var_list=variables_to_train) all_variables_to_train.extend(variables_to_train.values()) ################################################# # Restore orignal model variable values. # ################################################# variables_to_restore = {re.sub(net_name_scope_pruned, net_name_scope_checkpoint, v.op.name): v for v in get_model_variables_within_scopes() if v not in set(all_variables_to_train)} print_list("restore original model variables", variables_to_restore.values()) load_checkpoint(sess, checkpoint_path, var_list=variables_to_restore) else: ########################################### ## Restore all variables from checkpoint ## ########################################### variables_to_restore = get_global_variables_within_scopes() load_checkpoint(sess, train_dir, var_list = variables_to_restore) ################################################# # init unitialized global variable. 
# ################################################# variables_to_init = get_global_variables_within_scopes(sess.run( tf.report_uninitialized_variables() )) print_list("init unitialized variables", variables_to_init) sess.run( tf.variables_initializer(variables_to_init) ) init_global_step_value = sess.run(global_step) print('initial global step: ', init_global_step_value) if init_global_step_value >= FLAGS.max_number_of_steps: print('Exit: init_global_step_value (%d) >= FLAGS.max_number_of_steps (%d)' \ %(init_global_step_value, FLAGS.max_number_of_steps)) return ########################### # Record CPU usage # ########################### mpstat_output_filename = os.path.join(train_dir, "cpu-usage.log") os.system("mpstat -P ALL 1 > " + mpstat_output_filename + " 2>&1 &") ########################### # Kicks off the training. # ########################### coord = tf.train.Coordinator() threads = tf.train.start_queue_runners(sess=sess, coord=coord) saver = tf.train.Saver(max_to_keep=FLAGS.max_to_keep) print('HG: # of threads=', len(threads)) duration = 0 duration_cnt = 0 train_time = 0 train_only_cnt = 0 print("start to train at:", datetime.now()) for i in range(init_global_step_value, FLAGS.max_number_of_steps+1): #train_step = i+FLAGS.local_train_steps train_step = i # run optional meta data, or summary, while run train tensor if i > init_global_step_value: #if i < FLAGS.max_number_of_steps: # run metadata and train if i % FLAGS.runmeta_every_n_steps == FLAGS.runmeta_every_n_steps-1: run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE) run_metadata = tf.RunMetadata() loss_value = sess.run(train_tensor, options = run_options, run_metadata=run_metadata) train_writer.add_run_metadata(run_metadata, 'step%d-train' % i) # Create the Timeline object, and write it to a json file fetched_timeline = timeline.Timeline(run_metadata.step_stats) chrome_trace = fetched_timeline.generate_chrome_trace_format() with open(os.path.join(train_dir, 
'timeline_'+str(i)+'.json'), 'w') as f: f.write(chrome_trace) # record summary and train elif i % FLAGS.summary_every_n_steps==0: train_summary, loss_value = sess.run([summary_op, train_tensor]) train_writer.add_summary(train_summary, train_step) # train only else: start_time = time.time() loss_value = sess.run(train_tensor) train_only_cnt+=1 train_time += time.time() - start_time duration_cnt +=1 duration += time.time()- start_time if i%FLAGS.log_every_n_steps==0 and duration_cnt > 0: log_frequency = duration_cnt examples_per_sec = log_frequency * FLAGS.batch_size / duration sec_per_batch = float(duration /log_frequency) summary = tf.Summary() summary.value.add(tag='examples_per_sec', simple_value=examples_per_sec) summary.value.add(tag='sec_per_batch', simple_value=sec_per_batch) train_writer.add_summary(summary, train_step) format_str = ('%s: step %d, loss = %.3f (%.1f examples/sec; %.3f sec/batch)') print(format_str % (datetime.now(), i, loss_value, examples_per_sec, sec_per_batch)) duration = 0 duration_cnt = 0 info= format_str % (datetime.now(), i, loss_value, examples_per_sec, sec_per_batch) write_detailed_info(info) else: # run only total loss when i=0 train_summary, loss_value = sess.run([summary_op, total_loss]) #loss_value = sess.run(total_loss) train_writer.add_summary(train_summary, train_step) format_str = ('%s: step %d, loss = %.3f') print(format_str % (datetime.now(), i, loss_value)) info= format_str % (datetime.now(), i, loss_value) write_detailed_info(info) # record the evaluation accuracy is_last_step = (i==FLAGS.max_number_of_steps) if i%FLAGS.evaluate_every_n_steps==0 or is_last_step: test_accuracy, run_metadata = evaluate_accuracy(sess, coord, test_dataset.num_samples, test_images, test_labels, test_images, test_labels, correct_prediction, FLAGS.test_batch_size, run_meta=False) summary = tf.Summary() summary.value.add(tag='accuracy', simple_value=test_accuracy) train_writer.add_summary(summary,train_step) info = ('%s: step %d, test_accuracy = 
%.6f') % (datetime.now(), train_step, test_accuracy) print(info) write_detailed_info(info) ########################### # Save model parameters . # ########################### save_path = saver.save(sess, os.path.join(train_dir, 'model.ckpt-'+str(i))) print("HG: Model saved in file: %s" % save_path) coord.request_stop() coord.join(threads) total_time = time.time()-tic train_speed = train_time /train_only_cnt train_time = (FLAGS.max_number_of_steps)*train_speed info = "HG: training speed(sec/batch): %.6f\n" %(train_speed) info += "HG: training time(min): %.1f, total time(min): %.1f \n" %( train_time/60.0, total_time/60.0) print(info) write_detailed_info(info)
def _train_loop(self, model, data=None, sess=None, indxs=None):
    """Run the epoch/mini-batch training loop used by `train`.

    Each epoch shuffles ``indxs['train']``, runs ``model.train_step`` once per
    full mini-batch, and then optionally prints updates, saves a checkpoint,
    saves summaries and checks the early-stopping criterion, depending on the
    ``epochs_display`` / ``epochs_ckpt`` / ``epochs_summary`` / ``early_stop``
    attributes of ``self``.

    Args:
        model: object exposing ``np_seed``, ``train_step`` and
            ``checkpoint_model``; its ``checkpoint`` attribute is updated
            with the path of the most recent checkpoint.
        data: forwarded unchanged to the feed-dict/summary helpers.
        sess: session-like object whose ``run`` executes ``train_step``.
        indxs: mapping with a ``'train'`` array of sample indices.
    """

    if self.run_diagnostics:
        run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        run_metadata = tf.RunMetadata()
    else:
        run_options = None
        run_metadata = None

    if self.early_stop > 0:
        self.early_stop_params = {
            'prev_costs': np.multiply(np.ones(self.early_stop), np.nan),
            'best_epoch': 0,
            'best_cost': np.inf,
            'chkpted': False,
            'stop_training': False}

    # Only full batches are used; a trailing partial batch is dropped.
    num_batches = indxs['train'].shape[0] // self.batch_size

    np.random.seed(model.np_seed)

    # start training loop
    self.epoch = np.nan
    for epoch in range(self.epochs_training):

        self.epoch = epoch

        # shuffle data before each pass
        train_indxs_perm = np.random.permutation(indxs['train'])

        # pass through dataset once
        start = time.time()
        for batch in range(num_batches):
            # get training indices for this batch
            batch_indxs = train_indxs_perm[
                batch * self.batch_size:(batch + 1) * self.batch_size]

            # one step of optimization routine
            feed_dict = self._get_feed_dict(
                model=model, data=data, batch_indxs=batch_indxs)
            sess.run(model.train_step, feed_dict=feed_dict)
        epoch_time = time.time() - start

        # print training updates
        if self.epochs_display is not None and (
                epoch % self.epochs_display == self.epochs_display - 1
                or epoch == 0):
            self._train_print_updates(sess, model, data, indxs, epoch_time)

        # save model checkpoints
        if self.epochs_ckpt is not None and (
                epoch % self.epochs_ckpt == self.epochs_ckpt - 1
                or epoch == 0):
            checkpoint_file = os.path.join(
                self.checkpoints_dir, str('epoch_%05g.ckpt' % epoch))
            model.checkpoint_model(
                sess, checkpoint_file=checkpoint_file, print_filepath=True)
            # store most recent checkpoint as model attribute
            model.checkpoint = checkpoint_file

        # save model summaries
        if self.epochs_summary is not None and (
                epoch % self.epochs_summary == self.epochs_summary - 1
                or epoch == 0):
            self._train_save_summaries(
                sess, model, data, indxs, run_options, run_metadata)

        # perform early stopping
        if self.early_stop > 0:
            self._train_early_stop(sess, model, data, indxs)
            if self.early_stop_params['stop_training']:
                break

    # perform final checkpoint if not early stopping (handles case on own).
    # Compare by value: `is np.inf` only matches the very same float object,
    # so an infinity spelled float('inf') or math.inf would skip this step.
    if self.epochs_ckpt == np.inf and self.early_stop == 0:
        checkpoint_file = os.path.join(
            self.checkpoints_dir, str('epoch_%05g.ckpt' % self.epoch))
        model.checkpoint_model(
            sess, checkpoint_file=checkpoint_file, print_filepath=True)
        # store most recent checkpoint as model attribute
        model.checkpoint = checkpoint_file
def train(_):
    """Train a small CNN on augmented MNIST and write TensorBoard summaries.

    Recreates ``FLAGS.log_dir``, builds the graph in the default graph, and
    runs ``FLAGS.max_steps + 1`` steps. Every 100th step evaluates on the
    MNIST test set; all other steps run one Adam update on an augmented batch
    of 500 images.

    Args:
        _: unused positional argument.
    """
    # create new log files
    if tf.gfile.Exists(FLAGS.log_dir):
        tf.gfile.DeleteRecursively(FLAGS.log_dir)
    tf.gfile.MakeDirs(FLAGS.log_dir)
    tf.reset_default_graph()
    tf.set_random_seed(2)
    np.random.seed(2)

    # Import data
    mnist = input_data.read_data_sets("MNIST-data/", one_hot=True)
    X_train = mnist.train.images.reshape(mnist.train.images.shape[0], 28, 28, 1)
    y_train = mnist.train.labels.astype(np.int64)
    batch_size = 500
    gen = ImageDataGenerator(rotation_range=6, width_shift_range=0.06,
                             shear_range=0.27, height_shift_range=0.06,
                             zoom_range=0.06)
    train_gen = gen.flow(X_train, y_train, batch_size=batch_size, seed=0)

    # Create a multilayer model.
    sess = tf.InteractiveSession()

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.int64, [None, 10], name='y-input')

    def weight_variable(shape):
        """Create a weight variable with appropriate initialization."""
        initial = tf.truncated_normal(shape, stddev=0.1)
        return tf.Variable(initial)

    def bias_variable(shape):
        """Create a bias variable with appropriate initialization."""
        initial = tf.constant(0.1, shape=shape)
        return tf.Variable(initial)

    def conv2d(x, W):
        """Stride-1, SAME-padded 2-D convolution."""
        return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')

    x_image = tf.reshape(x, [-1, 28, 28, 1])

    # conv1
    W_conv1 = weight_variable([3, 3, 1, 32])
    b_conv1 = bias_variable([32])
    conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
    print("conv1" + str(conv1.get_shape()))

    # conv2
    W_conv2 = weight_variable([3, 3, 32, 64])
    b_conv2 = bias_variable([64])
    conv2 = tf.nn.relu(conv2d(conv1, W_conv2) + b_conv2)
    print("conv2" + str(conv2.get_shape()))

    # pool1
    pool1 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME')
    print("pool1" + str(pool1.get_shape()))

    # conv3
    W_conv3 = weight_variable([3, 3, 64, 64])
    b_conv3 = bias_variable([64])
    conv3 = tf.nn.relu(conv2d(pool1, W_conv3) + b_conv3)
    print("conv3" + str(conv3.get_shape()))

    # conv4
    W_conv4 = weight_variable([3, 3, 64, 64])
    b_conv4 = bias_variable([64])
    conv4 = tf.nn.relu(conv2d(conv3, W_conv4) + b_conv4)
    print("conv4" + str(conv4.get_shape()))

    # pool2
    pool2 = tf.nn.max_pool(conv4, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1],
                           padding='SAME')
    print("pool2" + str(pool2.get_shape()))

    # dense1 with flatten
    # NOTE(review): the dense layer is fed from conv3, so conv4 and pool2 do
    # not contribute to the prediction. Kept as-is to preserve the existing
    # architecture; confirm whether pool2 was the intended input.
    W_fc1 = weight_variable([28 * 28 * 16, 512])
    b_fc1 = bias_variable([512])
    flat = tf.reshape(conv3, [-1, 28 * 28 * 16])
    fc1 = tf.nn.relu(tf.matmul(flat, W_fc1) + b_fc1)
    print("fc1" + str(fc1.get_shape()))

    keep_prob = tf.placeholder(tf.float32)
    fc1_drop = tf.nn.dropout(fc1, keep_prob)
    print("fc1_drop" + str(fc1_drop.get_shape()))

    W_fc2 = weight_variable([512, 10])
    b_fc2 = bias_variable([10])
    logits = tf.matmul(fc1_drop, W_fc2) + b_fc2
    y = tf.nn.softmax(logits)
    print("y" + str(y.get_shape()))

    with tf.name_scope('cross_entropy'):
        with tf.name_scope('total'):
            # Compute the loss from the raw logits: taking tf.log of an
            # explicit softmax yields log(0) = -inf (and NaN gradients) once
            # a class probability underflows.
            cross_entropy = tf.reduce_mean(
                tf.nn.softmax_cross_entropy_with_logits(
                    labels=tf.cast(y_, tf.float32), logits=logits))
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        train_step = tf.train.AdamOptimizer(FLAGS.learning_rate).minimize(
            cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge all the summaries and write them out
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph,
                                         flush_secs=10)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test', flush_secs=10)
    tf.global_variables_initializer().run()

    def feed_dict(train):
        """Return the feed dict for one training batch or the full test set."""
        if train:
            xs, ys = next(train_gen)
            # -1 instead of batch_size so a short batch still reshapes.
            xs = xs.reshape(-1, 28 * 28)
            k = FLAGS.dropout
        else:
            xs, ys = mnist.test.images, mnist.test.labels
            k = 1.0
        return {x: xs, y_: ys, keep_prob: k}

    for i in range(FLAGS.max_steps + 1):
        if i % 100 == 0:  # Record summaries and test-set accuracy
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=feed_dict(False))
            test_writer.add_summary(summary, i)
            print('%s: Accuracy at step %s: %s' % (datetime.now(), i, acc))
        else:  # Record train set summaries, and train
            if i % 100 == 99:  # Record execution stats
                run_options = tf.RunOptions()
                run_metadata = tf.RunMetadata()
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True),
                                      options=run_options,
                                      run_metadata=run_metadata)
                train_writer.add_run_metadata(run_metadata, 'step%03d' % i)
                train_writer.add_summary(summary, i)
            else:  # Record a summary
                summary, _ = sess.run([merged, train_step],
                                      feed_dict=feed_dict(True))
                if i % 10 == 0:
                    train_writer.add_summary(summary, i)
    train_writer.close()
    test_writer.close()
def main(argv, neptune_logger=None):
    """Train a classification network, optionally with a metric-learning loss.

    Parses the configuration from ``argv``, builds the training/validation
    datasets and the network, adds the auxiliary loss selected by
    ``cfg.train_mode`` (semi-hard triplet, batch-hard triplet, the "fossils"
    variant, or center loss), then runs the training loop with periodic
    validation, classification reports and checkpointing.

    Args:
        argv: command-line arguments forwarded to ``BaseConfig().parse``.
        neptune_logger: optional experiment logger; when truthy, the
            experiment is created on it and metrics are mirrored to it.

    Raises:
        NotImplementedError: if ``cfg.train_mode`` has no dataset loader.
    """
    cfg = BaseConfig().parse(argv)
    os.environ["CUDA_VISIBLE_DEVICES"] = cfg.gpu
    save_model_dir = cfg.checkpoint_dir
    if neptune_logger:
        neptune_logger.create_experiment(name=save_model_dir.split('/')[-1],
                                         params=vars(cfg))
    print(save_model_dir)
    model_basename = os.path.basename(save_model_dir)
    touch_dir(save_model_dir)

    args_file = os.path.join(cfg.checkpoint_dir, 'args.json')
    with open(args_file, 'w') as f:
        json.dump(vars(cfg), f, ensure_ascii=False, indent=2, sort_keys=True)

    log_file = os.path.join(cfg.checkpoint_dir, cfg.log_filename + '.txt')
    os_utils.touch_dir(cfg.checkpoint_dir)
    logger = log_utils.create_logger(log_file)

    img_generator_class = locate(cfg.db_tuple_loader)
    args = dict()
    args['db_path'] = cfg.db_path
    args['tuple_loader_queue_size'] = cfg.tuple_loader_queue_size
    args['preprocess_func'] = cfg.preprocess_func
    args['batch_size'] = cfg.batch_size
    args['shuffle'] = False
    args['csv_file'] = cfg.train_csv_file
    args['img_size'] = const.max_frame_size
    args['gen_hot_vector'] = True
    train_iter = img_generator_class(args)
    args['batch_size'] = cfg.batch_size
    args['csv_file'] = cfg.test_csv_file
    val_iter = img_generator_class(args)

    trn_images, trn_lbls = train_iter.imgs_and_lbls()
    val_imgs, val_lbls = val_iter.imgs_and_lbls()
    test_imgs, test_lbls = trn_images[:50], trn_lbls[:50]

    with tf.Graph().as_default():
        # NOTE(review): 'hard_fossils' has a loss branch below but no loader
        # here, so that mode currently ends in NotImplementedError.
        if cfg.train_mode in ('semi_hard', 'hard', 'cntr'):
            train_dataset = TripletTupleLoader(trn_images, trn_lbls,
                                               cfg).dataset
        elif cfg.train_mode in ('semi_hard_anchor', 'hard_anchor',
                                'cntr_anchor'):
            train_dataset = TripletTupleLoaderAnchor(trn_images, trn_lbls,
                                                     cfg).dataset
        elif cfg.train_mode == 'hard_anchor_fossils':
            train_dataset = TripletTupleLoaderAnchor(trn_images, trn_lbls,
                                                     cfg).dataset
        elif cfg.train_mode == 'vanilla':
            train_dataset = QuickTupleLoader(trn_images, trn_lbls, cfg,
                                             is_training=True, shuffle=True,
                                             repeat=True).dataset
        else:
            raise NotImplementedError('{} is not a valid train mode'.format(
                cfg.train_mode))

        val_dataset = QuickTupleLoader(val_imgs, val_lbls, cfg,
                                       is_training=False,
                                       repeat=False).dataset

        # A string-handle iterator lets one graph read from either dataset.
        handle = tf.placeholder(tf.string, shape=[])
        iterator = tf.data.Iterator.from_string_handle(
            handle, train_dataset.output_types, train_dataset.output_shapes)
        images_ph, lbls_ph = iterator.get_next()

        network_class = locate(cfg.network_name)
        model = network_class(cfg, images_ph=images_ph, lbls_ph=lbls_ph)

        def embedding_and_labels():
            """Build the L2-normalised embedding head and integer labels."""
            pre_logits = model.train_pre_logits
            _, w, h, channels = pre_logits.shape
            embedding_net = ConvEmbed(emb_dim=cfg.emb_dim, n_input=channels,
                                      n_h=h, n_w=w)
            embedding = embedding_net.forward(pre_logits)
            embedding = tf.nn.l2_normalize(embedding, dim=-1, epsilon=1e-10)
            return embedding, tf.argmax(model.gt_lbls, 1)

        # Which loss fn to impose. For example, softmax only is applied in
        # vanilla mode, while softmax + semi-hard triplet is applied in
        # semi_hard mode.
        if cfg.train_mode in ('semi_hard', 'semi_hard_anchor'):
            embedding, gt_lbls = embedding_and_labels()
            margin = cfg.margin
            metric_loss = triplet_semi.triplet_semihard_loss(
                gt_lbls, embedding, margin)
            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            total_loss = (model.train_loss
                          + cfg.triplet_loss_lambda
                          * tf.reduce_mean(metric_loss))
        elif cfg.train_mode in ('hard', 'hard_anchor'):
            embedding, gt_lbls = embedding_and_labels()
            margin = cfg.margin
            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            metric_loss = triplet_hard.batch_hard(gt_lbls, embedding, margin)
            total_loss = (model.train_loss
                          + cfg.triplet_loss_lambda
                          * tf.reduce_mean(metric_loss))
        elif cfg.train_mode in ('hard_fossils', 'hard_anchor_fossils'):
            embedding, gt_lbls = embedding_and_labels()
            margin = cfg.margin
            logger.info('Triplet loss lambda {}, with margin {}'.format(
                cfg.triplet_loss_lambda, margin))
            metric_loss_far = triplet_hard.batch_hard_fossils(
                gt_lbls, embedding, margin)
            metric_loss = triplet_hard.batch_hard(gt_lbls, embedding, margin)
            total_loss = (
                model.train_loss
                + 0.8 * cfg.triplet_loss_lambda * tf.reduce_mean(metric_loss)
                + 0.2 * cfg.triplet_loss_lambda
                * tf.reduce_mean(metric_loss_far))
        elif cfg.train_mode in ('cntr', 'cntr_anchor'):
            embedding, gt_lbls = embedding_and_labels()
            CENTER_LOSS_LAMBDA = 0.003
            CENTER_LOSS_ALPHA = 0.5
            num_fg_classes = cfg.num_classes
            (center_loss_order, centroids, centers_update_op, appear_times,
             diff) = center_loss.get_center_loss(
                 embedding, gt_lbls, CENTER_LOSS_ALPHA, num_fg_classes)
            logger.info('Center loss lambda {}'.format(CENTER_LOSS_LAMBDA))
            total_loss = (model.train_loss
                          + CENTER_LOSS_LAMBDA
                          * tf.reduce_mean(center_loss_order))
        elif cfg.train_mode == 'vanilla':
            total_loss = model.train_loss

        logger.info('Train Mode {}'.format(cfg.train_mode))

        trainable_vars = tf.trainable_variables()
        if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
            ## One zero-initialised, non-trainable accumulator per trainable
            ## variable, plus the ops that reset them after each update.
            accum_vars = [
                tf.Variable(tf.zeros_like(tv.initialized_value()),
                            trainable=False) for tv in trainable_vars
            ]
            zero_ops = [tv.assign(tf.zeros_like(tv)) for tv in accum_vars]

        update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
        # NOTE(review): only const.Train_Mode.CNTR is checked here although
        # 'cntr_anchor' builds centers_update_op as well -- confirm whether
        # the anchor variant should also run the centre update.
        if cfg.train_mode == const.Train_Mode.CNTR:
            update_ops.append(centers_update_op)

        with tf.control_dependencies(update_ops):
            global_step = tf.Variable(0, name='global_step', trainable=False)
            learning_rate = tf_utils.poly_lr(global_step, cfg)
            optimizer = tf.train.MomentumOptimizer(learning_rate,
                                                   momentum=0.9)

            if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
                grads = optimizer.compute_gradients(total_loss,
                                                    trainable_vars)
                # accum_vars and grads are in the same order, so they can be
                # paired by position.
                accum_ops = [
                    accum_vars[i].assign_add(gv[0])
                    for i, gv in enumerate(grads)
                ]
                iter_size = cfg.caffe_iter_size
                # NOTE(review): train_op applies accum_vars / iter_size but
                # does not itself run accum_ops, while the loop below runs
                # accum_ops only iter_size - 1 times; it looks like the last
                # mini-batch's gradient is not accumulated -- confirm.
                train_op = optimizer.apply_gradients(
                    [(accum_vars[i] / iter_size, gv[1])
                     for i, gv in enumerate(grads)],
                    global_step=global_step)
            else:
                grads = optimizer.compute_gradients(total_loss)
                train_op = optimizer.apply_gradients(grads,
                                                     global_step=global_step)

        sess = tf.InteractiveSession()
        training_iterator = train_dataset.make_one_shot_iterator()
        validation_iterator = val_dataset.make_initializable_iterator()
        training_handle = sess.run(training_iterator.string_handle())
        validation_handle = sess.run(validation_iterator.string_handle())

        # Build the image-batch summary once. Calling get_next() and
        # tf.summary.image() inside the training loop adds new nodes to the
        # graph on every call, so the graph keeps growing as training runs.
        image_batch, _ = training_iterator.get_next()
        image_summary_op = tf.summary.image('image-batch', image_batch,
                                            max_outputs=10)

        tb_path = save_model_dir
        logger.info(tb_path)
        start_iter = tb_utils.get_latest_iteration(tb_path)

        train_writer = tf.summary.FileWriter(tb_path, sess.graph)
        tf.global_variables_initializer().run()
        saver = tf.train.Saver()  # saves variables learned during training

        ckpt_file = tf.train.latest_checkpoint(save_model_dir)
        logger.info('Model Path {}'.format(ckpt_file))
        load_model_msg = model.load_model(save_model_dir, ckpt_file, sess,
                                          saver, load_logits=False)
        logger.info(load_model_msg)

        ckpt_file = os.path.join(save_model_dir, cfg.checkpoint_filename)

        train_loss = tf.summary.scalar('Train_loss', model.train_loss)
        train_accuracy = tf.summary.scalar('Train_Acc', model.train_accuracy)
        val_loss = tf.summary.scalar('Val_Loss', model.val_loss)
        val_acc_op = tf.summary.scalar('Batch_Val_Acc', model.val_accuracy)
        model_acc_op = tf.summary.scalar('Split_Val_Accuracy',
                                         model.val_accumulated_accuracy)

        best_model_step = 0
        best_acc = 0
        logger.info('Start Training from {}, till {}'.format(
            start_iter, cfg.train_iters))

        # Start Training
        for step in range(start_iter + 1, cfg.train_iters + 1):
            start_time_train = time.time()

            # Update network weights while supporting caffe_iter_size
            for mini_batch in range(cfg.caffe_iter_size - 1):
                feed_dict = {handle: training_handle}
                model_loss_value, accuracy_value, _ = sess.run(
                    [model.train_loss, model.train_accuracy, accum_ops],
                    feed_dict)

            feed_dict = {handle: training_handle}
            model_loss_value, accuracy_value, _ = sess.run(
                [model.train_loss, model.train_accuracy, train_op], feed_dict)
            if cfg.caffe_iter_size > 1:  ## Accumulated Gradient
                sess.run(zero_ops)
            train_time = time.time() - start_time_train

            # training loss
            loss_summary = tf.Summary(value=[
                tf.Summary.Value(tag="Train_loss",
                                 simple_value=model_loss_value)
            ])
            acc_summary = tf.Summary(value=[
                tf.Summary.Value(tag="Train_Acc", simple_value=accuracy_value)
            ])
            train_writer.add_summary(loss_summary, step)
            train_writer.add_summary(acc_summary, step)
            if neptune_logger:
                neptune_logger.log_metric('Train_loss', model_loss_value)

            if cfg.training_mode_debug:
                logger.info(
                    'Training mode debug is ON, will save images every iteration.'
                )
                summary = sess.run(image_summary_op)
                train_writer.add_summary(summary, step)

            if (step == 1 or step % cfg.logging_threshold == 0):
                logger.info(
                    'i {0:04d} loss {1:4f} Acc {2:2f} Batch Time {3:3f}'.
                    format(step, model_loss_value, accuracy_value,
                           train_time))

            if (step % cfg.test_interval == 0):
                run_metadata = tf.RunMetadata()
                tf.local_variables_initializer().run()
                sess.run(validation_iterator.initializer)

                _val_acc_op = 0
                gts = []
                preds = []
                pred_3 = []
                pred_5 = []
                while True:
                    try:
                        # Eval network on validation/testing split
                        feed_dict = {handle: validation_handle}
                        (gt, preds_raw, predictions, val_loss_op,
                         batch_accuracy, accuracy_op, _val_acc_op, _val_acc,
                         c_cnf_mat, macro_acc) = sess.run([
                             model.val_gt, model.val_preds,
                             model.val_class_prediction, val_loss,
                             model.val_accuracy, model_acc_op, val_acc_op,
                             model.val_accumulated_accuracy,
                             model.val_confusion_mat,
                             model.val_per_class_acc_acc
                         ], feed_dict)
                        gts += list(gt)
                        preds += list(predictions)
                        # Top-k "prediction": the true label when it is among
                        # the k highest scores, otherwise the top-1 class.
                        for g, p in zip(gt, preds_raw):
                            preds_sort_3 = np.argsort(p)[-3:]
                            preds_sort_5 = np.argsort(p)[-5:]
                            if g in preds_sort_3:
                                pred_3.append(g)
                            else:
                                pred_3.append(preds_sort_3[-1])
                            if g in preds_sort_5:
                                pred_5.append(g)
                            else:
                                pred_5.append(preds_sort_5[-1])
                    except tf.errors.OutOfRangeError:
                        # Validation split exhausted: report and leave.
                        logger.info('Val Acc {0}, Macro Acc: {1}'.format(
                            _val_acc, macro_acc))
                        if neptune_logger:
                            neptune_logger.log_metric(
                                'Validation Accuracy Macro', macro_acc)

                        for top_k, top_k_preds in ((1, preds), (3, pred_3),
                                                   (5, pred_5)):
                            logger.info(
                                '____ Clasification Report Top {} ____'.format(
                                    top_k))
                            report = classification_report(gts, top_k_preds,
                                                           output_dict=True)
                            if neptune_logger:
                                neptune_logger.log_metric(
                                    'Top {} f-1'.format(top_k),
                                    report['weighted avg']['f1-score'])
                                neptune_logger.log_metric(
                                    'Top {} precision'.format(top_k),
                                    report['weighted avg']['precision'])
                                neptune_logger.log_metric(
                                    'Top {} recall'.format(top_k),
                                    report['weighted avg']['recall'])
                            csv_pd = classification_report_csv(report)
                            csv_pd.to_csv(
                                os.path.join(
                                    save_model_dir,
                                    'Classification_Report_top{}{:04d}.csv'.
                                    format(top_k, step)))
                            logger.info(report)
                        break

                summary = sess.run(image_summary_op)
                train_writer.add_summary(summary, step)
                train_writer.add_run_metadata(run_metadata,
                                              'step%03d' % step)
                train_writer.add_summary(val_loss_op, step)
                train_writer.add_summary(_val_acc_op, step)
                train_writer.add_summary(accuracy_op, step)
                train_writer.flush()

                if (step % 100 == 0):
                    saver.save(sess, ckpt_file)
                    if best_acc < _val_acc:
                        saver.save(sess, ckpt_file + 'best')
                        best_acc = _val_acc
                        best_model_step = step

                logger.info('Best Acc {0} at {1} == {2}'.format(
                    best_acc, best_model_step, model_basename))

        logger.info('Triplet loss lambda {}'.format(cfg.triplet_loss_lambda))
        logger.info('Mode {}'.format(cfg.train_mode))
        logger.info('Loop complete')
        sess.close()
def train():
    """Train a two-layer MNIST classifier and write TensorBoard summaries.

    The graph is 784 -> 500 (ReLU) -> dropout -> 10 logits, trained with Adam
    on softmax cross-entropy. Every 10th step evaluates on the test set and
    writes a test summary; all other steps run one training update and write
    a training summary, with a full execution trace on steps where
    ``i % 100 == 99``.
    """
    # Import data
    mnist = input_data.read_data_sets(FLAGS.data_dir,
                                      one_hot=True,
                                      fake_data=FLAGS.fake_data)

    sess = tf.InteractiveSession()

    # Input placeholders
    with tf.name_scope('input'):
        x = tf.placeholder(tf.float32, [None, 784], name='x-input')
        y_ = tf.placeholder(tf.float32, [None, 10], name='y-input')

    with tf.name_scope('input_reshape'):
        image_shaped_input = tf.reshape(x, [-1, 28, 28, 1])
        tf.summary.image('input', image_shaped_input, 10)

    def summarize(tensor):
        """Attach mean/stddev/max/min scalars and a histogram to a tensor."""
        with tf.name_scope('summaries'):
            average = tf.reduce_mean(tensor)
            tf.summary.scalar('mean', average)
            with tf.name_scope('stddev'):
                spread = tf.sqrt(tf.reduce_mean(tf.square(tensor - average)))
            tf.summary.scalar('stddev', spread)
            tf.summary.scalar('max', tf.reduce_max(tensor))
            tf.summary.scalar('min', tf.reduce_min(tensor))
            tf.summary.histogram('histogram', tensor)

    def dense(inputs, n_in, n_out, scope, act=tf.nn.relu):
        """Build one fully connected layer with summaries under ``scope``.

        Computes ``act(inputs @ W + b)``. Weights are drawn from a truncated
        normal and biases start at 0.1 rather than 0, since the network gets
        stuck when these variables are initialised to 0.
        """
        with tf.name_scope(scope):
            with tf.name_scope('weights'):
                w_init = tf.truncated_normal([n_in, n_out], stddev=0.1)
                w = tf.Variable(w_init)
                summarize(w)
            with tf.name_scope('biases'):
                b_init = tf.constant(0.1, shape=[n_out])
                b = tf.Variable(b_init)
                summarize(b)
            with tf.name_scope('Wx_plus_b'):
                linear = tf.matmul(inputs, w) + b
                tf.summary.histogram('pre_activations', linear)
            out = act(linear, name='activation')
            tf.summary.histogram('activations', out)
            return out

    hidden1 = dense(x, 784, 500, 'layer1')

    with tf.name_scope('dropout'):
        keep_prob = tf.placeholder(tf.float32)
        tf.summary.scalar('dropout_keep_probability', keep_prob)
        dropped = tf.nn.dropout(hidden1, keep_prob)

    # The second layer emits raw logits; softmax is folded into the loss.
    y = dense(dropped, 500, 10, 'layer2', act=tf.identity)

    with tf.name_scope('cross_entropy'):
        # Taking the log of an explicit softmax can be numerically unstable,
        # so the fused op is applied to the logits and averaged over the
        # batch.
        per_example = tf.nn.softmax_cross_entropy_with_logits(labels=y_,
                                                              logits=y)
        with tf.name_scope('total'):
            cross_entropy = tf.reduce_mean(per_example)
    tf.summary.scalar('cross_entropy', cross_entropy)

    with tf.name_scope('train'):
        optimizer = tf.train.AdamOptimizer(FLAGS.learning_rate)
        train_step = optimizer.minimize(cross_entropy)

    with tf.name_scope('accuracy'):
        with tf.name_scope('correct_prediction'):
            correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        with tf.name_scope('accuracy'):
            accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar('accuracy', accuracy)

    # Merge every summary op and open the train/test event writers under
    # FLAGS.log_dir.
    merged = tf.summary.merge_all()
    train_writer = tf.summary.FileWriter(FLAGS.log_dir + '/train', sess.graph)
    test_writer = tf.summary.FileWriter(FLAGS.log_dir + '/test')
    tf.global_variables_initializer().run()

    def make_feed(training):
        """Map a training batch or the full test set onto the placeholders."""
        if training or FLAGS.fake_data:
            batch_x, batch_y = mnist.train.next_batch(
                100, fake_data=FLAGS.fake_data)
            keep = FLAGS.dropout
        else:
            batch_x, batch_y = mnist.test.images, mnist.test.labels
            keep = 1.0
        return {x: batch_x, y_: batch_y, keep_prob: keep}

    for step in range(FLAGS.max_steps):
        # Every 10th step: evaluate on the test set, no weight update.
        if step % 10 == 0:
            summary, acc = sess.run([merged, accuracy],
                                    feed_dict=make_feed(False))
            test_writer.add_summary(summary, step)
            print('Accuracy at step %s: %s' % (step, acc))
            continue

        # Ordinary training step: update weights and record a summary.
        if step % 100 != 99:
            summary, _ = sess.run([merged, train_step],
                                  feed_dict=make_feed(True))
            train_writer.add_summary(summary, step)
            continue

        # Training step that also records execution stats.
        trace_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
        trace_metadata = tf.RunMetadata()
        summary, _ = sess.run([merged, train_step],
                              feed_dict=make_feed(True),
                              options=trace_options,
                              run_metadata=trace_metadata)
        train_writer.add_run_metadata(trace_metadata, 'step%03d' % step)
        train_writer.add_summary(summary, step)
        print('Adding run metadata for', step)

    train_writer.close()
    test_writer.close()
def _threshold_accuracies(results, pixel_errors):
    """Map each dataset name to its accuracy at every pixel-error threshold.

    :param results: dict from dataset name to a sequence of per-sample pixel distances
    :param pixel_errors: iterable of distance thresholds
    :return: dict from dataset name to a list holding, for each threshold, the
        fraction of samples whose distance is strictly below that threshold
    """
    accuracies = {}
    for key, val in results.items():
        # Convert once per dataset instead of once per threshold.
        distances = np.asarray(val, dtype=np.float32)
        # The mean of a boolean mask is already the hit rate; the former cast
        # through ``np.int`` fails on NumPy >= 1.24 where that alias is gone.
        accuracies[key] = [np.mean(distances < e) for e in pixel_errors]
    return accuracies


def main(m_type, m_name, logger, save_videos=False):
    """
    run an evaluation on the Test datasets: ExCuSe, ElSe, PupilNet, Swirski, LPW

    Loads the model, prints the profiler's float-operation and parameter
    totals, then evaluates every ``data/emma_data/*.txt`` dataset and prints
    the accuracy at each pixel-error threshold.

    :param m_type: need model type: inception, yolo, gap,...
    :param m_name: name of the model ( model folder name: 3A4Bh-Ref25)
    :param logger: need logger to log the events
    :param save_videos: when true, pass the collected images and predicted
        labels of each dataset to ``video_creator``
    :return: show the results in terminal
    """
    run_meta = tf.RunMetadata()
    with tf.Session() as sess:
        # load best model
        model = load_model(sess, m_type, m_name, logger)

        # calculate the FLOPS
        opts_f = tf.profiler.ProfileOptionBuilder.float_operation()
        flops = tf.profiler.profile(run_meta=run_meta, cmd='op', options=opts_f)

        opts_p = tf.profiler.ProfileOptionBuilder.trainable_variables_parameter()
        params = tf.profiler.profile(sess.graph, run_meta=run_meta, cmd='op',
                                     options=opts_p)

        if flops is not None:
            print('TF stats gives', flops.total_float_ops)

        if params is not None:
            print('TF stats gives', params.total_parameters)

        # print the result for different pixel error
        pixel_errors = [1, 2, 3, 4, 5, 7, 10, 15, 20]

        # get the csv files
        datasets = glob.glob('data/emma_data/*.txt')
        datasets = sorted(datasets)

        # we save the results of all dataset in to this list
        dataset_results = {}
        for d in datasets:
            # get the name of dataset from the path
            dataset_name = d.split("/")[2].split(".")[0]

            # save the result (differences) in the list
            dataset_results[dataset_name] = []
            dataset_len = get_len(d)
            batch_size = 2 * config["batch_size"]
            batch = read_batch(d, batch_size, dataset_name)

            # use tqdm progress bar
            tqdm_len = np.ceil(dataset_len / batch_size)
            with tqdm(total=tqdm_len, unit='batch') as t:
                # set the name of dataset as the title of progress bar
                t.set_description_str(dataset_name)

                test_images = []
                pred_labels = []
                # loop over batch of images
                for images, truths, shapes, pngs in batch:
                    predictions = model.predict(sess, images)
                    upscale_preds_x, upscale_preds_y, w = upscale_preds(
                        predictions, shapes)

                    # calculate the difference
                    a = upscale_preds_x - truths[:, 0]
                    b = upscale_preds_y - truths[:, 1]
                    diff = np.sqrt((a * a + b * b))
                    dataset_results[dataset_name].extend(diff)
                    t.update()

                    # add images and predicted labels to test_images and
                    # pred_labels to creating the video
                    len_data = len(upscale_preds_x)
                    upscale_preds_x = np.reshape(upscale_preds_x,
                                                 newshape=(len_data, 1))
                    upscale_preds_y = np.reshape(upscale_preds_y,
                                                 newshape=(len_data, 1))
                    w = np.reshape(w, newshape=(len_data, 1))
                    upscale_center = np.concatenate(
                        (upscale_preds_x, upscale_preds_y, w), axis=1)

                    test_images.extend(pngs)
                    pred_labels.extend(upscale_center)

                # create the predicted labels on test sets
                if save_videos:
                    video_creator(dataset_name, test_images, pred_labels)

        # save the results in a dic
        dataset_errors = _threshold_accuracies(dataset_results, pixel_errors)

        # NOTE(review): ``dataset_names`` is not defined inside this function;
        # it must exist at module level for this call to work -- confirm.
        print_resutls(dataset_errors, pixel_errors, dataset_names)

        # NOTE(review): this early return leaves the LPW and Swirski
        # evaluations below unreachable. They are kept as written in case the
        # return is a temporary switch -- confirm before deleting either.
        return

        print("####### LPW #######")
        # run model on LPW dataset
        lpw_results = {}
        lpw_r = lpw_reader(batch_size=2 * config["batch_size"],
                           normalize_image=True)
        for imgs, truths, d_name, shapes in lpw_r:
            # add dataset name to results dict
            if d_name not in lpw_results:
                lpw_results[d_name] = []

            predictions = model.predict(sess, imgs)
            upscale_preds_x, upscale_preds_y, w = upscale_preds(
                predictions, shapes)

            # calculate the difference
            a = upscale_preds_x - truths[:, 0]
            b = upscale_preds_y - truths[:, 1]
            diff = np.sqrt((a * a + b * b))
            lpw_results[d_name].extend(diff)

        lpw_errors = _threshold_accuracies(lpw_results, pixel_errors)
        print_resutls(lpw_errors, pixel_errors)

        print("####### SWIRSKI #######")
        # run model on Swirski dataset
        swk_results = {}
        swk_r = swirski_reader(batch_size=2 * config["batch_size"])
        for imgs, truths, d_name, shapes in swk_r:
            # add dataset name to results dict
            if d_name not in swk_results:
                swk_results[d_name] = []

            predictions = model.predict(sess, imgs)
            upscale_preds_x, upscale_preds_y, w = upscale_preds(
                predictions, shapes)

            # calculate the difference
            a = upscale_preds_x - truths[:, 0]
            b = upscale_preds_y - truths[:, 1]
            diff = np.sqrt((a * a + b * b))
            swk_results[d_name].extend(diff)

        swk_errors = _threshold_accuracies(swk_results, pixel_errors)
        print_resutls(swk_errors, pixel_errors)
def minimize(scope, log_norm, energy, steps, check_obs=None,
             check_every=100, save_every=1000, lr=0.001,
             restore_path=None, output_path=None, profiling=False):
    """Trains the wavefunction to minimize the energy.

    Arguments:
        scope (string): the variable scope where all variables to be trained reside
        log_norm (tensor): a batch of log wavefunction norm of samples, shape (batch_shape,)
        energy (tensor): energies of the same batch of samples, shape (batch_shape,)
        steps (int): the total number of training steps
        check_obs (dict from strings to tensors, or None): names and values of
            the tensors to log. None (the default) logs only the energy.
        check_every (int): the number of steps between logging
        save_every (int): the number of steps between saving the model
        lr (float): learning rate for the optimizer
        restore_path (string or None): the directory (ending with "/") to restore the model
            If None, start from scratch by default.
        output_path (string or None): the directory to save the trained model
            If None, save to "results/" + scope + "/" by default.
        profiling (bool): whether to profile the code

    Returns:
        obs (dict from strings to scalars or lists of scalars): statistics of the tensors in check_obs
            For each name in check_obs, obs contains an entry of the same name for the average value of the observable,
            and an entry of name_std for its standard deviation, with an entry of name_raw for its history of values
            during the training.
    """
    # A mutable ``{}`` default would be shared across calls; use None instead.
    if check_obs is None:
        check_obs = {}

    # set up training ops
    mean_energy = tf.reduce_mean(energy)
    loss = energy + 2 * log_norm * tf.stop_gradient(energy - mean_energy)
    loss = tf.reduce_mean(loss)
    variables = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES, scope=scope)

    # Count parameters with one short-lived session that is closed afterwards,
    # instead of opening (and leaking) a new session per variable.
    with tf.Session() as count_sess:
        n_params = sum(count_sess.run(tf.size(v)) for v in variables)
    print("Total number of parameters:", n_params)

    optimizer = tf.train.AdamOptimizer(learning_rate=lr)
    grads_and_vars = optimizer.compute_gradients(loss, variables)
    # Clip each gradient by norm, pass it through zero_nan, and skip variables
    # that received no gradient.
    grads_and_vars = [(zero_nan(tf.clip_by_norm(grad, 1.0)), var)
                      for grad, var in grads_and_vars if grad is not None]
    train_op = optimizer.apply_gradients(grads_and_vars)

    # set up loggings
    output_path = output_path or "results/" + scope + "/"
    model_path = output_path + "model.ckpt"
    saver = tf.train.Saver(variables)
    file_writer = tf.summary.FileWriter(output_path)
    check_obs = {key: tf.reduce_mean(value) for key, value in check_obs.items()}
    check_obs["energy"] = mean_energy
    for key, value in check_obs.items():
        tf.summary.scalar(key, value)
    summary = tf.summary.merge_all()
    res = {key: [] for key in check_obs}

    # initialize variables
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = False
    sess = tf.Session(config=config)
    try:
        sess.run(tf.variables_initializer(optimizer.variables()))
        if restore_path is None or not restore(
                sess, saver, restore_path + "model.ckpt", True):
            sess.run(tf.variables_initializer(variables))
            print("Starting from scratch ...")

        progbar = tf.keras.utils.Progbar(
            steps, stateful_metrics=list(check_obs.keys()))
        if profiling:
            options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
            run_metadata = tf.RunMetadata()

        # training
        for step in range(1, steps + 1):
            # Only the steps that also save the model are traced.
            if profiling and step % save_every == 0:
                e, _ = sess.run([mean_energy, train_op],
                                options=options, run_metadata=run_metadata)
            else:
                e, _ = sess.run([mean_energy, train_op])

            # log
            if step % check_every == 0:
                obs = [(key, sess.run(value))
                       for key, value in check_obs.items()]
                for key, value in obs:
                    res[key].append(value)
                progbar.update(step, obs)
                file_writer.add_summary(sess.run(summary), step)
            else:
                progbar.update(step, [("energy", e)])  # always update energy

            # save the model
            if step % save_every == 0:
                saver.save(sess, model_path)
                print(" Model saved to", model_path)
                if profiling:
                    fetched_timeline = timeline.Timeline(run_metadata.step_stats)
                    chrome_trace = fetched_timeline.generate_chrome_trace_format()
                    trace_file = output_path + 'timeline_%d.json' % (step // save_every)
                    with open(trace_file, 'w') as f:
                        f.write(chrome_trace)
    finally:
        # Release the event file and the session even if training fails.
        file_writer.close()
        sess.close()

    # return the observables
    raw = {key + "_raw": res[key] for key in res}
    mean = {key: np.mean(truncate(res[key])) for key in res}
    std = {key + "_std": np.std(truncate(res[key])) for key in res}
    return {**raw, **mean, **std}
def timeGraph(gdef, batch_size=128, image_folder='images', nvidiasmi='output.out',
              latencyF='latency.txt', StopTime=100):
    """Time inference of an imported graph over the JPEG images in a folder.

    Builds a repeating, batched dataset from ``image_folder/*.JPEG``, imports
    ``gdef`` with its "input" mapped to the dataset iterator, and runs the
    "final_layer/predictions" output once per batch. While the loop runs, a
    background ``nvidia-smi`` process logs GPU power draw to ``nvidiasmi``.

    Arguments:
        gdef: GraphDef to import and time.
        batch_size (int): number of images per ``sess.run`` call.
        image_folder (str): folder searched for ``*.JPEG`` files.
        nvidiasmi (str): file that nvidia-smi writes power readings to.
        latencyF (str): file that one latency (seconds) per batch is appended to.
        StopTime (number): stop iterating once this many seconds have elapsed
            since the loop started.

    Returns:
        tuple ``(timings, True, last_output, None)`` where ``timings`` is a
        one-element list with the total loop time in seconds and
        ``last_output`` is the first fetched value of the last executed batch
        (None when no batch was executed).
    """
    tf.logging.info("Starting execution")
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.95)
    tf.reset_default_graph()
    g = tf.Graph()
    outlist = []
    with g.as_default():
        image_paths = sorted(glob.glob(image_folder + '/*.JPEG'))
        imageCounter = len(image_paths)
        imagenstack = tf.stack(image_paths)

        dataset = tf.data.Dataset.from_tensor_slices(imagenstack)
        dataset = dataset.map(_parse_function)
        dataset = dataset.batch(batch_size)
        dataset = dataset.repeat()
        iterator = dataset.make_one_shot_iterator()
        next_element = iterator.get_next()

        out = tf.import_graph_def(graph_def=gdef,
                                  input_map={"input": next_element},
                                  return_elements=["final_layer/predictions"])
        out = out[0].outputs[0]
        print("\n\n image out", out, "\n\n")
        outlist.append(out)
        print("\n\n image out", outlist[-1], "\n\n")

    timings = []
    # Stays None when the loop body never runs, so the return below cannot
    # hit an unbound local.
    val = None
    with tf.Session(graph=g,
                    config=tf.ConfigProto(gpu_options=gpu_options)) as sess:
        num_iters = int(math.ceil(imageCounter / batch_size))
        print("\n\n\nNumber of Iterations = ", num_iters)

        nvidiasmiCommand = "nohup nvidia-smi --query-gpu=power.draw --format=csv,noheader,nounits -l 1 -f " + nvidiasmi + " &"
        os.system(nvidiasmiCommand)
        try:
            tstart = time.time()
            # Mode 'a' creates the file when it does not exist yet, so no
            # separate existence check is needed.
            with open(latencyF, 'a') as runtimeResults:
                start_process = time.time()
                for _ in range(num_iters):
                    tic = time.time()
                    val = sess.run(outlist)
                    tac = time.time()
                    runtimeResults.write(str(tac - tic))
                    runtimeResults.write("\n")
                    if (tac - start_process) > StopTime:
                        break
                timings.append(time.time() - tstart)
        finally:
            # Stop the background power logger even if a run failed.
            os.system("pkill nvidia-smi")

    tf.logging.info("Timing loop done!")
    return timings, True, (None if val is None else val[0]), None
def CIFAR10_train():
    """Build the CIFAR-10 training graph and run the training loop.

    Every 1000th step is run with full tracing: its run metadata and merged
    summaries are written to '../log_train', the accuracy on the current
    training batch and on one fixed test batch is printed, and a checkpoint is
    saved. All other steps only execute the training op.
    """
    # Keep all input-related ops under the 'input' name scope.
    with tf.name_scope('input'):
        # Input pipelines for training and evaluation data.
        train_images, train_labels = CIFAR10_input.distorted_inputs(
            data_dir=data_dir, batch_size=FLAGS.BATCH_SIZE)
        test_images, test_labels = CIFAR10_input.inputs(
            eval_data=True, data_dir=data_dir, batch_size=FLAGS.BATCH_SIZE)

        # Placeholders for the images and their one-hot labels.
        image_shape = [
            None, CIFAR10_inference.IMAGE_SIZE, CIFAR10_inference.IMAGE_SIZE,
            CIFAR10_inference.NUM_CHANNELS
        ]
        x = tf.placeholder(tf.float32, image_shape, name='x-input')
        y_ = tf.placeholder(tf.float32,
                            [None, CIFAR10_inference.OUTPUT_NODE],
                            name='y-input')

    # Forward pass defined by the inference module.
    y = CIFAR10_inference.inference(x, True, 'L2')
    global_step = tf.Variable(0, trainable=False)

    # Moving-average ops live in their own name scope.
    with tf.name_scope('moving_average'):
        ema = tf.train.ExponentialMovingAverage(FLAGS.MOVING_AVERAGE_DECAY,
                                                global_step)
        ema_op = ema.apply(tf.trainable_variables())

    # Loss computation lives in its own name scope.
    with tf.name_scope('loss_function'):
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=y, labels=tf.argmax(y_, 1))
        mean_loss = tf.reduce_mean(per_example_loss)
        tf.add_to_collection('losses', mean_loss)
        loss = tf.add_n(tf.get_collection('losses'))
        tf.summary.scalar('loss_function', loss)

    # Learning rate, optimizer and the per-step training op share one scope.
    with tf.name_scope('train_step'):
        learning_rate = tf.train.exponential_decay(
            FLAGS.LEARNING_RATE_BASE,
            global_step,
            50000 / FLAGS.BATCH_SIZE,
            FLAGS.LEARNING_RATE_DECAY,
            staircase=True)
        tf.summary.scalar('learning_rate', learning_rate)
        sgd_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(
            loss, global_step)
        # Group the gradient step and the moving-average update into one op.
        with tf.control_dependencies([sgd_step, ema_op]):
            train_op = tf.no_op(name='train')

    with tf.name_scope('accuracy'):
        is_correct = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
        accuracy_train = tf.reduce_mean(tf.cast(is_correct, 'float'))
        tf.summary.scalar('accuracy_train', accuracy_train)
        accuracy_test = tf.reduce_mean(tf.cast(is_correct, 'float'))

    # Saver used to persist the model.
    saver = tf.train.Saver()
    with tf.Session() as sess:
        tf.global_variables_initializer().run()
        coord = tf.train.Coordinator()
        threads = tf.train.start_queue_runners(sess=sess, coord=coord)

        # Merge the summaries and open the log writer.
        merged = tf.summary.merge_all()
        writer = tf.summary.FileWriter('../log_train', tf.get_default_graph())

        # Row k of this identity matrix is the one-hot encoding of label k.
        one_hot = np.eye(10, dtype=float)

        # One fixed test batch is reused for every accuracy report.
        test_batch_images, test_batch_labels = sess.run(
            [test_images, test_labels])
        test_feed = {x: test_batch_images, y_: one_hot[test_batch_labels]}

        for i in range(FLAGS.TRAINING_STEPS):
            batch_images, batch_labels = sess.run(
                [train_images, train_labels])
            train_feed = {x: batch_images, y_: one_hot[batch_labels]}

            if i % 1000 != 0:
                # Ordinary step: train only.
                sess.run([train_op, loss, global_step], feed_dict=train_feed)
                continue

            # Every 1000th step: trace the run so per-node time and memory
            # information is recorded in the run metadata.
            trace_options = tf.RunOptions(
                trace_level=tf.RunOptions.FULL_TRACE)
            trace_metadata = tf.RunMetadata()
            _, loss_value, _, summary_value = sess.run(
                [train_op, loss, global_step, merged],
                feed_dict=train_feed,
                options=trace_options,
                run_metadata=trace_metadata)
            # Write the trace and the summaries to the log.
            writer.add_run_metadata(trace_metadata, 'step%03d' % i)
            writer.add_summary(summary_value, i)

            # Report the loss on the current batch together with the
            # accuracy on the current batch and on the fixed test batch.
            train_accuracy = accuracy_train.eval(feed_dict=train_feed)
            test_accuracy = accuracy_test.eval(feed_dict=test_feed)
            print(
                '%s:After %d training steps, loss = %g, accuracy = %g, validation accuracy=%g'
                % (datetime.now(), i, loss_value, train_accuracy,
                   test_accuracy))

            # Save a checkpoint; passing global_step appends the step count
            # to the file name, e.g. 'model.ckpt-1000'.
            saver.save(sess,
                       os.path.join(MODEL_SAVE_PATH, MODEL_NAME),
                       global_step=global_step)

        coord.request_stop()
        coord.join(threads)
        writer.close()
def test_embedding_lookup_sparse():
    """Run the sparse embedding transform 1000 times under full tracing and print profiles.

    Inputs come from ``gen_sparse_inputs`` / ``gen_sparse_indices`` for a
    batch of 256 rows with 100 non-zero entries each and a weight shape of
    [1000000, 30]. After the timed loop, the run metadata is profiled per
    code line, per op type, and through the tfprof model analyzer; all
    reports are printed to stdout.
    """
    batch = 256  # FLAGS.batch
    nzdim = 100  # FLAGS.nonzero_dim
    layers = "1000000,30"
    weight_shape = [int(d) for d in layers.split(",")]
    inputs = gen_sparse_inputs(batch, weight_shape[0], nzdim)
    batch_ids = gen_sparse_indices(batch, nzdim)
    embedding_op = sparse_transform(
        tf.SparseTensor(indices=batch_ids,
                        values=inputs["ids"],
                        dense_shape=[batch, nzdim]),
        tf.SparseTensor(indices=batch_ids,
                        values=inputs["values"],
                        dense_shape=[batch, nzdim]),
        weight_shape)

    init_op = tf.global_variables_initializer()
    run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
    run_metadata = tf.RunMetadata()
    graph_options = tf.GraphOptions(enable_bfloat16_sendrecv=False)
    sess_config = tf.ConfigProto(allow_soft_placement=True,
                                 log_device_placement=True,
                                 graph_options=graph_options)

    max_steps = 1000  # FLAGS.max_steps
    # The context manager closes the session once the timed loop is done.
    with tf.Session(config=sess_config) as sess:
        sess.run(init_op)
        for _ in range(max_steps):
            # The same run_metadata object is passed to every step.
            sess.run([embedding_op],
                     options=run_options,
                     run_metadata=run_metadata)

    ProfileOptionBuilder = tf.profiler.ProfileOptionBuilder
    opts = ProfileOptionBuilder(
        ProfileOptionBuilder.time_and_memory()).with_node_names(
            show_name_regexes=['.*train.py.*']).build()

    tf.profiler.profile(tf.get_default_graph(),
                        run_meta=run_metadata,
                        cmd='code',
                        options=opts)

    # Print to stdout an analysis of the memory usage and the timing information
    # broken down by operation types.
    tf.profiler.profile(
        tf.get_default_graph(),
        run_meta=run_metadata,
        cmd='op',
        options=tf.profiler.ProfileOptionBuilder.time_and_memory())

    tf.contrib.tfprof.model_analyzer.print_model_analysis(
        tf.get_default_graph(),
        run_meta=run_metadata,
        tfprof_options=tf.contrib.tfprof.model_analyzer.PRINT_ALL_TIMING_MEMORY)
def learn(self, total_timesteps, callback=None, seed=None, log_interval=100, tb_log_name="TRPO",
          reset_num_timesteps=True):
    """Run the TRPO training loop (policy, value function and, with GAIL, discriminator).

    :param total_timesteps: stop once this many timesteps have been collected;
        a falsy value disables this stopping condition
    :param callback: optional callable invoked as ``callback(locals(), globals())``
        at the start of every iteration; training stops only when it returns
        exactly ``False``
    :param seed: passed to ``self._setup_learn``
    :param log_interval: accepted for interface compatibility; not read here
    :param tb_log_name: run name handed to ``TensorboardWriter``
    :param reset_num_timesteps: passed to ``self._init_num_timesteps``
    :return: ``self``
    """
    new_tb_log = self._init_num_timesteps(reset_num_timesteps)

    with SetVerbosity(self.verbose), TensorboardWriter(self.graph, self.tensorboard_log, tb_log_name, new_tb_log) \
            as writer:
        self._setup_learn(seed)

        with self.sess.as_default():
            seg_gen = traj_segment_generator(self.policy_pi, self.env, self.timesteps_per_batch,
                                             reward_giver=self.reward_giver, gail=self.using_gail)

            episodes_so_far = 0
            timesteps_so_far = 0
            iters_so_far = 0
            t_start = time.time()
            len_buffer = deque(maxlen=40)  # rolling buffer for episode lengths
            reward_buffer = deque(maxlen=40)  # rolling buffer for episode rewards
            self.episode_reward = np.zeros((self.n_envs, ))

            true_reward_buffer = None
            if self.using_gail:
                true_reward_buffer = deque(maxlen=40)

                # Initialize dataloader
                batchsize = self.timesteps_per_batch // self.d_step
                self.expert_dataset.init_dataloader(batchsize)

                # Stats not used for now
                # TODO: replace with normal tb logging

            while True:
                if callback is not None:
                    # Only stop training if return value is False, not when it is None. This is for backwards
                    # compatibility with callbacks that have no return statement.
                    if callback(locals(), globals()) is False:
                        break
                if total_timesteps and timesteps_so_far >= total_timesteps:
                    break

                logger.log("********** Iteration %i ************" % iters_so_far)

                def fisher_vector_product(vec):
                    # ``fvpargs`` is assigned inside the g-step loop below,
                    # before this closure is first called.
                    return self.allmean(self.compute_fvp(vec, *fvpargs, sess=self.sess)) + self.cg_damping * vec

                # ------------------ Update G ------------------
                logger.log("Optimizing Policy...")
                # g_step = 1 when not using GAIL
                mean_losses = None
                vpredbefore = None
                tdlamret = None
                observation = None
                action = None
                seg = None
                for k in range(self.g_step):
                    with self.timed("sampling"):
                        seg = seg_gen.__next__()
                    add_vtarg_and_adv(seg, self.gamma, self.lam)
                    observation, action, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg["tdlamret"]
                    vpredbefore = seg["vpred"]  # predicted value function before update
                    atarg = (atarg - atarg.mean()) / atarg.std()  # standardized advantage function estimate

                    # true_rew is the reward without discount
                    if writer is not None:
                        self.episode_reward = total_episode_reward_logger(
                            self.episode_reward,
                            seg["true_rew"].reshape((self.n_envs, -1)),
                            seg["dones"].reshape((self.n_envs, -1)),
                            writer, self.num_timesteps)

                    args = seg["ob"], seg["ob"], seg["ac"], atarg
                    # The Fisher-vector product uses every 5th sample only.
                    fvpargs = [arr[::5] for arr in args]

                    self.assign_old_eq_new(sess=self.sess)

                    with self.timed("computegrad"):
                        steps = self.num_timesteps + (k + 1) * (seg["total_timestep"] / self.g_step)
                        # Request a full trace only when its metadata is
                        # actually recorded; tracing every step otherwise
                        # costs time for nothing.
                        # NOTE(review): assumes compute_lossandgrad forwards
                        # ``options`` to ``Session.run`` (where None is the
                        # default) -- confirm.
                        run_options = None
                        run_metadata = None
                        if self.full_tensorboard_log:
                            run_options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                            run_metadata = tf.RunMetadata()
                        # run loss backprop with summary, and save the metadata (memory, compute time, ...)
                        if writer is not None:
                            summary, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess,
                                                                                  options=run_options,
                                                                                  run_metadata=run_metadata)
                            if self.full_tensorboard_log:
                                writer.add_run_metadata(run_metadata, 'step%d' % steps)
                            writer.add_summary(summary, steps)
                        else:
                            _, grad, *lossbefore = self.compute_lossandgrad(*args, tdlamret, sess=self.sess,
                                                                            options=run_options,
                                                                            run_metadata=run_metadata)

                    lossbefore = self.allmean(np.array(lossbefore))
                    grad = self.allmean(grad)
                    if np.allclose(grad, 0):
                        logger.log("Got zero gradient. not updating")
                    else:
                        with self.timed("conjugate_gradient"):
                            stepdir = conjugate_gradient(fisher_vector_product, grad, cg_iters=self.cg_iters,
                                                         verbose=self.rank == 0 and self.verbose >= 1)
                        assert np.isfinite(stepdir).all()
                        shs = .5 * stepdir.dot(fisher_vector_product(stepdir))
                        # abs(shs) to avoid taking square root of negative values
                        lagrange_multiplier = np.sqrt(abs(shs) / self.max_kl)
                        fullstep = stepdir / lagrange_multiplier
                        expectedimprove = grad.dot(fullstep)
                        surrbefore = lossbefore[0]
                        stepsize = 1.0
                        thbefore = self.get_flat()
                        thnew = None
                        # Backtracking line search: halve the step up to 10 times.
                        for _ in range(10):
                            thnew = thbefore + fullstep * stepsize
                            self.set_from_flat(thnew)
                            mean_losses = surr, kl_loss, *_ = self.allmean(
                                np.array(self.compute_losses(*args, sess=self.sess)))
                            improve = surr - surrbefore
                            logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve))
                            if not np.isfinite(mean_losses).all():
                                logger.log("Got non-finite value of losses -- bad!")
                            elif kl_loss > self.max_kl * 1.5:
                                logger.log("violated KL constraint. shrinking step.")
                            elif improve < 0:
                                logger.log("surrogate didn't improve. shrinking step.")
                            else:
                                logger.log("Stepsize OK!")
                                break
                            stepsize *= .5
                        else:
                            # No acceptable step found: restore the old parameters.
                            logger.log("couldn't compute a good step")
                            self.set_from_flat(thbefore)
                        if self.nworkers > 1 and iters_so_far % 20 == 0:
                            # list of tuples
                            paramsums = MPI.COMM_WORLD.allgather((thnew.sum(), self.vfadam.getflat().sum()))
                            assert all(np.allclose(ps, paramsums[0]) for ps in paramsums[1:])

                    with self.timed("vf"):
                        for _ in range(self.vf_iters):
                            # NOTE: for recurrent policies, use shuffle=False?
                            for (mbob, mbret) in dataset.iterbatches((seg["ob"], seg["tdlamret"]),
                                                                     include_final_partial_batch=False,
                                                                     batch_size=128,
                                                                     shuffle=True):
                                grad = self.allmean(self.compute_vflossandgrad(mbob, mbob, mbret, sess=self.sess))
                                self.vfadam.update(grad, self.vf_stepsize)

                # mean_losses is still None when every g-step of this
                # iteration had a zero gradient; zip(...) on None would raise.
                if mean_losses is not None:
                    for (loss_name, loss_val) in zip(self.loss_names, mean_losses):
                        logger.record_tabular(loss_name, loss_val)

                logger.record_tabular("explained_variance_tdlam_before", explained_variance(vpredbefore, tdlamret))

                if self.using_gail:
                    # ------------------ Update D ------------------
                    logger.log("Optimizing Discriminator...")
                    logger.log(fmt_row(13, self.reward_giver.loss_name))
                    assert len(observation) == self.timesteps_per_batch
                    batch_size = self.timesteps_per_batch // self.d_step

                    # NOTE: uses only the last g step for observation
                    d_losses = []  # list of tuples, each of which gives the loss for a minibatch
                    # NOTE: for recurrent policies, use shuffle=False?
                    for ob_batch, ac_batch in dataset.iterbatches((observation, action),
                                                                  include_final_partial_batch=False,
                                                                  batch_size=batch_size,
                                                                  shuffle=True):
                        ob_expert, ac_expert = self.expert_dataset.get_next_batch()
                        # update running mean/std for reward_giver
                        if self.reward_giver.normalize:
                            self.reward_giver.obs_rms.update(np.concatenate((ob_batch, ob_expert), 0))

                        # Reshape actions if needed when using discrete actions
                        if isinstance(self.action_space, gym.spaces.Discrete):
                            if len(ac_batch.shape) == 2:
                                ac_batch = ac_batch[:, 0]
                            if len(ac_expert.shape) == 2:
                                ac_expert = ac_expert[:, 0]
                        *newlosses, grad = self.reward_giver.lossandgrad(ob_batch, ac_batch, ob_expert, ac_expert)
                        self.d_adam.update(self.allmean(grad), self.d_stepsize)
                        d_losses.append(newlosses)
                    logger.log(fmt_row(13, np.mean(d_losses, axis=0)))

                    # lr: lengths and rewards
                    lr_local = (seg["ep_lens"], seg["ep_rets"], seg["ep_true_rets"])  # local values
                    list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                    lens, rews, true_rets = map(flatten_lists, zip(*list_lr_pairs))
                    true_reward_buffer.extend(true_rets)
                else:
                    # lr: lengths and rewards
                    lr_local = (seg["ep_lens"], seg["ep_rets"])  # local values
                    list_lr_pairs = MPI.COMM_WORLD.allgather(lr_local)  # list of tuples
                    lens, rews = map(flatten_lists, zip(*list_lr_pairs))
                len_buffer.extend(lens)
                reward_buffer.extend(rews)

                if len(len_buffer) > 0:
                    logger.record_tabular("EpLenMean", np.mean(len_buffer))
                    logger.record_tabular("EpRewMean", np.mean(reward_buffer))
                if self.using_gail:
                    logger.record_tabular("EpTrueRewMean", np.mean(true_reward_buffer))
                logger.record_tabular("EpThisIter", len(lens))
                episodes_so_far += len(lens)
                current_it_timesteps = MPI.COMM_WORLD.allreduce(seg["total_timestep"])
                timesteps_so_far += current_it_timesteps
                self.num_timesteps += current_it_timesteps
                iters_so_far += 1

                logger.record_tabular("EpisodesSoFar", episodes_so_far)
                logger.record_tabular("TimestepsSoFar", self.num_timesteps)
                logger.record_tabular("TimeElapsed", time.time() - t_start)

                if self.verbose >= 1 and self.rank == 0:
                    logger.dump_tabular()

    return self
def main():
    """Run pix2pix in export, test or training mode, as selected by ``a.mode``.

    * ``"export"``: builds a standalone generator graph that maps a
      base64-encoded PNG to a base64-encoded output image, restores the
      checkpoint, writes ``export.meta`` / ``export`` into ``a.output_dir``
      and returns.
    * ``"test"``: runs the model over the examples at most once, saves the
      images, merges them via ``mergeAllImages``, copies the merged image to
      ``a.output_file`` and prints timing information.
    * anything else: trains for ``max_steps`` steps, recording progress,
      summaries, traces, display images and checkpoints at the frequencies
      configured on ``a``.

    In the test and training modes every file under ``facades/val`` and
    ``facades_test`` is deleted before the examples are loaded.
    """
    # Overall timer start. ``time.clock()`` was removed in Python 3.8;
    # ``time.perf_counter()`` is its documented replacement for elapsed time.
    mainStart = time.perf_counter()

    if a.seed is None:
        a.seed = random.randint(0, 2**31 - 1)

    tf.set_random_seed(a.seed)
    np.random.seed(a.seed)
    random.seed(a.seed)

    if not os.path.exists(a.output_dir):
        os.makedirs(a.output_dir)

    if a.mode == "test" or a.mode == "export":
        if a.checkpoint is None:
            raise Exception("checkpoint required for test mode")

        # load some options from the checkpoint
        options = {"which_direction", "ngf", "ndf", "lab_colorization"}
        with open(os.path.join(a.checkpoint, "options.json")) as f:
            for key, val in json.loads(f.read()).items():
                if key in options:
                    print("loaded", key, "=", val)
                    setattr(a, key, val)
        # disable these features in test mode
        a.scale_size = CROP_SIZE
        a.flip = False

    for k, v in a._get_kwargs():
        print(k, "=", v)

    with open(os.path.join(a.output_dir, "options.json"), "w") as f:
        f.write(json.dumps(vars(a), sort_keys=True, indent=4))

    if a.mode == "export":
        # export the generator to a meta graph that can be imported later for standalone generation
        if a.lab_colorization:
            raise Exception("export not supported for lab_colorization")

        input_placeholder = tf.placeholder(tf.string, shape=[1])
        input_data = tf.decode_base64(input_placeholder[0])
        input_image = tf.image.decode_png(input_data)

        # remove alpha channel if present
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 4),
                              lambda: input_image[:, :, :3],
                              lambda: input_image)
        # convert grayscale to RGB
        input_image = tf.cond(tf.equal(tf.shape(input_image)[2], 1),
                              lambda: tf.image.grayscale_to_rgb(input_image),
                              lambda: input_image)

        input_image = tf.image.convert_image_dtype(input_image, dtype=tf.float32)
        input_image.set_shape([CROP_SIZE, CROP_SIZE, 3])
        batch_input = tf.expand_dims(input_image, axis=0)

        with tf.variable_scope("generator"):
            batch_output = deprocess(create_generator(preprocess(batch_input), 3))

        output_image = tf.image.convert_image_dtype(batch_output, dtype=tf.uint8)[0]
        if a.output_filetype == "png":
            output_data = tf.image.encode_png(output_image)
        elif a.output_filetype == "jpeg":
            output_data = tf.image.encode_jpeg(output_image, quality=80)
        else:
            raise Exception("invalid filetype")
        output = tf.convert_to_tensor([tf.encode_base64(output_data)])

        key = tf.placeholder(tf.string, shape=[1])
        inputs = {
            "key": key.name,
            "input": input_placeholder.name
        }
        tf.add_to_collection("inputs", json.dumps(inputs))
        outputs = {
            "key": tf.identity(key).name,
            "output": output.name,
        }
        tf.add_to_collection("outputs", json.dumps(outputs))

        init_op = tf.global_variables_initializer()
        # one Saver to restore the checkpoint, one to write the export
        restore_saver = tf.train.Saver()
        export_saver = tf.train.Saver()

        with tf.Session() as sess:
            sess.run(init_op)
            print("loading model from checkpoint")
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            restore_saver.restore(sess, checkpoint)
            print("exporting model")
            export_saver.export_meta_graph(filename=os.path.join(a.output_dir, "export.meta"))
            export_saver.save(sess, os.path.join(a.output_dir, "export"), write_meta_graph=False)

        return

    loadExamplesCreateModelStart = time.time()

    def del_file(path):
        """Recursively delete every file below ``path`` (directories are kept)."""
        for entry in os.listdir(path):
            c_path = os.path.join(path, entry)
            if os.path.isdir(c_path):
                del_file(c_path)
            else:
                os.remove(c_path)

    # Clear out all files under facades/val and facades_test first.
    # NOTE(review): the original comment says "before test", but this point is
    # reached in training mode as well -- confirm that is intended.
    del_file("facades/val")
    del_file("facades_test")

    examples = load_examples()
    print("examples count = %d" % examples.count)

    # inputs and targets are [batch_size, height, width, channels]
    model = create_model(examples.inputs, examples.targets)

    # undo colorization splitting on images that we use for display/output
    if a.lab_colorization:
        if a.which_direction == "AtoB":
            # inputs is brightness, this will be handled fine as a grayscale image
            # need to augment targets and outputs with brightness
            targets = augment(examples.targets, examples.inputs)
            outputs = augment(model.outputs, examples.inputs)
            # inputs can be deprocessed normally and handled as if they are single channel
            # grayscale images
            inputs = deprocess(examples.inputs)
        elif a.which_direction == "BtoA":
            # inputs will be color channels only, get brightness from targets
            inputs = augment(examples.inputs, examples.targets)
            targets = deprocess(examples.targets)
            outputs = deprocess(model.outputs)
        else:
            raise Exception("invalid direction")
    else:
        inputs = deprocess(examples.inputs)
        targets = deprocess(examples.targets)
        outputs = deprocess(model.outputs)

    def convert(image):
        if a.aspect_ratio != 1.0:
            # upscale to correct aspect ratio
            size = [CROP_SIZE, int(round(CROP_SIZE * a.aspect_ratio))]
            image = tf.image.resize_images(image, size=size, method=tf.image.ResizeMethod.BICUBIC)

        # change the image data type
        return tf.image.convert_image_dtype(image, dtype=tf.uint8, saturate=True)

    # reverse any processing on images so they can be written to disk or displayed to user
    with tf.name_scope("convert_inputs"):
        converted_inputs = convert(inputs)

    with tf.name_scope("convert_targets"):
        converted_targets = convert(targets)

    with tf.name_scope("convert_outputs"):
        converted_outputs = convert(outputs)

    with tf.name_scope("encode_images"):
        display_fetches = {
            "paths": examples.paths,
            "inputs": tf.map_fn(tf.image.encode_png, converted_inputs, dtype=tf.string, name="input_pngs"),
            "targets": tf.map_fn(tf.image.encode_png, converted_targets, dtype=tf.string, name="target_pngs"),
            "outputs": tf.map_fn(tf.image.encode_png, converted_outputs, dtype=tf.string, name="output_pngs"),
        }

    # summaries
    with tf.name_scope("inputs_summary"):
        tf.summary.image("inputs", converted_inputs)

    with tf.name_scope("targets_summary"):
        tf.summary.image("targets", converted_targets)

    with tf.name_scope("outputs_summary"):
        tf.summary.image("outputs", converted_outputs)

    with tf.name_scope("predict_real_summary"):
        tf.summary.image("predict_real", tf.image.convert_image_dtype(model.predict_real, dtype=tf.uint8))

    with tf.name_scope("predict_fake_summary"):
        tf.summary.image("predict_fake", tf.image.convert_image_dtype(model.predict_fake, dtype=tf.uint8))

    tf.summary.scalar("discriminator_loss", model.discrim_loss)
    tf.summary.scalar("generator_loss_GAN", model.gen_loss_GAN)
    tf.summary.scalar("generator_loss_L1", model.gen_loss_L1)

    for var in tf.trainable_variables():
        tf.summary.histogram(var.op.name + "/values", var)

    for grad, var in model.discrim_grads_and_vars + model.gen_grads_and_vars:
        tf.summary.histogram(var.op.name + "/gradients", grad)

    with tf.name_scope("parameter_count"):
        parameter_count = tf.reduce_sum([tf.reduce_prod(tf.shape(v)) for v in tf.trainable_variables()])

    # keep only the most recent checkpoint
    saver = tf.train.Saver(max_to_keep=1)

    logdir = a.output_dir if (a.trace_freq > 0 or a.summary_freq > 0) else None
    sv = tf.train.Supervisor(logdir=logdir, save_summaries_secs=0, saver=None)

    loadExamplesCreateModelStop = time.time()

    loadingModelStart = time.time()
    with sv.managed_session() as sess:
        print("parameter_count =", sess.run(parameter_count))

        if a.checkpoint is not None:
            print("loading model from checkpoint")
            # tf.train.latest_checkpoint() finds the most recently saved model
            checkpoint = tf.train.latest_checkpoint(a.checkpoint)
            # restore(sess, save_path) loads the model stored at save_path
            saver.restore(sess, checkpoint)

        max_steps = 2**32
        # a.max_epochs = number of training epochs
        if a.max_epochs is not None:
            max_steps = examples.steps_per_epoch * a.max_epochs
        # a.max_steps = number of training steps
        if a.max_steps is not None:
            max_steps = a.max_steps

        loadingModelStop = time.time()

        if a.mode == "test":
            # testing
            # at most, process the test data once
            testStart = time.time()
            start = time.time()
            max_steps = min(examples.steps_per_epoch, max_steps)
            for step in range(max_steps):
                results = sess.run(display_fetches)
                # write the generated images to their directory
                filesets = save_images(results)
                for fileset in filesets:
                    print("evaluated image", fileset["name"])
                index_path = append_index(filesets)

            print("wrote index at", index_path)
            print("rate", (time.time() - start) / max_steps)
            testStop = time.time()

            # merge the small images produced by the test into one large image
            mergeStart = time.time()
            import mergeAllImages
            if __name__ == "__main__":
                mergeAllImages.main()
            # copy the merged image to the requested output_file
            shutil.copy("facades_test/merged3.png", a.output_file)
            mergeStop = time.time()

            # loadExamplesCreateModel time
            print("★★★loadExamplesCreateModel Time used :",str(loadExamplesCreateModelStop-loadExamplesCreateModelStart) + "秒")
            # split time
            # NOTE(review): splitStart / splitStop are not assigned anywhere in
            # this function; they must exist at module level or this line
            # raises NameError -- confirm.
            print("(Split Time used :",str(splitStop-splitStart) + "秒")
            # loading Model time
            print("★★★loading Model time used :",str(loadingModelStop-loadingModelStart) + "秒")
            # test time
            print("★★★test Time used :",str(testStop-testStart) + "秒")
            # merge time
            print("★★★Merge Time used :",str(mergeStop-mergeStart) + "秒")

            # all time
            elapsed = (time.perf_counter() - mainStart)
            print("★★★Time used(Total) : ",str(elapsed) + "秒")
        else:
            # training
            start = time.time()

            for step in range(max_steps):
                def should(freq):
                    # True every ``freq`` steps and on the final step.
                    return freq > 0 and ((step + 1) % freq == 0 or step == max_steps - 1)

                options = None
                run_metadata = None
                if should(a.trace_freq):
                    options = tf.RunOptions(trace_level=tf.RunOptions.FULL_TRACE)
                    run_metadata = tf.RunMetadata()

                fetches = {
                    "train": model.train,
                    "global_step": sv.global_step,
                }

                # fetch the losses whenever progress is reported
                if should(a.progress_freq):
                    fetches["discrim_loss"] = model.discrim_loss
                    fetches["gen_loss_GAN"] = model.gen_loss_GAN
                    fetches["gen_loss_L1"] = model.gen_loss_L1

                # fetch the summaries whenever they are recorded
                if should(a.summary_freq):
                    fetches["summary"] = sv.summary_op

                # fetch the current training images whenever they are saved
                if should(a.display_freq):
                    fetches["display"] = display_fetches

                results = sess.run(fetches, options=options, run_metadata=run_metadata)

                if should(a.summary_freq):
                    print("recording summary")
                    sv.summary_writer.add_summary(results["summary"], results["global_step"])

                if should(a.display_freq):
                    print("saving display images")
                    filesets = save_images(results["display"], step=results["global_step"])
                    append_index(filesets, step=True)

                if should(a.trace_freq):
                    print("recording trace")
                    sv.summary_writer.add_run_metadata(run_metadata, "step_%d" % results["global_step"])

                if should(a.progress_freq):
                    # global_step will have the correct step count if we resume from a checkpoint
                    train_epoch = math.ceil(results["global_step"] / examples.steps_per_epoch)
                    train_step = (results["global_step"] - 1) % examples.steps_per_epoch + 1
                    rate = (step + 1) * a.batch_size / (time.time() - start)
                    remaining = (max_steps - step) * a.batch_size / rate
                    print("progress  epoch %d  step %d  image/sec %0.1f  remaining %dm" % (train_epoch, train_step, rate, remaining / 60))
                    print("discrim_loss", results["discrim_loss"])
                    print("gen_loss_GAN", results["gen_loss_GAN"])
                    print("gen_loss_L1", results["gen_loss_L1"])

                # save the model every a.save_freq steps
                if should(a.save_freq):
                    print("saving model")
                    # the second argument is the save path/name; global_step is
                    # appended to the file name as a suffix
                    saver.save(sess, os.path.join(a.output_dir, "model"), global_step=sv.global_step)

                if sv.should_stop():
                    break